Diffstat (limited to 'fs/btrfs')
46 files changed, 5421 insertions, 1518 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ccd25ba7a9ac..9a8622a5b867 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -5,6 +5,9 @@ config BTRFS_FS
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select RAID6_PQ
+	select XOR_BLOCKS
+
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee51..3932224f99e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o

 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 04edf69be875..bd605c87adfd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -352,11 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		err = __resolve_indirect_ref(fs_info, search_commit_root,
 					     time_seq, ref, parents,
 					     extent_item_pos);
-		if (err) {
-			if (ret == 0)
-				ret = err;
+		if (err)
 			continue;
-		}

 		/* we put the first parent into the ref at hand */
 		ULIST_ITER_INIT(&uiter);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index d61feca79455..310a7f6d09b1 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -19,7 +19,7 @@
 #ifndef __BTRFS_BACKREF__
 #define __BTRFS_BACKREF__

-#include "ioctl.h"
+#include <linux/btrfs.h>
 #include "ulist.h"
 #include "extent_io.h"

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2a8c242bc4f5..d9b97d4960e6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,6 +40,8 @@
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
 #define BTRFS_INODE_COPY_EVERYTHING		8
+#define BTRFS_INODE_IN_DELALLOC_LIST		9
+#define BTRFS_INODE_READDIO_NEED_LOCK		10

 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	return 0;
 }

+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+	smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+		  &BTRFS_I(inode)->runtime_flags);
+}
+
 #endif
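The smp_mb() in btrfs_inode_block_unlocked_dio() pairs with a barrier on the DIO read side: a reader must observe the bit before deciding it may skip i_mutex. A minimal sketch of that reader check (a hypothetical helper; the real test sits inline in the btrfs DIO read path, not in this hunk):

	/* hypothetical reader side, pairing with btrfs_inode_block_unlocked_dio() */
	static inline int btrfs_dio_read_needs_lock(struct inode *inode)
	{
		smp_mb();	/* pairs with the smp_mb() after set_bit() above */
		return test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
				&BTRFS_I(inode)->runtime_flags);
	}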
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 11d47bfb62b4..18af6f48781a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror(
 			(bh->b_data + (dev_bytenr & 4095));

 	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
-	    strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
-		    sizeof(super_tmp->magic)) ||
+	    super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
 	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
 	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
 	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
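The strncmp() goes away because BTRFS_MAGIC becomes a u64 (see the ctree.h hunk below): the eight ASCII bytes "_BHRfS_M" read little-endian are exactly 0x4D5F53665248425F. A standalone userspace check of that equivalence (illustrative only, not part of the patch):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	int main(void)
	{
		uint64_t magic = 0x4D5F53665248425FULL;	/* BTRFS_MAGIC */
		char buf[9] = { 0 };
		int i;

		/* the superblock stores the magic little-endian: low byte first */
		for (i = 0; i < 8; i++)
			buf[i] = (magic >> (8 * i)) & 0xff;
		printf("%s\n", buf);	/* prints _BHRfS_M */
		return strcmp(buf, "_BHRfS_M") != 0;
	}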
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 94ab2f80e7e3..15b94089abc4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		page = compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_size)
-			ret = io_tree->ops->merge_bio_hook(page, 0,
+			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
 							   PAGE_CACHE_SIZE,
 							   bio, 0);
 		else
@@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		page->index = em_start >> PAGE_CACHE_SHIFT;

 		if (comp_bio->bi_size)
-			ret = tree->ops->merge_bio_hook(page, 0,
+			ret = tree->ops->merge_bio_hook(READ, page, 0,
 							PAGE_CACHE_SIZE,
 							comp_bio, 0);
 		else
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index eea5da7a2b9a..ecd25a1b4e51 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1138,6 +1138,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 		switch (tm->op) {
 		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
 			BUG_ON(tm->slot < n);
+			/* Fallthrough */
 		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
 		case MOD_LOG_KEY_REMOVE:
 			btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1222,7 +1223,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,

 	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
 	WARN_ON(btrfs_header_nritems(eb_rewin) >
-		BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
+		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

 	return eb_rewin;
 }
@@ -1441,7 +1442,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
  */
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress)
 {
 	struct extent_buffer *cur;
@@ -1461,8 +1462,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key disk_key;

 	parent_level = btrfs_header_level(parent);
-	if (cache_only && parent_level != 1)
-		return 0;

 	WARN_ON(trans->transaction != root->fs_info->running_transaction);
 	WARN_ON(trans->transid != root->fs_info->generation);
@@ -1508,10 +1507,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		else
 			uptodate = 0;
 		if (!cur || !uptodate) {
-			if (cache_only) {
-				free_extent_buffer(cur);
-				continue;
-			}
 			if (!cur) {
 				cur = read_tree_block(root, blocknr,
 						      blocksize, gen);
@@ -4825,8 +4820,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)

 /*
  * A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are either in cache or have a minimum
- * transaction id. This is used by the btree defrag code, and tree logging
+ * for nodes or leaves that are have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
  *
  * This does not cow, but it does stuff the starting key it finds back
  * into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -4847,7 +4842,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans)
 {
 	struct extent_buffer *cur;
@@ -4887,15 +4882,12 @@ again:
 		if (sret && slot > 0)
 			slot--;
 		/*
-		 * check this node pointer against the cache_only and
-		 * min_trans parameters. If it isn't in cache or is too
-		 * old, skip to the next one.
+		 * check this node pointer against the min_trans parameters.
+		 * If it is too old, old, skip to the next one.
 		 */
 		while (slot < nritems) {
 			u64 blockptr;
 			u64 gen;
-			struct extent_buffer *tmp;
-			struct btrfs_disk_key disk_key;

 			blockptr = btrfs_node_blockptr(cur, slot);
 			gen = btrfs_node_ptr_generation(cur, slot);
@@ -4903,27 +4895,7 @@ again:
 				slot++;
 				continue;
 			}
-			if (!cache_only)
-				break;
-
-			if (max_key) {
-				btrfs_node_key(cur, &disk_key, slot);
-				if (comp_keys(&disk_key, max_key) >= 0) {
-					ret = 1;
-					goto out;
-				}
-			}
-
-			tmp = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-
-			if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-				free_extent_buffer(tmp);
-				break;
-			}
-			if (tmp)
-				free_extent_buffer(tmp);
-			slot++;
+			break;
 		}
 find_next_key:
 		/*
@@ -4934,7 +4906,7 @@ find_next_key:
 		path->slots[level] = slot;
 		btrfs_set_path_blocking(path);
 		sret = btrfs_find_next_key(root, path, min_key, level,
-					  cache_only, min_trans);
+					  min_trans);
 		if (sret == 0) {
 			btrfs_release_path(path);
 			goto again;
@@ -5399,8 +5371,7 @@ out:
 /*
  * this is similar to btrfs_next_leaf, but does not try to preserve
  * and fixup the path. It looks for and returns the next key in the
- * tree based on the current path and the cache_only and min_trans
- * parameters.
+ * tree based on the current path and the min_trans parameters.
  *
  * 0 is returned if another key is found, < 0 if there are any errors
  * and 1 is returned if there are no higher keys in the tree
@@ -5409,8 +5380,7 @@ out:
  * calling this function.
  */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-			struct btrfs_key *key, int level,
-			int cache_only, u64 min_trans)
+			struct btrfs_key *key, int level, u64 min_trans)
 {
 	int slot;
 	struct extent_buffer *c;
@@ -5461,22 +5431,8 @@ next:
 		if (level == 0)
 			btrfs_item_key_to_cpu(c, key, slot);
 		else {
-			u64 blockptr = btrfs_node_blockptr(c, slot);
 			u64 gen = btrfs_node_ptr_generation(c, slot);

-			if (cache_only) {
-				struct extent_buffer *cur;
-				cur = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-				if (!cur ||
-				    btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
-					slot++;
-					if (cur)
-						free_extent_buffer(cur);
-					goto next;
-				}
-				free_extent_buffer(cur);
-			}
 			if (gen < min_trans) {
 				slot++;
 				goto next;
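With cache_only gone, btrfs_search_forward() walks purely by generation. A hypothetical caller iterating everything newer than min_trans might look like this (simplified key advancement, not a call site from this patch; root and min_trans assumed in scope):

	struct btrfs_key min_key = { .objectid = 0, .type = 0, .offset = 0 };
	struct btrfs_key max_key = { .objectid = (u64)-1, .type = (u8)-1,
				     .offset = (u64)-1 };
	struct btrfs_path *path = btrfs_alloc_path();
	int ret;

	while (1) {
		ret = btrfs_search_forward(root, &min_key, &max_key, path,
					   min_trans);
		if (ret)		/* 1: no more keys, < 0: error */
			break;
		/* process path->nodes[0], slot path->slots[0] here */
		btrfs_release_path(path);
		min_key.offset++;	/* naive advance; real callers step the key properly */
	}
	btrfs_free_path(path);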
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 547b7b05727f..0d82922179db 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -31,10 +31,10 @@
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
 #include <linux/pagemap.h>
+#include <linux/btrfs.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
-#include "ioctl.h"

 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;

-#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */

 #define BTRFS_MAX_MIRRORS 3

@@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* ioprio of readahead is set to idle */
 #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))

+#define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024)
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -336,7 +338,10 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 /*
  * File system states
  */
+#define BTRFS_FS_STATE_ERROR		0
+#define BTRFS_FS_STATE_REMOUNTING	1

+/* Super block flags */
 /* Errors detected */
 #define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)

@@ -502,6 +507,7 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)

 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)

 #define BTRFS_FEATURE_COMPAT_SUPP	0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP	0ULL
@@ -511,6 +517,7 @@ struct btrfs_super_block {
	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)

 /*
@@ -952,8 +959,20 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5		(1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6		(1 << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
-#define BTRFS_NR_RAID_TYPES		5
+
+enum btrfs_raid_types {
+	BTRFS_RAID_RAID10,
+	BTRFS_RAID_RAID1,
+	BTRFS_RAID_DUP,
+	BTRFS_RAID_RAID0,
+	BTRFS_RAID_SINGLE,
+	BTRFS_RAID_RAID5,
+	BTRFS_RAID_RAID6,
+	BTRFS_NR_RAID_TYPES
+};

 #define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
 					 BTRFS_BLOCK_GROUP_SYSTEM |  \
@@ -961,6 +980,8 @@ struct btrfs_dev_replace_item {

 #define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
 					 BTRFS_BLOCK_GROUP_RAID1 |   \
+					 BTRFS_BLOCK_GROUP_RAID5 |   \
+					 BTRFS_BLOCK_GROUP_RAID6 |   \
 					 BTRFS_BLOCK_GROUP_DUP |     \
 					 BTRFS_BLOCK_GROUP_RAID10)
 /*
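The enum gives each profile a dense index (usable as an array slot), while the BTRFS_BLOCK_GROUP_* bits stay the on-disk representation. A flags-to-index helper in the spirit of this enum would look like the following (a sketch; the actual mapping function lives elsewhere in the btrfs sources):

	static int raid_index_from_flags(u64 flags)
	{
		if (flags & BTRFS_BLOCK_GROUP_RAID10)
			return BTRFS_RAID_RAID10;
		if (flags & BTRFS_BLOCK_GROUP_RAID1)
			return BTRFS_RAID_RAID1;
		if (flags & BTRFS_BLOCK_GROUP_DUP)
			return BTRFS_RAID_DUP;
		if (flags & BTRFS_BLOCK_GROUP_RAID0)
			return BTRFS_RAID_RAID0;
		if (flags & BTRFS_BLOCK_GROUP_RAID5)
			return BTRFS_RAID_RAID5;
		if (flags & BTRFS_BLOCK_GROUP_RAID6)
			return BTRFS_RAID_RAID6;
		return BTRFS_RAID_SINGLE;	/* no profile bit set */
	}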
@@ -1185,6 +1206,10 @@ struct btrfs_block_group_cache {
 	u64 flags;
 	u64 sectorsize;
 	u64 cache_generation;
+
+	/* for raid56, this is a full stripe, without parity */
+	unsigned long full_stripe_len;
+
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -1225,6 +1250,28 @@ struct seq_list {
 	u64 seq;
 };

+enum btrfs_orphan_cleanup_state {
+	ORPHAN_CLEANUP_STARTED	= 1,
+	ORPHAN_CLEANUP_DONE	= 2,
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	wait_queue_head_t wait;
+	spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct list_head stripe_cache;
+	spinlock_t cache_lock;
+	int cache_size;
+	struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
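With BTRFS_STRIPE_HASH_TABLE_BITS of 11 the flexible table[] array gets 1 << 11 = 2048 buckets. Sizing the allocation would go roughly like this (a sketch, assuming the allocator lives in the new raid56 code, which also has to initialize each bucket's lock, list and waitqueue):

	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	struct btrfs_stripe_hash_table *table;

	table = kzalloc(sizeof(*table) +
			num_entries * sizeof(struct btrfs_stripe_hash),
			GFP_NOFS);
	if (!table)
		return -ENOMEM;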
@@ -1250,6 +1297,7 @@ struct btrfs_fs_info {

 	/* block group cache stuff */
 	spinlock_t block_group_cache_lock;
+	u64 first_logical_byte;
 	struct rb_root block_group_cache_tree;

 	/* keep track of unallocated space */
@@ -1288,7 +1336,23 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	unsigned long mount_opt;
 	unsigned long compress_type:4;
+	/*
+	 * It is a suggestive number, the read side is safe even it gets a
+	 * wrong number because we will write out the data into a regular
+	 * extent. The write side(mount/remount) is under ->s_umount lock,
+	 * so it is also safe.
+	 */
 	u64 max_inline;
+	/*
+	 * Protected by ->chunk_mutex and sb->s_umount.
+	 *
+	 * The reason that we use two lock to protect it is because only
+	 * remount and mount operations can change it and these two operations
+	 * are under sb->s_umount, but the read side (chunk allocation) can not
+	 * acquire sb->s_umount or the deadlock would happen. So we use two
+	 * locks to protect it. On the write side, we must acquire two locks,
+	 * and on the read side, we just need acquire one of them.
+	 */
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
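The two-lock rule for alloc_start can be made concrete with a pair of hypothetical accessors (illustrative only; the write side is mount/remount, so sb->s_umount is already held there):

	static void set_alloc_start(struct btrfs_fs_info *fs_info, u64 start)
	{
		/* write side: sb->s_umount held by the caller, take the second lock */
		mutex_lock(&fs_info->chunk_mutex);
		fs_info->alloc_start = start;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	static u64 get_alloc_start(struct btrfs_fs_info *fs_info)
	{
		u64 start;

		/* read side: either lock suffices; chunk allocation takes this one */
		mutex_lock(&fs_info->chunk_mutex);
		start = fs_info->alloc_start;
		mutex_unlock(&fs_info->chunk_mutex);
		return start;
	}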
@@ -1307,6 +1371,13 @@ struct btrfs_fs_info {
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
+
+	/* this is used during read/modify/write to make sure
+	 * no two ios are trying to mod the same stripe at the same
+	 * time
+	 */
+	struct btrfs_stripe_hash_table *stripe_hash_table;
+
 	/*
 	 * this protects the ordered operations list only while we are
 	 * processing all of the entries on it. This way we make
@@ -1365,6 +1436,7 @@ struct btrfs_fs_info {
 	 */
 	struct list_head ordered_extents;

+	spinlock_t delalloc_lock;
 	/*
 	 * all of the inodes that have delalloc bytes. It is possible for
 	 * this list to be empty even when there is still dirty data=ordered
@@ -1373,13 +1445,6 @@ struct btrfs_fs_info {
 	struct list_head delalloc_inodes;

 	/*
-	 * special rename and truncate targets that must be on disk before
-	 * we're allowed to commit. This is basically the ext3 style
-	 * data=ordered list.
-	 */
-	struct list_head ordered_operations;
-
-	/*
 	 * there is a pool of worker threads for checksumming during writes
 	 * and a pool for checksumming after reads. This is because readers
 	 * can run with FS locks held, and the writers may be waiting for
@@ -1395,6 +1460,8 @@ struct btrfs_fs_info {
 	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
+	struct btrfs_workers endio_raid56_workers;
+	struct btrfs_workers rmw_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers endio_freespace_worker;
@@ -1423,10 +1490,12 @@ struct btrfs_fs_info {

 	u64 total_pinned;

-	/* protected by the delalloc lock, used to keep from writing
-	 * metadata until there is a nice batch
-	 */
-	u64 dirty_metadata_bytes;
+	/* used to keep from writing metadata until there is a nice batch */
+	struct percpu_counter dirty_metadata_bytes;
+	struct percpu_counter delalloc_bytes;
+	s32 dirty_metadata_batch;
+	s32 delalloc_batch;
+
 	struct list_head dirty_cowonly_roots;

 	struct btrfs_fs_devices *fs_devices;
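The percpu counters trade an exact, lock-protected total for cheap unsynchronized updates, with the s32 batch fields bounding per-CPU drift. Call sites follow this pattern (an illustrative sketch; btrfs_btree_balance_dirty() stands in for whatever writeback kick the caller uses):

	static void account_dirty_metadata(struct btrfs_fs_info *fs_info, s64 len)
	{
		/* no global lock: the delta folds into the per-CPU counter */
		__percpu_counter_add(&fs_info->dirty_metadata_bytes, len,
				     fs_info->dirty_metadata_batch);

		/* the flush decision reads an approximate sum */
		if (percpu_counter_compare(&fs_info->dirty_metadata_bytes,
					   BTRFS_DIRTY_METADATA_THRESH) > 0)
			btrfs_btree_balance_dirty(fs_info->tree_root);
	}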
@@ -1442,9 +1511,6 @@ struct btrfs_fs_info {

 	struct reloc_control *reloc_ctl;

-	spinlock_t delalloc_lock;
-	u64 delalloc_bytes;
-
 	/* data_alloc_cluster is only used in ssd mode */
 	struct btrfs_free_cluster data_alloc_cluster;

@@ -1456,6 +1522,8 @@ struct btrfs_fs_info {
 	struct rb_root defrag_inodes;
 	atomic_t defrag_running;

+	/* Used to protect avail_{data, metadata, system}_alloc_bits */
+	seqlock_t profiles_lock;
 	/*
 	 * these three are in extended format (availability of single
 	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
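Readers of the avail_*_alloc_bits retry under the seqlock instead of blocking the writer; a minimal reader sketch (hypothetical helper):

	static u64 read_avail_data_alloc_bits(struct btrfs_fs_info *fs_info)
	{
		u64 bits;
		unsigned int seq;

		do {
			seq = read_seqbegin(&fs_info->profiles_lock);
			bits = fs_info->avail_data_alloc_bits;
		} while (read_seqretry(&fs_info->profiles_lock, seq));

		return bits;
	}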
@@ -1520,7 +1588,7 @@ struct btrfs_fs_info {
 	u64 qgroup_seq;

 	/* filesystem state */
-	u64 fs_state;
+	unsigned long fs_state;

 	struct btrfs_delayed_root *delayed_root;

@@ -1623,6 +1691,9 @@ struct btrfs_root {

 	struct list_head root_list;

+	spinlock_t log_extents_lock[2];
+	struct list_head logged_list[2];
+
 	spinlock_t orphan_lock;
 	atomic_t orphan_inodes;
 	struct btrfs_block_rsv *orphan_block_rsv;
@@ -1832,6 +1903,7 @@ struct btrfs_ioctl_defrag_range_args {

 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
 					 BTRFS_MOUNT_##opt)
 /*
@@ -2936,8 +3008,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 				    u64 bytenr, u64 num_bytes);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
@@ -3035,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode);
 void btrfs_orphan_release_metadata(struct inode *inode);
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+				     struct btrfs_block_rsv *rsv,
+				     int nitems,
+				     u64 *qgroup_reserved);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+				      struct btrfs_block_rsv *rsv,
+				      u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
@@ -3092,10 +3168,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 			struct btrfs_key *key, int lowest_level,
-			int cache_only, u64 min_trans);
+			u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans);
 enum btrfs_compare_tree_result {
 	BTRFS_COMPARE_TREE_NEW,
@@ -3148,7 +3224,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
 			       int find_higher, int return_any);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress);
 void btrfs_release_path(struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
@@ -3459,9 +3535,9 @@ int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root, u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio, unsigned long bio_flags);
-
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
@@ -3543,7 +3619,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,

 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, int cache_only);
+			struct btrfs_root *root);

 /* sysfs.c */
 int btrfs_init_sysfs(void);
@@ -3620,11 +3696,14 @@ __printf(5, 6)
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int errno, const char *fmt, ...);

+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic(). Otherwise we BUG() here.
+ */
 #define btrfs_panic(fs_info, errno, fmt, args...)			\
 do {									\
-	struct btrfs_fs_info *_i = (fs_info);				\
-	__btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args);	\
-	BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR));	\
+	__btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args);	\
+	BUG();								\
 } while (0)

 /* acl.c */
@@ -3745,4 +3824,11 @@ static inline int is_fstree(u64 rootid)
 		return 1;
 	return 0;
 }
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+	return signal_pending(current);
+}
+
+
 #endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 34836036f01b..0b278b117cbe 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -875,7 +875,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 				     struct btrfs_delayed_item *delayed_item)
 {
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 	char *ptr;
 	int ret;

@@ -886,7 +885,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,

 	leaf = path->nodes[0];

-	item = btrfs_item_nr(leaf, path->slots[0]);
 	ptr = btrfs_item_ptr(leaf, path->slots[0], char);

 	write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
@@ -1065,32 +1063,25 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
 	}
 }

-static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
-				      struct btrfs_path *path,
-				      struct btrfs_delayed_node *node)
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct btrfs_delayed_node *node)
 {
 	struct btrfs_key key;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
 	int ret;

-	mutex_lock(&node->mutex);
-	if (!node->inode_dirty) {
-		mutex_unlock(&node->mutex);
-		return 0;
-	}
-
 	key.objectid = node->inode_id;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
+
 	ret = btrfs_lookup_inode(trans, root, path, &key, 1);
 	if (ret > 0) {
 		btrfs_release_path(path);
-		mutex_unlock(&node->mutex);
 		return -ENOENT;
 	} else if (ret < 0) {
-		mutex_unlock(&node->mutex);
 		return ret;
 	}

@@ -1105,11 +1096,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,

 	btrfs_delayed_inode_release_metadata(root, node);
 	btrfs_release_delayed_inode(node);
-	mutex_unlock(&node->mutex);

 	return 0;
 }

+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     struct btrfs_path *path,
+					     struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	mutex_lock(&node->mutex);
+	if (!node->inode_dirty) {
+		mutex_unlock(&node->mutex);
+		return 0;
+	}
+
+	ret = __btrfs_update_delayed_inode(trans, root, path, node);
+	mutex_unlock(&node->mutex);
+	return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				   struct btrfs_path *path,
+				   struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+	return ret;
+}
+
 /*
  * Called when committing the transaction.
  * Returns 0 on success.
@@ -1119,7 +1146,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root, int nr)
 {
-	struct btrfs_root *curr_root = root;
 	struct btrfs_delayed_root *delayed_root;
 	struct btrfs_delayed_node *curr_node, *prev_node;
 	struct btrfs_path *path;
@@ -1142,15 +1168,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,

 	curr_node = btrfs_first_delayed_node(delayed_root);
 	while (curr_node && (!count || (count && nr--))) {
-		curr_root = curr_node->root;
-		ret = btrfs_insert_delayed_items(trans, path, curr_root,
-						 curr_node);
-		if (!ret)
-			ret = btrfs_delete_delayed_items(trans, path,
-						curr_root, curr_node);
-		if (!ret)
-			ret = btrfs_update_delayed_inode(trans, curr_root,
-						path, curr_node);
+		ret = __btrfs_commit_inode_delayed_items(trans, path,
+							 curr_node);
 		if (ret) {
 			btrfs_release_delayed_node(curr_node);
 			curr_node = NULL;
@@ -1183,51 +1202,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
 	return __btrfs_run_delayed_items(trans, root, nr);
 }

-static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-					      struct btrfs_delayed_node *node)
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				     struct inode *inode)
 {
+	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
 	struct btrfs_path *path;
 	struct btrfs_block_rsv *block_rsv;
 	int ret;

+	if (!delayed_node)
+		return 0;
+
+	mutex_lock(&delayed_node->mutex);
+	if (!delayed_node->count) {
+		mutex_unlock(&delayed_node->mutex);
+		btrfs_release_delayed_node(delayed_node);
+		return 0;
+	}
+	mutex_unlock(&delayed_node->mutex);
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	path->leave_spinning = 1;

 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;

-	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_update_delayed_inode(trans, node->root, path, node);
-	btrfs_free_path(path);
+	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);

+	btrfs_release_delayed_node(delayed_node);
+	btrfs_free_path(path);
 	trans->block_rsv = block_rsv;
+
 	return ret;
 }

-int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-				     struct inode *inode)
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *block_rsv;
 	int ret;

 	if (!delayed_node)
 		return 0;

 	mutex_lock(&delayed_node->mutex);
-	if (!delayed_node->count) {
+	if (!delayed_node->inode_dirty) {
 		mutex_unlock(&delayed_node->mutex);
 		btrfs_release_delayed_node(delayed_node);
 		return 0;
 	}
 	mutex_unlock(&delayed_node->mutex);

-	ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
+	trans = btrfs_join_transaction(delayed_node->root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto trans_out;
+	}
+	path->leave_spinning = 1;
+
+	block_rsv = trans->block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+	mutex_lock(&delayed_node->mutex);
+	if (delayed_node->inode_dirty)
+		ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+						   path, delayed_node);
+	else
+		ret = 0;
+	mutex_unlock(&delayed_node->mutex);
+
+	btrfs_free_path(path);
+	trans->block_rsv = block_rsv;
+trans_out:
+	btrfs_end_transaction(trans, delayed_node->root);
+	btrfs_btree_balance_dirty(delayed_node->root);
+out:
 	btrfs_release_delayed_node(delayed_node);
+
 	return ret;
 }

@@ -1258,7 +1319,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	struct btrfs_root *root;
 	struct btrfs_block_rsv *block_rsv;
 	int need_requeue = 0;
-	int ret;

 	async_node = container_of(work, struct btrfs_async_delayed_node, work);

@@ -1277,14 +1337,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	block_rsv = trans->block_rsv;
 	trans->block_rsv = &root->fs_info->delayed_block_rsv;

-	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, root,
-						 delayed_node);
-
-	if (!ret)
-		btrfs_update_delayed_inode(trans, root, path, delayed_node);
-
+	__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 	/*
 	 * Maybe new delayed items have been inserted, so we need requeue
 	 * the work. Besides that, we must dequeue the empty delayed nodes
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 4f808e1baeed..78b6ad0fc669 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -117,6 +117,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 /* Used for evicting the inode. */
 void btrfs_remove_delayed_node(struct inode *inode);
 void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_commit_inode_delayed_inode(struct inode *inode);


 int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
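The new entry point flushes only the dirty inode item and, as the delayed-inode.c hunk above shows, joins its own transaction internally, so the caller needs no trans handle. A hypothetical caller on the eviction path would use it roughly as:

	/* during inode eviction: push just the delayed inode item to disk */
	static int example_flush_delayed_inode(struct inode *inode)
	{
		return btrfs_commit_inode_delayed_inode(inode);
	}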
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ae9411773397..b7a0641ead77 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -23,6 +23,10 @@
 #include "delayed-ref.h"
 #include "transaction.h"

+struct kmem_cache *btrfs_delayed_ref_head_cachep;
+struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_extent_op_cachep;
 /*
  * delayed back reference update tracking. For subvolume trees
  * we queue up extent allocations and backref maintenance for
@@ -422,6 +426,14 @@ again:
 	return 1;
 }

+void btrfs_release_ref_cluster(struct list_head *cluster)
+{
+	struct list_head *pos, *q;
+
+	list_for_each_safe(pos, q, cluster)
+		list_del_init(pos);
+}
+
 /*
  * helper function to update an extent delayed ref in the
  * rbtree. existing and update must both have the same
@@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 					ref->extent_op->flags_to_set;
 			existing_ref->extent_op->update_flags = 1;
 		}
-		kfree(ref->extent_op);
+		btrfs_free_delayed_extent_op(ref->extent_op);
 	}
 }
 /*
@@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
-		kfree(head_ref);
+		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 	} else {
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
@@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
-		kfree(full_ref);
+		kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
 	} else {
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
@@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
-		kfree(full_ref);
+		kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
 	} else {
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
| @@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, | |||
| 738 | struct btrfs_delayed_ref_root *delayed_refs; | 750 | struct btrfs_delayed_ref_root *delayed_refs; |
| 739 | 751 | ||
| 740 | BUG_ON(extent_op && extent_op->is_data); | 752 | BUG_ON(extent_op && extent_op->is_data); |
| 741 | ref = kmalloc(sizeof(*ref), GFP_NOFS); | 753 | ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); |
| 742 | if (!ref) | 754 | if (!ref) |
| 743 | return -ENOMEM; | 755 | return -ENOMEM; |
| 744 | 756 | ||
| 745 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | 757 | head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); |
| 746 | if (!head_ref) { | 758 | if (!head_ref) { |
| 747 | kfree(ref); | 759 | kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); |
| 748 | return -ENOMEM; | 760 | return -ENOMEM; |
| 749 | } | 761 | } |
| 750 | 762 | ||
| @@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, | |||
| 786 | struct btrfs_delayed_ref_root *delayed_refs; | 798 | struct btrfs_delayed_ref_root *delayed_refs; |
| 787 | 799 | ||
| 788 | BUG_ON(extent_op && !extent_op->is_data); | 800 | BUG_ON(extent_op && !extent_op->is_data); |
| 789 | ref = kmalloc(sizeof(*ref), GFP_NOFS); | 801 | ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); |
| 790 | if (!ref) | 802 | if (!ref) |
| 791 | return -ENOMEM; | 803 | return -ENOMEM; |
| 792 | 804 | ||
| 793 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | 805 | head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); |
| 794 | if (!head_ref) { | 806 | if (!head_ref) { |
| 795 | kfree(ref); | 807 | kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); |
| 796 | return -ENOMEM; | 808 | return -ENOMEM; |
| 797 | } | 809 | } |
| 798 | 810 | ||
| @@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, | |||
| 826 | struct btrfs_delayed_ref_head *head_ref; | 838 | struct btrfs_delayed_ref_head *head_ref; |
| 827 | struct btrfs_delayed_ref_root *delayed_refs; | 839 | struct btrfs_delayed_ref_root *delayed_refs; |
| 828 | 840 | ||
| 829 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | 841 | head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); |
| 830 | if (!head_ref) | 842 | if (!head_ref) |
| 831 | return -ENOMEM; | 843 | return -ENOMEM; |
| 832 | 844 | ||
| @@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) | |||
| 860 | return btrfs_delayed_node_to_head(ref); | 872 | return btrfs_delayed_node_to_head(ref); |
| 861 | return NULL; | 873 | return NULL; |
| 862 | } | 874 | } |
| 875 | |||
| 876 | void btrfs_delayed_ref_exit(void) | ||
| 877 | { | ||
| 878 | if (btrfs_delayed_ref_head_cachep) | ||
| 879 | kmem_cache_destroy(btrfs_delayed_ref_head_cachep); | ||
| 880 | if (btrfs_delayed_tree_ref_cachep) | ||
| 881 | kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); | ||
| 882 | if (btrfs_delayed_data_ref_cachep) | ||
| 883 | kmem_cache_destroy(btrfs_delayed_data_ref_cachep); | ||
| 884 | if (btrfs_delayed_extent_op_cachep) | ||
| 885 | kmem_cache_destroy(btrfs_delayed_extent_op_cachep); | ||
| 886 | } | ||
| 887 | |||
| 888 | int btrfs_delayed_ref_init(void) | ||
| 889 | { | ||
| 890 | btrfs_delayed_ref_head_cachep = kmem_cache_create( | ||
| 891 | "btrfs_delayed_ref_head", | ||
| 892 | sizeof(struct btrfs_delayed_ref_head), 0, | ||
| 893 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | ||
| 894 | if (!btrfs_delayed_ref_head_cachep) | ||
| 895 | goto fail; | ||
| 896 | |||
| 897 | btrfs_delayed_tree_ref_cachep = kmem_cache_create( | ||
| 898 | "btrfs_delayed_tree_ref", | ||
| 899 | sizeof(struct btrfs_delayed_tree_ref), 0, | ||
| 900 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | ||
| 901 | if (!btrfs_delayed_tree_ref_cachep) | ||
| 902 | goto fail; | ||
| 903 | |||
| 904 | btrfs_delayed_data_ref_cachep = kmem_cache_create( | ||
| 905 | "btrfs_delayed_data_ref", | ||
| 906 | sizeof(struct btrfs_delayed_data_ref), 0, | ||
| 907 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | ||
| 908 | if (!btrfs_delayed_data_ref_cachep) | ||
| 909 | goto fail; | ||
| 910 | |||
| 911 | btrfs_delayed_extent_op_cachep = kmem_cache_create( | ||
| 912 | "btrfs_delayed_extent_op", | ||
| 913 | sizeof(struct btrfs_delayed_extent_op), 0, | ||
| 914 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | ||
| 915 | if (!btrfs_delayed_extent_op_cachep) | ||
| 916 | goto fail; | ||
| 917 | |||
| 918 | return 0; | ||
| 919 | fail: | ||
| 920 | btrfs_delayed_ref_exit(); | ||
| 921 | return -ENOMEM; | ||
| 922 | } | ||
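
The btrfs_delayed_ref_init()/btrfs_delayed_ref_exit() pair above follows the usual slab-cache lifecycle: create every cache up front and, on any failure, fall back to the exit path, which tolerates partial initialization by checking each pointer before destroying it (kmem_cache_destroy() is not NULL-safe on these kernels). A reduced sketch with a single hypothetical cache:

    #include <linux/errno.h>
    #include <linux/slab.h>
    #include <linux/types.h>

    struct demo_ref { u64 bytenr; };

    static struct kmem_cache *demo_ref_cachep;

    static void demo_ref_exit(void)
    {
        if (demo_ref_cachep)
            kmem_cache_destroy(demo_ref_cachep);
    }

    static int demo_ref_init(void)
    {
        demo_ref_cachep = kmem_cache_create("demo_ref",
                sizeof(struct demo_ref), 0,
                SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
        if (!demo_ref_cachep) {
            demo_ref_exit();    /* unwinds whatever was created */
            return -ENOMEM;
        }
        return 0;
    }
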
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c9d703693df0..f75fcaf79aeb 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h | |||
| @@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root { | |||
| 132 | unsigned long num_heads_ready; | 132 | unsigned long num_heads_ready; |
| 133 | 133 | ||
| 134 | /* | 134 | /* |
| 135 | * bumped when someone is making progress on the delayed | ||
| 136 | * refs, so that other procs know they are just adding to | ||
| 137 | * contention instead of helping | ||
| 138 | */ | ||
| 139 | atomic_t procs_running_refs; | ||
| 140 | atomic_t ref_seq; | ||
| 141 | wait_queue_head_t wait; | ||
| 142 | |||
| 143 | /* | ||
| 135 | * set when the tree is flushing before a transaction commit, | 144 | * set when the tree is flushing before a transaction commit, |
| 136 | * used by the throttling code to decide if new updates need | 145 | * used by the throttling code to decide if new updates need |
| 137 | * to be run right away | 146 | * to be run right away |
| @@ -141,12 +150,47 @@ struct btrfs_delayed_ref_root { | |||
| 141 | u64 run_delayed_start; | 150 | u64 run_delayed_start; |
| 142 | }; | 151 | }; |
| 143 | 152 | ||
| 153 | extern struct kmem_cache *btrfs_delayed_ref_head_cachep; | ||
| 154 | extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; | ||
| 155 | extern struct kmem_cache *btrfs_delayed_data_ref_cachep; | ||
| 156 | extern struct kmem_cache *btrfs_delayed_extent_op_cachep; | ||
| 157 | |||
| 158 | int btrfs_delayed_ref_init(void); | ||
| 159 | void btrfs_delayed_ref_exit(void); | ||
| 160 | |||
| 161 | static inline struct btrfs_delayed_extent_op * | ||
| 162 | btrfs_alloc_delayed_extent_op(void) | ||
| 163 | { | ||
| 164 | return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); | ||
| 165 | } | ||
| 166 | |||
| 167 | static inline void | ||
| 168 | btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) | ||
| 169 | { | ||
| 170 | if (op) | ||
| 171 | kmem_cache_free(btrfs_delayed_extent_op_cachep, op); | ||
| 172 | } | ||
| 173 | |||
| 144 | static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) | 174 | static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) |
| 145 | { | 175 | { |
| 146 | WARN_ON(atomic_read(&ref->refs) == 0); | 176 | WARN_ON(atomic_read(&ref->refs) == 0); |
| 147 | if (atomic_dec_and_test(&ref->refs)) { | 177 | if (atomic_dec_and_test(&ref->refs)) { |
| 148 | WARN_ON(ref->in_tree); | 178 | WARN_ON(ref->in_tree); |
| 149 | kfree(ref); | 179 | switch (ref->type) { |
| 180 | case BTRFS_TREE_BLOCK_REF_KEY: | ||
| 181 | case BTRFS_SHARED_BLOCK_REF_KEY: | ||
| 182 | kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); | ||
| 183 | break; | ||
| 184 | case BTRFS_EXTENT_DATA_REF_KEY: | ||
| 185 | case BTRFS_SHARED_DATA_REF_KEY: | ||
| 186 | kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); | ||
| 187 | break; | ||
| 188 | case 0: | ||
| 189 | kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); | ||
| 190 | break; | ||
| 191 | default: | ||
| 192 | BUG(); | ||
| 193 | } | ||
| 150 | } | 194 | } |
| 151 | } | 195 | } |
| 152 | 196 | ||
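
Since delayed refs now come from three different caches, btrfs_put_delayed_ref() has to dispatch on ref->type to hand each object back to the slab it was allocated from (type 0 marks a ref head). The same tagged-free idiom in isolation (all names hypothetical):

    #include <linux/bug.h>
    #include <linux/slab.h>

    enum node_kind { NODE_TREE, NODE_DATA };

    struct node {
        enum node_kind kind;
    };

    static struct kmem_cache *tree_cachep, *data_cachep;

    static void node_free(struct node *n)
    {
        switch (n->kind) {
        case NODE_TREE:
            kmem_cache_free(tree_cachep, n);
            break;
        case NODE_DATA:
            kmem_cache_free(data_cachep, n);
            break;
        default:
            BUG();    /* object freed to the wrong allocator */
        }
    }

The btrfs_free_delayed_extent_op() wrapper keeps an explicit NULL check because, unlike kfree(), kmem_cache_free() must not be passed NULL.
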
| @@ -176,8 +220,14 @@ struct btrfs_delayed_ref_head * | |||
| 176 | btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); | 220 | btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); |
| 177 | int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, | 221 | int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, |
| 178 | struct btrfs_delayed_ref_head *head); | 222 | struct btrfs_delayed_ref_head *head); |
| 223 | static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) | ||
| 224 | { | ||
| 225 | mutex_unlock(&head->mutex); | ||
| 226 | } | ||
| 227 | |||
| 179 | int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, | 228 | int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, |
| 180 | struct list_head *cluster, u64 search_start); | 229 | struct list_head *cluster, u64 search_start); |
| 230 | void btrfs_release_ref_cluster(struct list_head *cluster); | ||
| 181 | 231 | ||
| 182 | int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, | 232 | int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, |
| 183 | struct btrfs_delayed_ref_root *delayed_refs, | 233 | struct btrfs_delayed_ref_root *delayed_refs, |
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 66dbc8dbddf7..7ba7b3900cb8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c | |||
| @@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, | |||
| 465 | * flush all outstanding I/O and inode extent mappings before the | 465 | * flush all outstanding I/O and inode extent mappings before the |
| 466 | * copy operation is declared as being finished | 466 | * copy operation is declared as being finished |
| 467 | */ | 467 | */ |
| 468 | btrfs_start_delalloc_inodes(root, 0); | 468 | ret = btrfs_start_delalloc_inodes(root, 0); |
| 469 | if (ret) { | ||
| 470 | mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); | ||
| 471 | return ret; | ||
| 472 | } | ||
| 469 | btrfs_wait_ordered_extents(root, 0); | 473 | btrfs_wait_ordered_extents(root, 0); |
| 470 | 474 | ||
| 471 | trans = btrfs_start_transaction(root, 0); | 475 | trans = btrfs_start_transaction(root, 0); |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a8f652dc940b..02369a3c162e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
| @@ -46,6 +46,7 @@ | |||
| 46 | #include "check-integrity.h" | 46 | #include "check-integrity.h" |
| 47 | #include "rcu-string.h" | 47 | #include "rcu-string.h" |
| 48 | #include "dev-replace.h" | 48 | #include "dev-replace.h" |
| 49 | #include "raid56.h" | ||
| 49 | 50 | ||
| 50 | #ifdef CONFIG_X86 | 51 | #ifdef CONFIG_X86 |
| 51 | #include <asm/cpufeature.h> | 52 | #include <asm/cpufeature.h> |
| @@ -56,7 +57,8 @@ static void end_workqueue_fn(struct btrfs_work *work); | |||
| 56 | static void free_fs_root(struct btrfs_root *root); | 57 | static void free_fs_root(struct btrfs_root *root); |
| 57 | static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, | 58 | static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, |
| 58 | int read_only); | 59 | int read_only); |
| 59 | static void btrfs_destroy_ordered_operations(struct btrfs_root *root); | 60 | static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, |
| 61 | struct btrfs_root *root); | ||
| 60 | static void btrfs_destroy_ordered_extents(struct btrfs_root *root); | 62 | static void btrfs_destroy_ordered_extents(struct btrfs_root *root); |
| 61 | static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, | 63 | static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, |
| 62 | struct btrfs_root *root); | 64 | struct btrfs_root *root); |
| @@ -420,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, | |||
| 420 | static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) | 422 | static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) |
| 421 | { | 423 | { |
| 422 | struct extent_io_tree *tree; | 424 | struct extent_io_tree *tree; |
| 423 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 425 | u64 start = page_offset(page); |
| 424 | u64 found_start; | 426 | u64 found_start; |
| 425 | struct extent_buffer *eb; | 427 | struct extent_buffer *eb; |
| 426 | 428 | ||
| @@ -639,8 +641,15 @@ err: | |||
| 639 | btree_readahead_hook(root, eb, eb->start, ret); | 641 | btree_readahead_hook(root, eb, eb->start, ret); |
| 640 | } | 642 | } |
| 641 | 643 | ||
| 642 | if (ret) | 644 | if (ret) { |
| 645 | /* | ||
| 646 | * our io error hook is going to dec the io pages | ||
| 647 | * again, we have to make sure it has something | ||
| 648 | * to decrement | ||
| 649 | */ | ||
| 650 | atomic_inc(&eb->io_pages); | ||
| 643 | clear_extent_buffer_uptodate(eb); | 651 | clear_extent_buffer_uptodate(eb); |
| 652 | } | ||
| 644 | free_extent_buffer(eb); | 653 | free_extent_buffer(eb); |
| 645 | out: | 654 | out: |
| 646 | return ret; | 655 | return ret; |
| @@ -654,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) | |||
| 654 | eb = (struct extent_buffer *)page->private; | 663 | eb = (struct extent_buffer *)page->private; |
| 655 | set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); | 664 | set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); |
| 656 | eb->read_mirror = failed_mirror; | 665 | eb->read_mirror = failed_mirror; |
| 666 | atomic_dec(&eb->io_pages); | ||
| 657 | if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) | 667 | if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) |
| 658 | btree_readahead_hook(root, eb, eb->start, -EIO); | 668 | btree_readahead_hook(root, eb, eb->start, -EIO); |
| 659 | return -EIO; /* we fixed nothing */ | 669 | return -EIO; /* we fixed nothing */ |
| @@ -670,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err) | |||
| 670 | end_io_wq->work.flags = 0; | 680 | end_io_wq->work.flags = 0; |
| 671 | 681 | ||
| 672 | if (bio->bi_rw & REQ_WRITE) { | 682 | if (bio->bi_rw & REQ_WRITE) { |
| 673 | if (end_io_wq->metadata == 1) | 683 | if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) |
| 674 | btrfs_queue_worker(&fs_info->endio_meta_write_workers, | 684 | btrfs_queue_worker(&fs_info->endio_meta_write_workers, |
| 675 | &end_io_wq->work); | 685 | &end_io_wq->work); |
| 676 | else if (end_io_wq->metadata == 2) | 686 | else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) |
| 677 | btrfs_queue_worker(&fs_info->endio_freespace_worker, | 687 | btrfs_queue_worker(&fs_info->endio_freespace_worker, |
| 678 | &end_io_wq->work); | 688 | &end_io_wq->work); |
| 689 | else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) | ||
| 690 | btrfs_queue_worker(&fs_info->endio_raid56_workers, | ||
| 691 | &end_io_wq->work); | ||
| 679 | else | 692 | else |
| 680 | btrfs_queue_worker(&fs_info->endio_write_workers, | 693 | btrfs_queue_worker(&fs_info->endio_write_workers, |
| 681 | &end_io_wq->work); | 694 | &end_io_wq->work); |
| 682 | } else { | 695 | } else { |
| 683 | if (end_io_wq->metadata) | 696 | if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) |
| 697 | btrfs_queue_worker(&fs_info->endio_raid56_workers, | ||
| 698 | &end_io_wq->work); | ||
| 699 | else if (end_io_wq->metadata) | ||
| 684 | btrfs_queue_worker(&fs_info->endio_meta_workers, | 700 | btrfs_queue_worker(&fs_info->endio_meta_workers, |
| 685 | &end_io_wq->work); | 701 | &end_io_wq->work); |
| 686 | else | 702 | else |
| @@ -695,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err) | |||
| 695 | * 0 - if data | 711 | * 0 - if data |
| 696 | * 1 - if normal metadata | 712 | * 1 - if normal metadata |
| 697 | * 2 - if writing to the free space cache area | 713 | * 2 - if writing to the free space cache area |
| 714 | * 3 - raid parity work | ||
| 698 | */ | 715 | */ |
| 699 | int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, | 716 | int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, |
| 700 | int metadata) | 717 | int metadata) |
| @@ -946,18 +963,20 @@ static int btree_writepages(struct address_space *mapping, | |||
| 946 | struct writeback_control *wbc) | 963 | struct writeback_control *wbc) |
| 947 | { | 964 | { |
| 948 | struct extent_io_tree *tree; | 965 | struct extent_io_tree *tree; |
| 966 | struct btrfs_fs_info *fs_info; | ||
| 967 | int ret; | ||
| 968 | |||
| 949 | tree = &BTRFS_I(mapping->host)->io_tree; | 969 | tree = &BTRFS_I(mapping->host)->io_tree; |
| 950 | if (wbc->sync_mode == WB_SYNC_NONE) { | 970 | if (wbc->sync_mode == WB_SYNC_NONE) { |
| 951 | struct btrfs_root *root = BTRFS_I(mapping->host)->root; | ||
| 952 | u64 num_dirty; | ||
| 953 | unsigned long thresh = 32 * 1024 * 1024; | ||
| 954 | 971 | ||
| 955 | if (wbc->for_kupdate) | 972 | if (wbc->for_kupdate) |
| 956 | return 0; | 973 | return 0; |
| 957 | 974 | ||
| 975 | fs_info = BTRFS_I(mapping->host)->root->fs_info; | ||
| 958 | /* this is a bit racy, but that's ok */ | 976 | /* this is a bit racy, but that's ok */ |
| 959 | num_dirty = root->fs_info->dirty_metadata_bytes; | 977 | ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, |
| 960 | if (num_dirty < thresh) | 978 | BTRFS_DIRTY_METADATA_THRESH); |
| 979 | if (ret < 0) | ||
| 961 | return 0; | 980 | return 0; |
| 962 | } | 981 | } |
| 963 | return btree_write_cache_pages(mapping, wbc); | 982 | return btree_write_cache_pages(mapping, wbc); |
| @@ -1125,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, | |||
| 1125 | void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 1144 | void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
| 1126 | struct extent_buffer *buf) | 1145 | struct extent_buffer *buf) |
| 1127 | { | 1146 | { |
| 1147 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
| 1148 | |||
| 1128 | if (btrfs_header_generation(buf) == | 1149 | if (btrfs_header_generation(buf) == |
| 1129 | root->fs_info->running_transaction->transid) { | 1150 | fs_info->running_transaction->transid) { |
| 1130 | btrfs_assert_tree_locked(buf); | 1151 | btrfs_assert_tree_locked(buf); |
| 1131 | 1152 | ||
| 1132 | if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { | 1153 | if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { |
| 1133 | spin_lock(&root->fs_info->delalloc_lock); | 1154 | __percpu_counter_add(&fs_info->dirty_metadata_bytes, |
| 1134 | if (root->fs_info->dirty_metadata_bytes >= buf->len) | 1155 | -buf->len, |
| 1135 | root->fs_info->dirty_metadata_bytes -= buf->len; | 1156 | fs_info->dirty_metadata_batch); |
| 1136 | else { | ||
| 1137 | spin_unlock(&root->fs_info->delalloc_lock); | ||
| 1138 | btrfs_panic(root->fs_info, -EOVERFLOW, | ||
| 1139 | "Can't clear %lu bytes from " | ||
| 1140 | " dirty_mdatadata_bytes (%llu)", | ||
| 1141 | buf->len, | ||
| 1142 | root->fs_info->dirty_metadata_bytes); | ||
| 1143 | } | ||
| 1144 | spin_unlock(&root->fs_info->delalloc_lock); | ||
| 1145 | |||
| 1146 | /* ugh, clear_extent_buffer_dirty needs to lock the page */ | 1157 | /* ugh, clear_extent_buffer_dirty needs to lock the page */ |
| 1147 | btrfs_set_lock_blocking(buf); | 1158 | btrfs_set_lock_blocking(buf); |
| 1148 | clear_extent_buffer_dirty(buf); | 1159 | clear_extent_buffer_dirty(buf); |
| @@ -1178,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, | |||
| 1178 | 1189 | ||
| 1179 | INIT_LIST_HEAD(&root->dirty_list); | 1190 | INIT_LIST_HEAD(&root->dirty_list); |
| 1180 | INIT_LIST_HEAD(&root->root_list); | 1191 | INIT_LIST_HEAD(&root->root_list); |
| 1192 | INIT_LIST_HEAD(&root->logged_list[0]); | ||
| 1193 | INIT_LIST_HEAD(&root->logged_list[1]); | ||
| 1181 | spin_lock_init(&root->orphan_lock); | 1194 | spin_lock_init(&root->orphan_lock); |
| 1182 | spin_lock_init(&root->inode_lock); | 1195 | spin_lock_init(&root->inode_lock); |
| 1183 | spin_lock_init(&root->accounting_lock); | 1196 | spin_lock_init(&root->accounting_lock); |
| 1197 | spin_lock_init(&root->log_extents_lock[0]); | ||
| 1198 | spin_lock_init(&root->log_extents_lock[1]); | ||
| 1184 | mutex_init(&root->objectid_mutex); | 1199 | mutex_init(&root->objectid_mutex); |
| 1185 | mutex_init(&root->log_mutex); | 1200 | mutex_init(&root->log_mutex); |
| 1186 | init_waitqueue_head(&root->log_writer_wait); | 1201 | init_waitqueue_head(&root->log_writer_wait); |
| @@ -2004,10 +2019,24 @@ int open_ctree(struct super_block *sb, | |||
| 2004 | goto fail_srcu; | 2019 | goto fail_srcu; |
| 2005 | } | 2020 | } |
| 2006 | 2021 | ||
| 2022 | ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); | ||
| 2023 | if (ret) { | ||
| 2024 | err = ret; | ||
| 2025 | goto fail_bdi; | ||
| 2026 | } | ||
| 2027 | fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * | ||
| 2028 | (1 + ilog2(nr_cpu_ids)); | ||
| 2029 | |||
| 2030 | ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); | ||
| 2031 | if (ret) { | ||
| 2032 | err = ret; | ||
| 2033 | goto fail_dirty_metadata_bytes; | ||
| 2034 | } | ||
| 2035 | |||
| 2007 | fs_info->btree_inode = new_inode(sb); | 2036 | fs_info->btree_inode = new_inode(sb); |
| 2008 | if (!fs_info->btree_inode) { | 2037 | if (!fs_info->btree_inode) { |
| 2009 | err = -ENOMEM; | 2038 | err = -ENOMEM; |
| 2010 | goto fail_bdi; | 2039 | goto fail_delalloc_bytes; |
| 2011 | } | 2040 | } |
| 2012 | 2041 | ||
| 2013 | mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); | 2042 | mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); |
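
The two percpu counters are created before the btree inode so every later failure path can unwind them in reverse order through the new fail_delalloc_bytes and fail_dirty_metadata_bytes labels. The ordering, reduced to a sketch:

    #include <linux/percpu_counter.h>

    static int counters_init(struct btrfs_fs_info *fs_info)
    {
        int ret;

        ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
        if (ret)
            return ret;

        ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
        if (ret)
            goto fail_dirty_metadata_bytes;

        return 0;

    fail_dirty_metadata_bytes:
        /* release only what was set up before the failure */
        percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
        return ret;
    }
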
| @@ -2017,7 +2046,6 @@ int open_ctree(struct super_block *sb, | |||
| 2017 | INIT_LIST_HEAD(&fs_info->dead_roots); | 2046 | INIT_LIST_HEAD(&fs_info->dead_roots); |
| 2018 | INIT_LIST_HEAD(&fs_info->delayed_iputs); | 2047 | INIT_LIST_HEAD(&fs_info->delayed_iputs); |
| 2019 | INIT_LIST_HEAD(&fs_info->delalloc_inodes); | 2048 | INIT_LIST_HEAD(&fs_info->delalloc_inodes); |
| 2020 | INIT_LIST_HEAD(&fs_info->ordered_operations); | ||
| 2021 | INIT_LIST_HEAD(&fs_info->caching_block_groups); | 2049 | INIT_LIST_HEAD(&fs_info->caching_block_groups); |
| 2022 | spin_lock_init(&fs_info->delalloc_lock); | 2050 | spin_lock_init(&fs_info->delalloc_lock); |
| 2023 | spin_lock_init(&fs_info->trans_lock); | 2051 | spin_lock_init(&fs_info->trans_lock); |
| @@ -2028,6 +2056,7 @@ int open_ctree(struct super_block *sb, | |||
| 2028 | spin_lock_init(&fs_info->tree_mod_seq_lock); | 2056 | spin_lock_init(&fs_info->tree_mod_seq_lock); |
| 2029 | rwlock_init(&fs_info->tree_mod_log_lock); | 2057 | rwlock_init(&fs_info->tree_mod_log_lock); |
| 2030 | mutex_init(&fs_info->reloc_mutex); | 2058 | mutex_init(&fs_info->reloc_mutex); |
| 2059 | seqlock_init(&fs_info->profiles_lock); | ||
| 2031 | 2060 | ||
| 2032 | init_completion(&fs_info->kobj_unregister); | 2061 | init_completion(&fs_info->kobj_unregister); |
| 2033 | INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); | 2062 | INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); |
| @@ -2126,6 +2155,7 @@ int open_ctree(struct super_block *sb, | |||
| 2126 | 2155 | ||
| 2127 | spin_lock_init(&fs_info->block_group_cache_lock); | 2156 | spin_lock_init(&fs_info->block_group_cache_lock); |
| 2128 | fs_info->block_group_cache_tree = RB_ROOT; | 2157 | fs_info->block_group_cache_tree = RB_ROOT; |
| 2158 | fs_info->first_logical_byte = (u64)-1; | ||
| 2129 | 2159 | ||
| 2130 | extent_io_tree_init(&fs_info->freed_extents[0], | 2160 | extent_io_tree_init(&fs_info->freed_extents[0], |
| 2131 | fs_info->btree_inode->i_mapping); | 2161 | fs_info->btree_inode->i_mapping); |
| @@ -2165,6 +2195,12 @@ int open_ctree(struct super_block *sb, | |||
| 2165 | init_waitqueue_head(&fs_info->transaction_blocked_wait); | 2195 | init_waitqueue_head(&fs_info->transaction_blocked_wait); |
| 2166 | init_waitqueue_head(&fs_info->async_submit_wait); | 2196 | init_waitqueue_head(&fs_info->async_submit_wait); |
| 2167 | 2197 | ||
| 2198 | ret = btrfs_alloc_stripe_hash_table(fs_info); | ||
| 2199 | if (ret) { | ||
| 2200 | err = ret; | ||
| 2201 | goto fail_alloc; | ||
| 2202 | } | ||
| 2203 | |||
| 2168 | __setup_root(4096, 4096, 4096, 4096, tree_root, | 2204 | __setup_root(4096, 4096, 4096, 4096, tree_root, |
| 2169 | fs_info, BTRFS_ROOT_TREE_OBJECTID); | 2205 | fs_info, BTRFS_ROOT_TREE_OBJECTID); |
| 2170 | 2206 | ||
| @@ -2187,7 +2223,8 @@ int open_ctree(struct super_block *sb, | |||
| 2187 | goto fail_alloc; | 2223 | goto fail_alloc; |
| 2188 | 2224 | ||
| 2189 | /* check FS state, whether FS is broken. */ | 2225 | /* check FS state, whether FS is broken. */ |
| 2190 | fs_info->fs_state |= btrfs_super_flags(disk_super); | 2226 | if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) |
| 2227 | set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); | ||
| 2191 | 2228 | ||
| 2192 | ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); | 2229 | ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); |
| 2193 | if (ret) { | 2230 | if (ret) { |
| @@ -2261,6 +2298,8 @@ int open_ctree(struct super_block *sb, | |||
| 2261 | leafsize = btrfs_super_leafsize(disk_super); | 2298 | leafsize = btrfs_super_leafsize(disk_super); |
| 2262 | sectorsize = btrfs_super_sectorsize(disk_super); | 2299 | sectorsize = btrfs_super_sectorsize(disk_super); |
| 2263 | stripesize = btrfs_super_stripesize(disk_super); | 2300 | stripesize = btrfs_super_stripesize(disk_super); |
| 2301 | fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); | ||
| 2302 | fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); | ||
| 2264 | 2303 | ||
| 2265 | /* | 2304 | /* |
| 2266 | * mixed block groups end up with duplicate but slightly offset | 2305 | * mixed block groups end up with duplicate but slightly offset |
| @@ -2332,6 +2371,12 @@ int open_ctree(struct super_block *sb, | |||
| 2332 | btrfs_init_workers(&fs_info->endio_meta_write_workers, | 2371 | btrfs_init_workers(&fs_info->endio_meta_write_workers, |
| 2333 | "endio-meta-write", fs_info->thread_pool_size, | 2372 | "endio-meta-write", fs_info->thread_pool_size, |
| 2334 | &fs_info->generic_worker); | 2373 | &fs_info->generic_worker); |
| 2374 | btrfs_init_workers(&fs_info->endio_raid56_workers, | ||
| 2375 | "endio-raid56", fs_info->thread_pool_size, | ||
| 2376 | &fs_info->generic_worker); | ||
| 2377 | btrfs_init_workers(&fs_info->rmw_workers, | ||
| 2378 | "rmw", fs_info->thread_pool_size, | ||
| 2379 | &fs_info->generic_worker); | ||
| 2335 | btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", | 2380 | btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", |
| 2336 | fs_info->thread_pool_size, | 2381 | fs_info->thread_pool_size, |
| 2337 | &fs_info->generic_worker); | 2382 | &fs_info->generic_worker); |
| @@ -2350,6 +2395,8 @@ int open_ctree(struct super_block *sb, | |||
| 2350 | */ | 2395 | */ |
| 2351 | fs_info->endio_workers.idle_thresh = 4; | 2396 | fs_info->endio_workers.idle_thresh = 4; |
| 2352 | fs_info->endio_meta_workers.idle_thresh = 4; | 2397 | fs_info->endio_meta_workers.idle_thresh = 4; |
| 2398 | fs_info->endio_raid56_workers.idle_thresh = 4; | ||
| 2399 | fs_info->rmw_workers.idle_thresh = 2; | ||
| 2353 | 2400 | ||
| 2354 | fs_info->endio_write_workers.idle_thresh = 2; | 2401 | fs_info->endio_write_workers.idle_thresh = 2; |
| 2355 | fs_info->endio_meta_write_workers.idle_thresh = 2; | 2402 | fs_info->endio_meta_write_workers.idle_thresh = 2; |
| @@ -2366,6 +2413,8 @@ int open_ctree(struct super_block *sb, | |||
| 2366 | ret |= btrfs_start_workers(&fs_info->fixup_workers); | 2413 | ret |= btrfs_start_workers(&fs_info->fixup_workers); |
| 2367 | ret |= btrfs_start_workers(&fs_info->endio_workers); | 2414 | ret |= btrfs_start_workers(&fs_info->endio_workers); |
| 2368 | ret |= btrfs_start_workers(&fs_info->endio_meta_workers); | 2415 | ret |= btrfs_start_workers(&fs_info->endio_meta_workers); |
| 2416 | ret |= btrfs_start_workers(&fs_info->rmw_workers); | ||
| 2417 | ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); | ||
| 2369 | ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); | 2418 | ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); |
| 2370 | ret |= btrfs_start_workers(&fs_info->endio_write_workers); | 2419 | ret |= btrfs_start_workers(&fs_info->endio_write_workers); |
| 2371 | ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); | 2420 | ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); |
| @@ -2390,8 +2439,7 @@ int open_ctree(struct super_block *sb, | |||
| 2390 | sb->s_blocksize = sectorsize; | 2439 | sb->s_blocksize = sectorsize; |
| 2391 | sb->s_blocksize_bits = blksize_bits(sectorsize); | 2440 | sb->s_blocksize_bits = blksize_bits(sectorsize); |
| 2392 | 2441 | ||
| 2393 | if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, | 2442 | if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) { |
| 2394 | sizeof(disk_super->magic))) { | ||
| 2395 | printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); | 2443 | printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); |
| 2396 | goto fail_sb_buffer; | 2444 | goto fail_sb_buffer; |
| 2397 | } | 2445 | } |
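
With BTRFS_MAGIC redefined as a u64 elsewhere in this series, the superblock magic becomes a single 64-bit compare: convert the host constant to on-disk (little-endian) order once instead of running strncmp() over the raw bytes. A minimal sketch:

    #include <asm/byteorder.h>
    #include <linux/types.h>

    /* BTRFS_MAGIC is 0x4D5F53665248425FULL, i.e. "_BHRfS_M" read as a
     * little-endian u64 */
    static bool super_magic_ok(__le64 disk_magic)
    {
        return disk_magic == cpu_to_le64(BTRFS_MAGIC);
    }
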
| @@ -2694,13 +2742,13 @@ fail_cleaner: | |||
| 2694 | * kthreads | 2742 | * kthreads |
| 2695 | */ | 2743 | */ |
| 2696 | filemap_write_and_wait(fs_info->btree_inode->i_mapping); | 2744 | filemap_write_and_wait(fs_info->btree_inode->i_mapping); |
| 2697 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | ||
| 2698 | 2745 | ||
| 2699 | fail_block_groups: | 2746 | fail_block_groups: |
| 2700 | btrfs_free_block_groups(fs_info); | 2747 | btrfs_free_block_groups(fs_info); |
| 2701 | 2748 | ||
| 2702 | fail_tree_roots: | 2749 | fail_tree_roots: |
| 2703 | free_root_pointers(fs_info, 1); | 2750 | free_root_pointers(fs_info, 1); |
| 2751 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | ||
| 2704 | 2752 | ||
| 2705 | fail_sb_buffer: | 2753 | fail_sb_buffer: |
| 2706 | btrfs_stop_workers(&fs_info->generic_worker); | 2754 | btrfs_stop_workers(&fs_info->generic_worker); |
| @@ -2710,6 +2758,8 @@ fail_sb_buffer: | |||
| 2710 | btrfs_stop_workers(&fs_info->workers); | 2758 | btrfs_stop_workers(&fs_info->workers); |
| 2711 | btrfs_stop_workers(&fs_info->endio_workers); | 2759 | btrfs_stop_workers(&fs_info->endio_workers); |
| 2712 | btrfs_stop_workers(&fs_info->endio_meta_workers); | 2760 | btrfs_stop_workers(&fs_info->endio_meta_workers); |
| 2761 | btrfs_stop_workers(&fs_info->endio_raid56_workers); | ||
| 2762 | btrfs_stop_workers(&fs_info->rmw_workers); | ||
| 2713 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); | 2763 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); |
| 2714 | btrfs_stop_workers(&fs_info->endio_write_workers); | 2764 | btrfs_stop_workers(&fs_info->endio_write_workers); |
| 2715 | btrfs_stop_workers(&fs_info->endio_freespace_worker); | 2765 | btrfs_stop_workers(&fs_info->endio_freespace_worker); |
| @@ -2721,13 +2771,17 @@ fail_alloc: | |||
| 2721 | fail_iput: | 2771 | fail_iput: |
| 2722 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | 2772 | btrfs_mapping_tree_free(&fs_info->mapping_tree); |
| 2723 | 2773 | ||
| 2724 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | ||
| 2725 | iput(fs_info->btree_inode); | 2774 | iput(fs_info->btree_inode); |
| 2775 | fail_delalloc_bytes: | ||
| 2776 | percpu_counter_destroy(&fs_info->delalloc_bytes); | ||
| 2777 | fail_dirty_metadata_bytes: | ||
| 2778 | percpu_counter_destroy(&fs_info->dirty_metadata_bytes); | ||
| 2726 | fail_bdi: | 2779 | fail_bdi: |
| 2727 | bdi_destroy(&fs_info->bdi); | 2780 | bdi_destroy(&fs_info->bdi); |
| 2728 | fail_srcu: | 2781 | fail_srcu: |
| 2729 | cleanup_srcu_struct(&fs_info->subvol_srcu); | 2782 | cleanup_srcu_struct(&fs_info->subvol_srcu); |
| 2730 | fail: | 2783 | fail: |
| 2784 | btrfs_free_stripe_hash_table(fs_info); | ||
| 2731 | btrfs_close_devices(fs_info->fs_devices); | 2785 | btrfs_close_devices(fs_info->fs_devices); |
| 2732 | return err; | 2786 | return err; |
| 2733 | 2787 | ||
| @@ -2795,8 +2849,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) | |||
| 2795 | 2849 | ||
| 2796 | super = (struct btrfs_super_block *)bh->b_data; | 2850 | super = (struct btrfs_super_block *)bh->b_data; |
| 2797 | if (btrfs_super_bytenr(super) != bytenr || | 2851 | if (btrfs_super_bytenr(super) != bytenr || |
| 2798 | strncmp((char *)(&super->magic), BTRFS_MAGIC, | 2852 | super->magic != cpu_to_le64(BTRFS_MAGIC)) { |
| 2799 | sizeof(super->magic))) { | ||
| 2800 | brelse(bh); | 2853 | brelse(bh); |
| 2801 | continue; | 2854 | continue; |
| 2802 | } | 2855 | } |
| @@ -3076,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( | |||
| 3076 | ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) | 3129 | ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) |
| 3077 | == 0))) | 3130 | == 0))) |
| 3078 | num_tolerated_disk_barrier_failures = 0; | 3131 | num_tolerated_disk_barrier_failures = 0; |
| 3079 | else if (num_tolerated_disk_barrier_failures > 1 | 3132 | else if (num_tolerated_disk_barrier_failures > 1) { |
| 3080 | && | 3133 | if (flags & (BTRFS_BLOCK_GROUP_RAID1 | |
| 3081 | (flags & (BTRFS_BLOCK_GROUP_RAID1 | | 3134 | BTRFS_BLOCK_GROUP_RAID5 | |
| 3082 | BTRFS_BLOCK_GROUP_RAID10))) | 3135 | BTRFS_BLOCK_GROUP_RAID10)) { |
| 3083 | num_tolerated_disk_barrier_failures = 1; | 3136 | num_tolerated_disk_barrier_failures = 1; |
| 3137 | } else if (flags & | ||
| 3138 | BTRFS_BLOCK_GROUP_RAID6) { | ||
| 3139 | num_tolerated_disk_barrier_failures = 2; | ||
| 3140 | } | ||
| 3141 | } | ||
| 3084 | } | 3142 | } |
| 3085 | } | 3143 | } |
| 3086 | up_read(&sinfo->groups_sem); | 3144 | up_read(&sinfo->groups_sem); |
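
The barrier logic above now distinguishes the parity profiles: RAID1/RAID5/RAID10 survive the loss of one device, RAID6 survives two, and the filesystem-wide value is still the minimum over all block groups. Restated as a per-profile helper (hypothetical; it ignores the min-over-groups pass and the metadata special cases):

    static int profile_barrier_tolerance(u64 flags)
    {
        if (flags & BTRFS_BLOCK_GROUP_RAID6)
            return 2;    /* two parity stripes per row */
        if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
                     BTRFS_BLOCK_GROUP_RAID5 |
                     BTRFS_BLOCK_GROUP_RAID10))
            return 1;    /* one mirror copy or one parity stripe */
        return 0;        /* single, dup, raid0 */
    }
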
| @@ -3195,6 +3253,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) | |||
| 3195 | if (btrfs_root_refs(&root->root_item) == 0) | 3253 | if (btrfs_root_refs(&root->root_item) == 0) |
| 3196 | synchronize_srcu(&fs_info->subvol_srcu); | 3254 | synchronize_srcu(&fs_info->subvol_srcu); |
| 3197 | 3255 | ||
| 3256 | if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { | ||
| 3257 | btrfs_free_log(NULL, root); | ||
| 3258 | btrfs_free_log_root_tree(NULL, fs_info); | ||
| 3259 | } | ||
| 3260 | |||
| 3198 | __btrfs_remove_free_space_cache(root->free_ino_pinned); | 3261 | __btrfs_remove_free_space_cache(root->free_ino_pinned); |
| 3199 | __btrfs_remove_free_space_cache(root->free_ino_ctl); | 3262 | __btrfs_remove_free_space_cache(root->free_ino_ctl); |
| 3200 | free_fs_root(root); | 3263 | free_fs_root(root); |
| @@ -3339,7 +3402,7 @@ int close_ctree(struct btrfs_root *root) | |||
| 3339 | printk(KERN_ERR "btrfs: commit super ret %d\n", ret); | 3402 | printk(KERN_ERR "btrfs: commit super ret %d\n", ret); |
| 3340 | } | 3403 | } |
| 3341 | 3404 | ||
| 3342 | if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) | 3405 | if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) |
| 3343 | btrfs_error_commit_super(root); | 3406 | btrfs_error_commit_super(root); |
| 3344 | 3407 | ||
| 3345 | btrfs_put_block_group_cache(fs_info); | 3408 | btrfs_put_block_group_cache(fs_info); |
| @@ -3352,9 +3415,9 @@ int close_ctree(struct btrfs_root *root) | |||
| 3352 | 3415 | ||
| 3353 | btrfs_free_qgroup_config(root->fs_info); | 3416 | btrfs_free_qgroup_config(root->fs_info); |
| 3354 | 3417 | ||
| 3355 | if (fs_info->delalloc_bytes) { | 3418 | if (percpu_counter_sum(&fs_info->delalloc_bytes)) { |
| 3356 | printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", | 3419 | printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", |
| 3357 | (unsigned long long)fs_info->delalloc_bytes); | 3420 | percpu_counter_sum(&fs_info->delalloc_bytes)); |
| 3358 | } | 3421 | } |
| 3359 | 3422 | ||
| 3360 | free_extent_buffer(fs_info->extent_root->node); | 3423 | free_extent_buffer(fs_info->extent_root->node); |
| @@ -3384,6 +3447,8 @@ int close_ctree(struct btrfs_root *root) | |||
| 3384 | btrfs_stop_workers(&fs_info->workers); | 3447 | btrfs_stop_workers(&fs_info->workers); |
| 3385 | btrfs_stop_workers(&fs_info->endio_workers); | 3448 | btrfs_stop_workers(&fs_info->endio_workers); |
| 3386 | btrfs_stop_workers(&fs_info->endio_meta_workers); | 3449 | btrfs_stop_workers(&fs_info->endio_meta_workers); |
| 3450 | btrfs_stop_workers(&fs_info->endio_raid56_workers); | ||
| 3451 | btrfs_stop_workers(&fs_info->rmw_workers); | ||
| 3387 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); | 3452 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); |
| 3388 | btrfs_stop_workers(&fs_info->endio_write_workers); | 3453 | btrfs_stop_workers(&fs_info->endio_write_workers); |
| 3389 | btrfs_stop_workers(&fs_info->endio_freespace_worker); | 3454 | btrfs_stop_workers(&fs_info->endio_freespace_worker); |
| @@ -3401,9 +3466,13 @@ int close_ctree(struct btrfs_root *root) | |||
| 3401 | btrfs_close_devices(fs_info->fs_devices); | 3466 | btrfs_close_devices(fs_info->fs_devices); |
| 3402 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | 3467 | btrfs_mapping_tree_free(&fs_info->mapping_tree); |
| 3403 | 3468 | ||
| 3469 | percpu_counter_destroy(&fs_info->dirty_metadata_bytes); | ||
| 3470 | percpu_counter_destroy(&fs_info->delalloc_bytes); | ||
| 3404 | bdi_destroy(&fs_info->bdi); | 3471 | bdi_destroy(&fs_info->bdi); |
| 3405 | cleanup_srcu_struct(&fs_info->subvol_srcu); | 3472 | cleanup_srcu_struct(&fs_info->subvol_srcu); |
| 3406 | 3473 | ||
| 3474 | btrfs_free_stripe_hash_table(fs_info); | ||
| 3475 | |||
| 3407 | return 0; | 3476 | return 0; |
| 3408 | } | 3477 | } |
| 3409 | 3478 | ||
| @@ -3443,11 +3512,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) | |||
| 3443 | (unsigned long long)transid, | 3512 | (unsigned long long)transid, |
| 3444 | (unsigned long long)root->fs_info->generation); | 3513 | (unsigned long long)root->fs_info->generation); |
| 3445 | was_dirty = set_extent_buffer_dirty(buf); | 3514 | was_dirty = set_extent_buffer_dirty(buf); |
| 3446 | if (!was_dirty) { | 3515 | if (!was_dirty) |
| 3447 | spin_lock(&root->fs_info->delalloc_lock); | 3516 | __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, |
| 3448 | root->fs_info->dirty_metadata_bytes += buf->len; | 3517 | buf->len, |
| 3449 | spin_unlock(&root->fs_info->delalloc_lock); | 3518 | root->fs_info->dirty_metadata_batch); |
| 3450 | } | ||
| 3451 | } | 3519 | } |
| 3452 | 3520 | ||
| 3453 | static void __btrfs_btree_balance_dirty(struct btrfs_root *root, | 3521 | static void __btrfs_btree_balance_dirty(struct btrfs_root *root, |
| @@ -3457,8 +3525,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, | |||
| 3457 | * looks as though older kernels can get into trouble with | 3525 | * looks as though older kernels can get into trouble with |
| 3458 | * this code, they end up stuck in balance_dirty_pages forever | 3526 | * this code, they end up stuck in balance_dirty_pages forever |
| 3459 | */ | 3527 | */ |
| 3460 | u64 num_dirty; | 3528 | int ret; |
| 3461 | unsigned long thresh = 32 * 1024 * 1024; | ||
| 3462 | 3529 | ||
| 3463 | if (current->flags & PF_MEMALLOC) | 3530 | if (current->flags & PF_MEMALLOC) |
| 3464 | return; | 3531 | return; |
| @@ -3466,9 +3533,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, | |||
| 3466 | if (flush_delayed) | 3533 | if (flush_delayed) |
| 3467 | btrfs_balance_delayed_items(root); | 3534 | btrfs_balance_delayed_items(root); |
| 3468 | 3535 | ||
| 3469 | num_dirty = root->fs_info->dirty_metadata_bytes; | 3536 | ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes, |
| 3470 | 3537 | BTRFS_DIRTY_METADATA_THRESH); | |
| 3471 | if (num_dirty > thresh) { | 3538 | if (ret > 0) { |
| 3472 | balance_dirty_pages_ratelimited( | 3539 | balance_dirty_pages_ratelimited( |
| 3473 | root->fs_info->btree_inode->i_mapping); | 3540 | root->fs_info->btree_inode->i_mapping); |
| 3474 | } | 3541 | } |
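
Both throttling sites now test the shared BTRFS_DIRTY_METADATA_THRESH (32 MiB in this series) through percpu_counter_compare(), which answers from the cheap approximate sum and only folds the per-cpu deltas when the value is close to the threshold. The test reduces to:

    #include <linux/percpu_counter.h>

    static bool over_dirty_metadata_thresh(struct btrfs_fs_info *fs_info)
    {
        return percpu_counter_compare(&fs_info->dirty_metadata_bytes,
                                      BTRFS_DIRTY_METADATA_THRESH) > 0;
    }
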
| @@ -3518,7 +3585,8 @@ void btrfs_error_commit_super(struct btrfs_root *root) | |||
| 3518 | btrfs_cleanup_transaction(root); | 3585 | btrfs_cleanup_transaction(root); |
| 3519 | } | 3586 | } |
| 3520 | 3587 | ||
| 3521 | static void btrfs_destroy_ordered_operations(struct btrfs_root *root) | 3588 | static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, |
| 3589 | struct btrfs_root *root) | ||
| 3522 | { | 3590 | { |
| 3523 | struct btrfs_inode *btrfs_inode; | 3591 | struct btrfs_inode *btrfs_inode; |
| 3524 | struct list_head splice; | 3592 | struct list_head splice; |
| @@ -3528,7 +3596,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) | |||
| 3528 | mutex_lock(&root->fs_info->ordered_operations_mutex); | 3596 | mutex_lock(&root->fs_info->ordered_operations_mutex); |
| 3529 | spin_lock(&root->fs_info->ordered_extent_lock); | 3597 | spin_lock(&root->fs_info->ordered_extent_lock); |
| 3530 | 3598 | ||
| 3531 | list_splice_init(&root->fs_info->ordered_operations, &splice); | 3599 | list_splice_init(&t->ordered_operations, &splice); |
| 3532 | while (!list_empty(&splice)) { | 3600 | while (!list_empty(&splice)) { |
| 3533 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, | 3601 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, |
| 3534 | ordered_operations); | 3602 | ordered_operations); |
| @@ -3544,35 +3612,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) | |||
| 3544 | 3612 | ||
| 3545 | static void btrfs_destroy_ordered_extents(struct btrfs_root *root) | 3613 | static void btrfs_destroy_ordered_extents(struct btrfs_root *root) |
| 3546 | { | 3614 | { |
| 3547 | struct list_head splice; | ||
| 3548 | struct btrfs_ordered_extent *ordered; | 3615 | struct btrfs_ordered_extent *ordered; |
| 3549 | struct inode *inode; | ||
| 3550 | |||
| 3551 | INIT_LIST_HEAD(&splice); | ||
| 3552 | 3616 | ||
| 3553 | spin_lock(&root->fs_info->ordered_extent_lock); | 3617 | spin_lock(&root->fs_info->ordered_extent_lock); |
| 3554 | 3618 | /* | |
| 3555 | list_splice_init(&root->fs_info->ordered_extents, &splice); | 3619 | * This will just short circuit the ordered completion stuff which will |
| 3556 | while (!list_empty(&splice)) { | 3620 | * make sure the ordered extent gets properly cleaned up. |
| 3557 | ordered = list_entry(splice.next, struct btrfs_ordered_extent, | 3621 | */ |
| 3558 | root_extent_list); | 3622 | list_for_each_entry(ordered, &root->fs_info->ordered_extents, |
| 3559 | 3623 | root_extent_list) | |
| 3560 | list_del_init(&ordered->root_extent_list); | 3624 | set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); |
| 3561 | atomic_inc(&ordered->refs); | ||
| 3562 | |||
| 3563 | /* the inode may be getting freed (in sys_unlink path). */ | ||
| 3564 | inode = igrab(ordered->inode); | ||
| 3565 | |||
| 3566 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
| 3567 | if (inode) | ||
| 3568 | iput(inode); | ||
| 3569 | |||
| 3570 | atomic_set(&ordered->refs, 1); | ||
| 3571 | btrfs_put_ordered_extent(ordered); | ||
| 3572 | |||
| 3573 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
| 3574 | } | ||
| 3575 | |||
| 3576 | spin_unlock(&root->fs_info->ordered_extent_lock); | 3625 | spin_unlock(&root->fs_info->ordered_extent_lock); |
| 3577 | } | 3626 | } |
| 3578 | 3627 | ||
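
Teardown no longer drops references by hand (the removed loop); it only tags each ordered extent and lets the normal completion path observe BTRFS_ORDERED_IOERR and do the cleanup. The fail-by-flag idiom in isolation (hypothetical names, bit 0 standing in for the IOERR bit):

    #include <linux/bitops.h>
    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct work_item {
        unsigned long flags;        /* bit 0: "failed", hypothetical */
        struct list_head list;
    };

    static void fail_all_pending(spinlock_t *lock, struct list_head *head)
    {
        struct work_item *w;

        spin_lock(lock);
        list_for_each_entry(w, head, list)
            set_bit(0, &w->flags);  /* completion path will see it */
        spin_unlock(lock);
    }
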
| @@ -3594,11 +3643,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, | |||
| 3594 | } | 3643 | } |
| 3595 | 3644 | ||
| 3596 | while ((node = rb_first(&delayed_refs->root)) != NULL) { | 3645 | while ((node = rb_first(&delayed_refs->root)) != NULL) { |
| 3597 | ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | 3646 | struct btrfs_delayed_ref_head *head = NULL; |
| 3598 | 3647 | ||
| 3648 | ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | ||
| 3599 | atomic_set(&ref->refs, 1); | 3649 | atomic_set(&ref->refs, 1); |
| 3600 | if (btrfs_delayed_ref_is_head(ref)) { | 3650 | if (btrfs_delayed_ref_is_head(ref)) { |
| 3601 | struct btrfs_delayed_ref_head *head; | ||
| 3602 | 3651 | ||
| 3603 | head = btrfs_delayed_node_to_head(ref); | 3652 | head = btrfs_delayed_node_to_head(ref); |
| 3604 | if (!mutex_trylock(&head->mutex)) { | 3653 | if (!mutex_trylock(&head->mutex)) { |
| @@ -3614,16 +3663,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, | |||
| 3614 | continue; | 3663 | continue; |
| 3615 | } | 3664 | } |
| 3616 | 3665 | ||
| 3617 | kfree(head->extent_op); | 3666 | btrfs_free_delayed_extent_op(head->extent_op); |
| 3618 | delayed_refs->num_heads--; | 3667 | delayed_refs->num_heads--; |
| 3619 | if (list_empty(&head->cluster)) | 3668 | if (list_empty(&head->cluster)) |
| 3620 | delayed_refs->num_heads_ready--; | 3669 | delayed_refs->num_heads_ready--; |
| 3621 | list_del_init(&head->cluster); | 3670 | list_del_init(&head->cluster); |
| 3622 | } | 3671 | } |
| 3672 | |||
| 3623 | ref->in_tree = 0; | 3673 | ref->in_tree = 0; |
| 3624 | rb_erase(&ref->rb_node, &delayed_refs->root); | 3674 | rb_erase(&ref->rb_node, &delayed_refs->root); |
| 3625 | delayed_refs->num_entries--; | 3675 | delayed_refs->num_entries--; |
| 3626 | 3676 | if (head) | |
| 3677 | mutex_unlock(&head->mutex); | ||
| 3627 | spin_unlock(&delayed_refs->lock); | 3678 | spin_unlock(&delayed_refs->lock); |
| 3628 | btrfs_put_delayed_ref(ref); | 3679 | btrfs_put_delayed_ref(ref); |
| 3629 | 3680 | ||
| @@ -3671,6 +3722,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) | |||
| 3671 | delalloc_inodes); | 3722 | delalloc_inodes); |
| 3672 | 3723 | ||
| 3673 | list_del_init(&btrfs_inode->delalloc_inodes); | 3724 | list_del_init(&btrfs_inode->delalloc_inodes); |
| 3725 | clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, | ||
| 3726 | &btrfs_inode->runtime_flags); | ||
| 3674 | 3727 | ||
| 3675 | btrfs_invalidate_inodes(btrfs_inode->root); | 3728 | btrfs_invalidate_inodes(btrfs_inode->root); |
| 3676 | } | 3729 | } |
| @@ -3823,10 +3876,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) | |||
| 3823 | 3876 | ||
| 3824 | while (!list_empty(&list)) { | 3877 | while (!list_empty(&list)) { |
| 3825 | t = list_entry(list.next, struct btrfs_transaction, list); | 3878 | t = list_entry(list.next, struct btrfs_transaction, list); |
| 3826 | if (!t) | ||
| 3827 | break; | ||
| 3828 | 3879 | ||
| 3829 | btrfs_destroy_ordered_operations(root); | 3880 | btrfs_destroy_ordered_operations(t, root); |
| 3830 | 3881 | ||
| 3831 | btrfs_destroy_ordered_extents(root); | 3882 | btrfs_destroy_ordered_extents(root); |
| 3832 | 3883 | ||
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 305c33efb0e3..034d7dc552b2 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h | |||
| @@ -25,6 +25,13 @@ | |||
| 25 | #define BTRFS_SUPER_MIRROR_MAX 3 | 25 | #define BTRFS_SUPER_MIRROR_MAX 3 |
| 26 | #define BTRFS_SUPER_MIRROR_SHIFT 12 | 26 | #define BTRFS_SUPER_MIRROR_SHIFT 12 |
| 27 | 27 | ||
| 28 | enum { | ||
| 29 | BTRFS_WQ_ENDIO_DATA = 0, | ||
| 30 | BTRFS_WQ_ENDIO_METADATA = 1, | ||
| 31 | BTRFS_WQ_ENDIO_FREE_SPACE = 2, | ||
| 32 | BTRFS_WQ_ENDIO_RAID56 = 3, | ||
| 33 | }; | ||
| 34 | |||
| 28 | static inline u64 btrfs_sb_offset(int mirror) | 35 | static inline u64 btrfs_sb_offset(int mirror) |
| 29 | { | 36 | { |
| 30 | u64 start = 16 * 1024; | 37 | u64 start = 16 * 1024; |
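
Callers of btrfs_bio_wq_end_io() now pass one of these named values instead of a bare 0/1/2, which is what lets the raid56 paths route completions to their own end-io queue. An illustrative (hypothetical) call site:

    static int queue_metadata_endio(struct btrfs_fs_info *info,
                                    struct bio *bio)
    {
        return btrfs_bio_wq_end_io(info, bio, BTRFS_WQ_ENDIO_METADATA);
    }
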
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cf54bdfee334..3e074dab2d57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include "print-tree.h" | 31 | #include "print-tree.h" |
| 32 | #include "transaction.h" | 32 | #include "transaction.h" |
| 33 | #include "volumes.h" | 33 | #include "volumes.h" |
| 34 | #include "raid56.h" | ||
| 34 | #include "locking.h" | 35 | #include "locking.h" |
| 35 | #include "free-space-cache.h" | 36 | #include "free-space-cache.h" |
| 36 | #include "math.h" | 37 | #include "math.h" |
| @@ -72,8 +73,7 @@ enum { | |||
| 72 | RESERVE_ALLOC_NO_ACCOUNT = 2, | 73 | RESERVE_ALLOC_NO_ACCOUNT = 2, |
| 73 | }; | 74 | }; |
| 74 | 75 | ||
| 75 | static int update_block_group(struct btrfs_trans_handle *trans, | 76 | static int update_block_group(struct btrfs_root *root, |
| 76 | struct btrfs_root *root, | ||
| 77 | u64 bytenr, u64 num_bytes, int alloc); | 77 | u64 bytenr, u64 num_bytes, int alloc); |
| 78 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, | 78 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, |
| 79 | struct btrfs_root *root, | 79 | struct btrfs_root *root, |
| @@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, | |||
| 103 | int dump_block_groups); | 103 | int dump_block_groups); |
| 104 | static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, | 104 | static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, |
| 105 | u64 num_bytes, int reserve); | 105 | u64 num_bytes, int reserve); |
| 106 | static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, | ||
| 107 | u64 num_bytes); | ||
| 106 | 108 | ||
| 107 | static noinline int | 109 | static noinline int |
| 108 | block_group_cache_done(struct btrfs_block_group_cache *cache) | 110 | block_group_cache_done(struct btrfs_block_group_cache *cache) |
| @@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, | |||
| 162 | rb_link_node(&block_group->cache_node, parent, p); | 164 | rb_link_node(&block_group->cache_node, parent, p); |
| 163 | rb_insert_color(&block_group->cache_node, | 165 | rb_insert_color(&block_group->cache_node, |
| 164 | &info->block_group_cache_tree); | 166 | &info->block_group_cache_tree); |
| 167 | |||
| 168 | if (info->first_logical_byte > block_group->key.objectid) | ||
| 169 | info->first_logical_byte = block_group->key.objectid; | ||
| 170 | |||
| 165 | spin_unlock(&info->block_group_cache_lock); | 171 | spin_unlock(&info->block_group_cache_lock); |
| 166 | 172 | ||
| 167 | return 0; | 173 | return 0; |
| @@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, | |||
| 203 | break; | 209 | break; |
| 204 | } | 210 | } |
| 205 | } | 211 | } |
| 206 | if (ret) | 212 | if (ret) { |
| 207 | btrfs_get_block_group(ret); | 213 | btrfs_get_block_group(ret); |
| 214 | if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) | ||
| 215 | info->first_logical_byte = ret->key.objectid; | ||
| 216 | } | ||
| 208 | spin_unlock(&info->block_group_cache_lock); | 217 | spin_unlock(&info->block_group_cache_lock); |
| 209 | 218 | ||
| 210 | return ret; | 219 | return ret; |
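
first_logical_byte caches the lowest block-group objectid, seeded to (u64)-1 at mount, so searches that start from bytenr 0 can be redirected to the first real block group without a tree walk. The maintenance, under the same lock that guards the rbtree, reduces to:

    static void track_first_logical_byte(struct btrfs_fs_info *info,
                                         u64 objectid)
    {
        spin_lock(&info->block_group_cache_lock);
        if (info->first_logical_byte > objectid)
            info->first_logical_byte = objectid;
        spin_unlock(&info->block_group_cache_lock);
    }
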
| @@ -468,8 +477,6 @@ out: | |||
| 468 | } | 477 | } |
| 469 | 478 | ||
| 470 | static int cache_block_group(struct btrfs_block_group_cache *cache, | 479 | static int cache_block_group(struct btrfs_block_group_cache *cache, |
| 471 | struct btrfs_trans_handle *trans, | ||
| 472 | struct btrfs_root *root, | ||
| 473 | int load_cache_only) | 480 | int load_cache_only) |
| 474 | { | 481 | { |
| 475 | DEFINE_WAIT(wait); | 482 | DEFINE_WAIT(wait); |
| @@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, | |||
| 527 | cache->cached = BTRFS_CACHE_FAST; | 534 | cache->cached = BTRFS_CACHE_FAST; |
| 528 | spin_unlock(&cache->lock); | 535 | spin_unlock(&cache->lock); |
| 529 | 536 | ||
| 530 | /* | ||
| 531 | * We can't do the read from on-disk cache during a commit since we need | ||
| 532 | * to have the normal tree locking. Also if we are currently trying to | ||
| 533 | * allocate blocks for the tree root we can't do the fast caching since | ||
| 534 | * we likely hold important locks. | ||
| 535 | */ | ||
| 536 | if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { | 537 | if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { |
| 537 | ret = load_free_space_cache(fs_info, cache); | 538 | ret = load_free_space_cache(fs_info, cache); |
| 538 | 539 | ||
| @@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | |||
| 1852 | *actual_bytes = discarded_bytes; | 1853 | *actual_bytes = discarded_bytes; |
| 1853 | 1854 | ||
| 1854 | 1855 | ||
| 1856 | if (ret == -EOPNOTSUPP) | ||
| 1857 | ret = 0; | ||
| 1855 | return ret; | 1858 | return ret; |
| 1856 | } | 1859 | } |
| 1857 | 1860 | ||
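
Swallowing -EOPNOTSUPP makes discard best-effort: a device without TRIM/UNMAP support no longer fails the extent-free path. The convention in isolation (a sketch, not the btrfs helper itself):

    #include <linux/blkdev.h>

    static int discard_best_effort(struct block_device *bdev,
                                   u64 start, u64 len)
    {
        int ret = blkdev_issue_discard(bdev, start >> 9, len >> 9,
                                       GFP_NOFS, 0);

        if (ret == -EOPNOTSUPP)
            ret = 0;    /* no discard support; not an error */
        return ret;
    }
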
| @@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 2143 | node->num_bytes); | 2146 | node->num_bytes); |
| 2144 | } | 2147 | } |
| 2145 | } | 2148 | } |
| 2146 | mutex_unlock(&head->mutex); | ||
| 2147 | return ret; | 2149 | return ret; |
| 2148 | } | 2150 | } |
| 2149 | 2151 | ||
| @@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
| 2258 | * process of being added. Don't run this ref yet. | 2260 | * process of being added. Don't run this ref yet. |
| 2259 | */ | 2261 | */ |
| 2260 | list_del_init(&locked_ref->cluster); | 2262 | list_del_init(&locked_ref->cluster); |
| 2261 | mutex_unlock(&locked_ref->mutex); | 2263 | btrfs_delayed_ref_unlock(locked_ref); |
| 2262 | locked_ref = NULL; | 2264 | locked_ref = NULL; |
| 2263 | delayed_refs->num_heads_ready++; | 2265 | delayed_refs->num_heads_ready++; |
| 2264 | spin_unlock(&delayed_refs->lock); | 2266 | spin_unlock(&delayed_refs->lock); |
| @@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
| 2285 | ref = &locked_ref->node; | 2287 | ref = &locked_ref->node; |
| 2286 | 2288 | ||
| 2287 | if (extent_op && must_insert_reserved) { | 2289 | if (extent_op && must_insert_reserved) { |
| 2288 | kfree(extent_op); | 2290 | btrfs_free_delayed_extent_op(extent_op); |
| 2289 | extent_op = NULL; | 2291 | extent_op = NULL; |
| 2290 | } | 2292 | } |
| 2291 | 2293 | ||
| @@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
| 2294 | 2296 | ||
| 2295 | ret = run_delayed_extent_op(trans, root, | 2297 | ret = run_delayed_extent_op(trans, root, |
| 2296 | ref, extent_op); | 2298 | ref, extent_op); |
| 2297 | kfree(extent_op); | 2299 | btrfs_free_delayed_extent_op(extent_op); |
| 2298 | 2300 | ||
| 2299 | if (ret) { | 2301 | if (ret) { |
| 2300 | list_del_init(&locked_ref->cluster); | 2302 | printk(KERN_DEBUG |
| 2301 | mutex_unlock(&locked_ref->mutex); | 2303 | "btrfs: run_delayed_extent_op " |
| 2302 | 2304 | "returned %d\n", ret); | |
| 2303 | printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); | ||
| 2304 | spin_lock(&delayed_refs->lock); | 2305 | spin_lock(&delayed_refs->lock); |
| 2306 | btrfs_delayed_ref_unlock(locked_ref); | ||
| 2305 | return ret; | 2307 | return ret; |
| 2306 | } | 2308 | } |
| 2307 | 2309 | ||
| 2308 | goto next; | 2310 | goto next; |
| 2309 | } | 2311 | } |
| 2310 | |||
| 2311 | list_del_init(&locked_ref->cluster); | ||
| 2312 | locked_ref = NULL; | ||
| 2313 | } | 2312 | } |
| 2314 | 2313 | ||
| 2315 | ref->in_tree = 0; | 2314 | ref->in_tree = 0; |
| 2316 | rb_erase(&ref->rb_node, &delayed_refs->root); | 2315 | rb_erase(&ref->rb_node, &delayed_refs->root); |
| 2317 | delayed_refs->num_entries--; | 2316 | delayed_refs->num_entries--; |
| 2318 | if (locked_ref) { | 2317 | if (!btrfs_delayed_ref_is_head(ref)) { |
| 2319 | /* | 2318 | /* |
| 2320 | * when we play the delayed ref, also correct the | 2319 | * when we play the delayed ref, also correct the |
| 2321 | * ref_mod on head | 2320 | * ref_mod on head |
| @@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
| 2337 | ret = run_one_delayed_ref(trans, root, ref, extent_op, | 2336 | ret = run_one_delayed_ref(trans, root, ref, extent_op, |
| 2338 | must_insert_reserved); | 2337 | must_insert_reserved); |
| 2339 | 2338 | ||
| 2340 | btrfs_put_delayed_ref(ref); | 2339 | btrfs_free_delayed_extent_op(extent_op); |
| 2341 | kfree(extent_op); | ||
| 2342 | count++; | ||
| 2343 | |||
| 2344 | if (ret) { | 2340 | if (ret) { |
| 2345 | if (locked_ref) { | 2341 | btrfs_delayed_ref_unlock(locked_ref); |
| 2346 | list_del_init(&locked_ref->cluster); | 2342 | btrfs_put_delayed_ref(ref); |
| 2347 | mutex_unlock(&locked_ref->mutex); | 2343 | printk(KERN_DEBUG |
| 2348 | } | 2344 | "btrfs: run_one_delayed_ref returned %d\n", ret); |
| 2349 | printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); | ||
| 2350 | spin_lock(&delayed_refs->lock); | 2345 | spin_lock(&delayed_refs->lock); |
| 2351 | return ret; | 2346 | return ret; |
| 2352 | } | 2347 | } |
| 2353 | 2348 | ||
| 2349 | /* | ||
| 2350 | * If this node is a head, that means all the refs in this head | ||
| 2351 | * have been dealt with, and we will pick the next head to deal | ||
| 2352 | * with, so we must unlock the head and drop it from the cluster | ||
| 2353 | * list before we release it. | ||
| 2354 | */ | ||
| 2355 | if (btrfs_delayed_ref_is_head(ref)) { | ||
| 2356 | list_del_init(&locked_ref->cluster); | ||
| 2357 | btrfs_delayed_ref_unlock(locked_ref); | ||
| 2358 | locked_ref = NULL; | ||
| 2359 | } | ||
| 2360 | btrfs_put_delayed_ref(ref); | ||
| 2361 | count++; | ||
| 2354 | next: | 2362 | next: |
| 2355 | cond_resched(); | 2363 | cond_resched(); |
| 2356 | spin_lock(&delayed_refs->lock); | 2364 | spin_lock(&delayed_refs->lock); |
| @@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, | |||
| 2435 | return ret; | 2443 | return ret; |
| 2436 | } | 2444 | } |
| 2437 | 2445 | ||
| 2446 | static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, | ||
| 2447 | int count) | ||
| 2448 | { | ||
| 2449 | int val = atomic_read(&delayed_refs->ref_seq); | ||
| 2450 | |||
| 2451 | if (val < seq || val >= seq + count) | ||
| 2452 | return 1; | ||
| 2453 | return 0; | ||
| 2454 | } | ||
| 2455 | |||
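The refs_newer() helper added above treats delayed_refs->ref_seq as a monotonically increasing count of processed refs and reports 1 once the counter has left the half-open window [seq, seq + count): either at least count refs ran since the snapshot, or the counter moved below it. A minimal userspace sketch of the same predicate (window_passed is an illustrative name, not a kernel symbol):

    #include <assert.h>

    /* same test as refs_newer(), on a plain int instead of atomic_t */
    static int window_passed(int now, int seq, int count)
    {
        return now < seq || now >= seq + count;
    }

    int main(void)
    {
        int seq = 100;                          /* snapshot before waiting */
        assert(!window_passed(100, seq, 256));  /* no progress yet         */
        assert(!window_passed(300, seq, 256));  /* progress, but < 256     */
        assert(window_passed(356, seq, 256));   /* >= 256 refs ran         */
        assert(window_passed(50, seq, 256));    /* counter moved below seq */
        return 0;
    }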
| 2438 | /* | 2456 | /* |
| 2439 | * this starts processing the delayed reference count updates and | 2457 | * this starts processing the delayed reference count updates and |
| 2440 | * extent insertions we have queued up so far. count can be | 2458 | * extent insertions we have queued up so far. count can be |
| @@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, | |||
| 2469 | 2487 | ||
| 2470 | delayed_refs = &trans->transaction->delayed_refs; | 2488 | delayed_refs = &trans->transaction->delayed_refs; |
| 2471 | INIT_LIST_HEAD(&cluster); | 2489 | INIT_LIST_HEAD(&cluster); |
| 2490 | if (count == 0) { | ||
| 2491 | count = delayed_refs->num_entries * 2; | ||
| 2492 | run_most = 1; | ||
| 2493 | } | ||
| 2494 | |||
| 2495 | if (!run_all && !run_most) { | ||
| 2496 | int old; | ||
| 2497 | int seq = atomic_read(&delayed_refs->ref_seq); | ||
| 2498 | |||
| 2499 | progress: | ||
| 2500 | old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); | ||
| 2501 | if (old) { | ||
| 2502 | DEFINE_WAIT(__wait); | ||
| 2503 | if (delayed_refs->num_entries < 16348) | ||
| 2504 | return 0; | ||
| 2505 | |||
| 2506 | prepare_to_wait(&delayed_refs->wait, &__wait, | ||
| 2507 | TASK_UNINTERRUPTIBLE); | ||
| 2508 | |||
| 2509 | old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); | ||
| 2510 | if (old) { | ||
| 2511 | schedule(); | ||
| 2512 | finish_wait(&delayed_refs->wait, &__wait); | ||
| 2513 | |||
| 2514 | if (!refs_newer(delayed_refs, seq, 256)) | ||
| 2515 | goto progress; | ||
| 2516 | else | ||
| 2517 | return 0; | ||
| 2518 | } else { | ||
| 2519 | finish_wait(&delayed_refs->wait, &__wait); | ||
| 2520 | goto again; | ||
| 2521 | } | ||
| 2522 | } | ||
| 2523 | |||
| 2524 | } else { | ||
| 2525 | atomic_inc(&delayed_refs->procs_running_refs); | ||
| 2526 | } | ||
| 2527 | |||
| 2472 | again: | 2528 | again: |
| 2473 | loops = 0; | 2529 | loops = 0; |
| 2474 | spin_lock(&delayed_refs->lock); | 2530 | spin_lock(&delayed_refs->lock); |
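The hunk above turns btrfs_run_delayed_refs() into a single-runner gate: the first caller wins the atomic_cmpxchg() on procs_running_refs and does the work, while later callers sleep on delayed_refs->wait until enough refs have run on their behalf, or retake the gate themselves. Below is a compilable userspace analogue using C11 atomics and a condition variable; the small-backlog early return (num_entries < 16348) is omitted, and all names are illustrative rather than kernel API:

    #include <pthread.h>
    #include <stdatomic.h>

    static atomic_int running;    /* plays procs_running_refs */
    static atomic_int progress;   /* plays ref_seq            */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;

    static void run_refs(void (*do_batch)(void))
    {
        int seq = atomic_load(&progress);   /* snapshot, as in the kernel */

        for (;;) {
            int expected = 0;
            if (atomic_compare_exchange_strong(&running, &expected, 1))
                break;                      /* gate won: we are the runner */

            /* someone else is running: sleep, as prepare_to_wait()/
             * schedule() do on delayed_refs->wait in the kernel */
            pthread_mutex_lock(&lock);
            if (atomic_load(&running))
                pthread_cond_wait(&waitq, &lock);
            pthread_mutex_unlock(&lock);

            /* enough refs run on our behalf?  done; else retry the gate */
            int now = atomic_load(&progress);
            if (now < seq || now >= seq + 256)
                return;
        }

        do_batch();                         /* run a cluster of refs      */
        atomic_fetch_add(&progress, 256);   /* atomic_add(ret, &ref_seq)  */

        pthread_mutex_lock(&lock);          /* drop the gate, wake waiters */
        atomic_store(&running, 0);
        pthread_cond_broadcast(&waitq);
        pthread_mutex_unlock(&lock);
    }

    static void batch(void) { /* delayed refs would run here */ }

    int main(void)
    {
        run_refs(batch);
        return 0;
    }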
| @@ -2477,10 +2533,6 @@ again: | |||
| 2477 | delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); | 2533 | delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); |
| 2478 | #endif | 2534 | #endif |
| 2479 | 2535 | ||
| 2480 | if (count == 0) { | ||
| 2481 | count = delayed_refs->num_entries * 2; | ||
| 2482 | run_most = 1; | ||
| 2483 | } | ||
| 2484 | while (1) { | 2536 | while (1) { |
| 2485 | if (!(run_all || run_most) && | 2537 | if (!(run_all || run_most) && |
| 2486 | delayed_refs->num_heads_ready < 64) | 2538 | delayed_refs->num_heads_ready < 64) |
| @@ -2500,11 +2552,15 @@ again: | |||
| 2500 | 2552 | ||
| 2501 | ret = run_clustered_refs(trans, root, &cluster); | 2553 | ret = run_clustered_refs(trans, root, &cluster); |
| 2502 | if (ret < 0) { | 2554 | if (ret < 0) { |
| 2555 | btrfs_release_ref_cluster(&cluster); | ||
| 2503 | spin_unlock(&delayed_refs->lock); | 2556 | spin_unlock(&delayed_refs->lock); |
| 2504 | btrfs_abort_transaction(trans, root, ret); | 2557 | btrfs_abort_transaction(trans, root, ret); |
| 2558 | atomic_dec(&delayed_refs->procs_running_refs); | ||
| 2505 | return ret; | 2559 | return ret; |
| 2506 | } | 2560 | } |
| 2507 | 2561 | ||
| 2562 | atomic_add(ret, &delayed_refs->ref_seq); | ||
| 2563 | |||
| 2508 | count -= min_t(unsigned long, ret, count); | 2564 | count -= min_t(unsigned long, ret, count); |
| 2509 | 2565 | ||
| 2510 | if (count == 0) | 2566 | if (count == 0) |
| @@ -2573,6 +2629,11 @@ again: | |||
| 2573 | goto again; | 2629 | goto again; |
| 2574 | } | 2630 | } |
| 2575 | out: | 2631 | out: |
| 2632 | atomic_dec(&delayed_refs->procs_running_refs); | ||
| 2633 | smp_mb(); | ||
| 2634 | if (waitqueue_active(&delayed_refs->wait)) | ||
| 2635 | wake_up(&delayed_refs->wait); | ||
| 2636 | |||
| 2576 | spin_unlock(&delayed_refs->lock); | 2637 | spin_unlock(&delayed_refs->lock); |
| 2577 | assert_qgroups_uptodate(trans); | 2638 | assert_qgroups_uptodate(trans); |
| 2578 | return 0; | 2639 | return 0; |
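The out: path above pairs atomic_dec() with an explicit smp_mb() before the lock-free waitqueue_active() test. The barrier matters because atomic_dec() carries no ordering: without it the decrement could become visible after the emptiness check, and a sleeper that registered in between would never be woken. A two-sided sketch of the idiom using C11 seq_cst atomics, which supply the barrier implicitly; the names are illustrative:

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int procs_running;
    static atomic_bool have_waiter;       /* plays waitqueue_active() */

    static void waker_side(void)
    {
        atomic_fetch_sub(&procs_running, 1);  /* the state change      */
        /* seq_cst ordering here does what smp_mb() does in the kernel:
         * the decrement is visible before we peek at the wait queue  */
        if (atomic_load(&have_waiter)) {
            /* wake_up(&delayed_refs->wait) goes here */
        }
    }

    static bool sleeper_should_block(void)
    {
        atomic_store(&have_waiter, true);     /* prepare_to_wait()     */
        /* re-check the condition only after registering as a waiter  */
        return atomic_load(&procs_running) != 0;
    }

    int main(void)
    {
        atomic_store(&procs_running, 1);
        if (sleeper_should_block())
            waker_side();                     /* would wake the sleeper */
        return 0;
    }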
| @@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, | |||
| 2586 | struct btrfs_delayed_extent_op *extent_op; | 2647 | struct btrfs_delayed_extent_op *extent_op; |
| 2587 | int ret; | 2648 | int ret; |
| 2588 | 2649 | ||
| 2589 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); | 2650 | extent_op = btrfs_alloc_delayed_extent_op(); |
| 2590 | if (!extent_op) | 2651 | if (!extent_op) |
| 2591 | return -ENOMEM; | 2652 | return -ENOMEM; |
| 2592 | 2653 | ||
| @@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, | |||
| 2598 | ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, | 2659 | ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, |
| 2599 | num_bytes, extent_op); | 2660 | num_bytes, extent_op); |
| 2600 | if (ret) | 2661 | if (ret) |
| 2601 | kfree(extent_op); | 2662 | btrfs_free_delayed_extent_op(extent_op); |
| 2602 | return ret; | 2663 | return ret; |
| 2603 | } | 2664 | } |
| 2604 | 2665 | ||
| @@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) | |||
| 3223 | u64 extra_flags = chunk_to_extended(flags) & | 3284 | u64 extra_flags = chunk_to_extended(flags) & |
| 3224 | BTRFS_EXTENDED_PROFILE_MASK; | 3285 | BTRFS_EXTENDED_PROFILE_MASK; |
| 3225 | 3286 | ||
| 3287 | write_seqlock(&fs_info->profiles_lock); | ||
| 3226 | if (flags & BTRFS_BLOCK_GROUP_DATA) | 3288 | if (flags & BTRFS_BLOCK_GROUP_DATA) |
| 3227 | fs_info->avail_data_alloc_bits |= extra_flags; | 3289 | fs_info->avail_data_alloc_bits |= extra_flags; |
| 3228 | if (flags & BTRFS_BLOCK_GROUP_METADATA) | 3290 | if (flags & BTRFS_BLOCK_GROUP_METADATA) |
| 3229 | fs_info->avail_metadata_alloc_bits |= extra_flags; | 3291 | fs_info->avail_metadata_alloc_bits |= extra_flags; |
| 3230 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | 3292 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) |
| 3231 | fs_info->avail_system_alloc_bits |= extra_flags; | 3293 | fs_info->avail_system_alloc_bits |= extra_flags; |
| 3294 | write_sequnlock(&fs_info->profiles_lock); | ||
| 3232 | } | 3295 | } |
| 3233 | 3296 | ||
| 3234 | /* | 3297 | /* |
| @@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) | |||
| 3276 | u64 num_devices = root->fs_info->fs_devices->rw_devices + | 3339 | u64 num_devices = root->fs_info->fs_devices->rw_devices + |
| 3277 | root->fs_info->fs_devices->missing_devices; | 3340 | root->fs_info->fs_devices->missing_devices; |
| 3278 | u64 target; | 3341 | u64 target; |
| 3342 | u64 tmp; | ||
| 3279 | 3343 | ||
| 3280 | /* | 3344 | /* |
| 3281 | * see if restripe for this chunk_type is in progress, if so | 3345 | * see if restripe for this chunk_type is in progress, if so |
| @@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) | |||
| 3292 | } | 3356 | } |
| 3293 | spin_unlock(&root->fs_info->balance_lock); | 3357 | spin_unlock(&root->fs_info->balance_lock); |
| 3294 | 3358 | ||
| 3359 | /* First, mask out the RAID levels which aren't possible */ | ||
| 3295 | if (num_devices == 1) | 3360 | if (num_devices == 1) |
| 3296 | flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); | 3361 | flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | |
| 3362 | BTRFS_BLOCK_GROUP_RAID5); | ||
| 3363 | if (num_devices < 3) | ||
| 3364 | flags &= ~BTRFS_BLOCK_GROUP_RAID6; | ||
| 3297 | if (num_devices < 4) | 3365 | if (num_devices < 4) |
| 3298 | flags &= ~BTRFS_BLOCK_GROUP_RAID10; | 3366 | flags &= ~BTRFS_BLOCK_GROUP_RAID10; |
| 3299 | 3367 | ||
| 3300 | if ((flags & BTRFS_BLOCK_GROUP_DUP) && | 3368 | tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | |
| 3301 | (flags & (BTRFS_BLOCK_GROUP_RAID1 | | 3369 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | |
| 3302 | BTRFS_BLOCK_GROUP_RAID10))) { | 3370 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); |
| 3303 | flags &= ~BTRFS_BLOCK_GROUP_DUP; | 3371 | flags &= ~tmp; |
| 3304 | } | ||
| 3305 | |||
| 3306 | if ((flags & BTRFS_BLOCK_GROUP_RAID1) && | ||
| 3307 | (flags & BTRFS_BLOCK_GROUP_RAID10)) { | ||
| 3308 | flags &= ~BTRFS_BLOCK_GROUP_RAID1; | ||
| 3309 | } | ||
| 3310 | 3372 | ||
| 3311 | if ((flags & BTRFS_BLOCK_GROUP_RAID0) && | 3373 | if (tmp & BTRFS_BLOCK_GROUP_RAID6) |
| 3312 | ((flags & BTRFS_BLOCK_GROUP_RAID1) | | 3374 | tmp = BTRFS_BLOCK_GROUP_RAID6; |
| 3313 | (flags & BTRFS_BLOCK_GROUP_RAID10) | | 3375 | else if (tmp & BTRFS_BLOCK_GROUP_RAID5) |
| 3314 | (flags & BTRFS_BLOCK_GROUP_DUP))) { | 3376 | tmp = BTRFS_BLOCK_GROUP_RAID5; |
| 3315 | flags &= ~BTRFS_BLOCK_GROUP_RAID0; | 3377 | else if (tmp & BTRFS_BLOCK_GROUP_RAID10) |
| 3316 | } | 3378 | tmp = BTRFS_BLOCK_GROUP_RAID10; |
| 3379 | else if (tmp & BTRFS_BLOCK_GROUP_RAID1) | ||
| 3380 | tmp = BTRFS_BLOCK_GROUP_RAID1; | ||
| 3381 | else if (tmp & BTRFS_BLOCK_GROUP_RAID0) | ||
| 3382 | tmp = BTRFS_BLOCK_GROUP_RAID0; | ||
| 3317 | 3383 | ||
| 3318 | return extended_to_chunk(flags); | 3384 | return extended_to_chunk(flags | tmp); |
| 3319 | } | 3385 | } |
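The rewritten reduction above no longer clears conflicting bits pairwise; it collects every redundancy profile into tmp and keeps only the most-preferred one, in the fixed order RAID6 > RAID5 > RAID10 > RAID1 > RAID0 (DUP survives only when nothing else is set). A standalone sketch of the same ladder; the flag bit values are copied from the on-disk format of this era but should be treated as assumptions here:

    #include <stdint.h>
    #include <stdio.h>

    #define RAID0  (1ULL << 3)      /* BTRFS_BLOCK_GROUP_* bit values */
    #define RAID1  (1ULL << 4)
    #define DUP    (1ULL << 5)
    #define RAID10 (1ULL << 6)
    #define RAID5  (1ULL << 7)
    #define RAID6  (1ULL << 8)

    /* keep only the most-preferred profile that survived the
     * num_devices masking -- the same if/else ladder as above */
    static uint64_t pick_profile(uint64_t tmp)
    {
        if (tmp & RAID6)  return RAID6;
        if (tmp & RAID5)  return RAID5;
        if (tmp & RAID10) return RAID10;
        if (tmp & RAID1)  return RAID1;
        if (tmp & RAID0)  return RAID0;
        return tmp & DUP;   /* DUP (or nothing) passes through */
    }

    int main(void)
    {
        /* e.g. a balance left both the RAID1 and RAID5 bits set */
        printf("%#llx\n", (unsigned long long)pick_profile(RAID1 | RAID5));
        return 0;           /* prints 0x80: RAID5 wins           */
    }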
| 3320 | 3386 | ||
| 3321 | static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) | 3387 | static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) |
| 3322 | { | 3388 | { |
| 3323 | if (flags & BTRFS_BLOCK_GROUP_DATA) | 3389 | unsigned seq; |
| 3324 | flags |= root->fs_info->avail_data_alloc_bits; | 3390 | |
| 3325 | else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | 3391 | do { |
| 3326 | flags |= root->fs_info->avail_system_alloc_bits; | 3392 | seq = read_seqbegin(&root->fs_info->profiles_lock); |
| 3327 | else if (flags & BTRFS_BLOCK_GROUP_METADATA) | 3393 | |
| 3328 | flags |= root->fs_info->avail_metadata_alloc_bits; | 3394 | if (flags & BTRFS_BLOCK_GROUP_DATA) |
| 3395 | flags |= root->fs_info->avail_data_alloc_bits; | ||
| 3396 | else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | ||
| 3397 | flags |= root->fs_info->avail_system_alloc_bits; | ||
| 3398 | else if (flags & BTRFS_BLOCK_GROUP_METADATA) | ||
| 3399 | flags |= root->fs_info->avail_metadata_alloc_bits; | ||
| 3400 | } while (read_seqretry(&root->fs_info->profiles_lock, seq)); | ||
| 3329 | 3401 | ||
| 3330 | return btrfs_reduce_alloc_profile(root, flags); | 3402 | return btrfs_reduce_alloc_profile(root, flags); |
| 3331 | } | 3403 | } |
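The avail_*_alloc_bits fields are now published under fs_info->profiles_lock, a seqlock: writers (set_avail_alloc_bits above, clear_avail_alloc_bits later in this diff) bump the sequence around the update, and the reader in get_alloc_profile retries until it sees a stable even sequence. A minimal userspace rendition of the pattern; memory ordering and writer-vs-writer exclusion are simplified here, since the kernel's seqlock_t handles both:

    #include <stdatomic.h>
    #include <stdint.h>

    struct seqlock {
        atomic_uint seq;                 /* odd while a write is in flight */
        uint64_t avail_data_alloc_bits;
    };

    static void writer_update(struct seqlock *sl, uint64_t extra_flags)
    {
        atomic_fetch_add(&sl->seq, 1);   /* write_seqlock(): seq goes odd  */
        sl->avail_data_alloc_bits |= extra_flags;
        atomic_fetch_add(&sl->seq, 1);   /* write_sequnlock(): even again  */
    }

    static uint64_t reader_snapshot(struct seqlock *sl)
    {
        unsigned start;
        uint64_t flags;

        do {
            while ((start = atomic_load(&sl->seq)) & 1)
                ;                        /* writer active: spin            */
            flags = sl->avail_data_alloc_bits;
        } while (atomic_load(&sl->seq) != start);   /* read_seqretry()     */
        return flags;
    }

    int main(void)
    {
        static struct seqlock sl;        /* zero-initialized               */
        writer_update(&sl, 1 << 4);      /* publish e.g. a RAID1 bit       */
        return reader_snapshot(&sl) == (1 << 4) ? 0 : 1;
    }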
| @@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) | |||
| 3333 | u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) | 3405 | u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) |
| 3334 | { | 3406 | { |
| 3335 | u64 flags; | 3407 | u64 flags; |
| 3408 | u64 ret; | ||
| 3336 | 3409 | ||
| 3337 | if (data) | 3410 | if (data) |
| 3338 | flags = BTRFS_BLOCK_GROUP_DATA; | 3411 | flags = BTRFS_BLOCK_GROUP_DATA; |
| @@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) | |||
| 3341 | else | 3414 | else |
| 3342 | flags = BTRFS_BLOCK_GROUP_METADATA; | 3415 | flags = BTRFS_BLOCK_GROUP_METADATA; |
| 3343 | 3416 | ||
| 3344 | return get_alloc_profile(root, flags); | 3417 | ret = get_alloc_profile(root, flags); |
| 3418 | return ret; | ||
| 3345 | } | 3419 | } |
| 3346 | 3420 | ||
| 3347 | /* | 3421 | /* |
| @@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) | |||
| 3357 | int ret = 0, committed = 0, alloc_chunk = 1; | 3431 | int ret = 0, committed = 0, alloc_chunk = 1; |
| 3358 | 3432 | ||
| 3359 | /* make sure bytes are sectorsize aligned */ | 3433 | /* make sure bytes are sectorsize aligned */ |
| 3360 | bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); | 3434 | bytes = ALIGN(bytes, root->sectorsize); |
| 3361 | 3435 | ||
| 3362 | if (root == root->fs_info->tree_root || | 3436 | if (root == root->fs_info->tree_root || |
| 3363 | BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { | 3437 | BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { |
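Several hunks in this commit replace open-coded round-up masks with ALIGN() or round_down(); for a power-of-two sectorsize the results are bit-for-bit identical. A quick check (the macro body is reproduced here for the sketch and assumes a power-of-two alignment):

    #include <assert.h>
    #include <stdint.h>

    #define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

    int main(void)
    {
        uint64_t sectorsize = 4096;

        assert(ALIGN(1, sectorsize) == 4096);     /* round up        */
        assert(ALIGN(4096, sectorsize) == 4096);  /* already aligned */
        assert(ALIGN(4097, sectorsize) == 8192);
        /* identical to the expression it replaces above */
        assert(ALIGN(4097, sectorsize) ==
               ((4097 + sectorsize - 1) & ~(sectorsize - 1)));
        return 0;
    }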
| @@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) | |||
| 3452 | struct btrfs_space_info *data_sinfo; | 3526 | struct btrfs_space_info *data_sinfo; |
| 3453 | 3527 | ||
| 3454 | /* make sure bytes are sectorsize aligned */ | 3528 | /* make sure bytes are sectorsize aligned */ |
| 3455 | bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); | 3529 | bytes = ALIGN(bytes, root->sectorsize); |
| 3456 | 3530 | ||
| 3457 | data_sinfo = root->fs_info->data_sinfo; | 3531 | data_sinfo = root->fs_info->data_sinfo; |
| 3458 | spin_lock(&data_sinfo->lock); | 3532 | spin_lock(&data_sinfo->lock); |
| @@ -3516,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) | |||
| 3516 | { | 3590 | { |
| 3517 | u64 num_dev; | 3591 | u64 num_dev; |
| 3518 | 3592 | ||
| 3519 | if (type & BTRFS_BLOCK_GROUP_RAID10 || | 3593 | if (type & (BTRFS_BLOCK_GROUP_RAID10 | |
| 3520 | type & BTRFS_BLOCK_GROUP_RAID0) | 3594 | BTRFS_BLOCK_GROUP_RAID0 | |
| 3595 | BTRFS_BLOCK_GROUP_RAID5 | | ||
| 3596 | BTRFS_BLOCK_GROUP_RAID6)) | ||
| 3521 | num_dev = root->fs_info->fs_devices->rw_devices; | 3597 | num_dev = root->fs_info->fs_devices->rw_devices; |
| 3522 | else if (type & BTRFS_BLOCK_GROUP_RAID1) | 3598 | else if (type & BTRFS_BLOCK_GROUP_RAID1) |
| 3523 | num_dev = 2; | 3599 | num_dev = 2; |
| @@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, | |||
| 3564 | int wait_for_alloc = 0; | 3640 | int wait_for_alloc = 0; |
| 3565 | int ret = 0; | 3641 | int ret = 0; |
| 3566 | 3642 | ||
| 3643 | /* Don't re-enter if we're already allocating a chunk */ | ||
| 3644 | if (trans->allocating_chunk) | ||
| 3645 | return -ENOSPC; | ||
| 3646 | |||
| 3567 | space_info = __find_space_info(extent_root->fs_info, flags); | 3647 | space_info = __find_space_info(extent_root->fs_info, flags); |
| 3568 | if (!space_info) { | 3648 | if (!space_info) { |
| 3569 | ret = update_space_info(extent_root->fs_info, flags, | 3649 | ret = update_space_info(extent_root->fs_info, flags, |
| @@ -3606,6 +3686,8 @@ again: | |||
| 3606 | goto again; | 3686 | goto again; |
| 3607 | } | 3687 | } |
| 3608 | 3688 | ||
| 3689 | trans->allocating_chunk = true; | ||
| 3690 | |||
| 3609 | /* | 3691 | /* |
| 3610 | * If we have mixed data/metadata chunks we want to make sure we keep | 3692 | * If we have mixed data/metadata chunks we want to make sure we keep |
| 3611 | * allocating mixed chunks instead of individual chunks. | 3693 | * allocating mixed chunks instead of individual chunks. |
| @@ -3632,19 +3714,20 @@ again: | |||
| 3632 | check_system_chunk(trans, extent_root, flags); | 3714 | check_system_chunk(trans, extent_root, flags); |
| 3633 | 3715 | ||
| 3634 | ret = btrfs_alloc_chunk(trans, extent_root, flags); | 3716 | ret = btrfs_alloc_chunk(trans, extent_root, flags); |
| 3635 | if (ret < 0 && ret != -ENOSPC) | 3717 | trans->allocating_chunk = false; |
| 3636 | goto out; | ||
| 3637 | 3718 | ||
| 3638 | spin_lock(&space_info->lock); | 3719 | spin_lock(&space_info->lock); |
| 3720 | if (ret < 0 && ret != -ENOSPC) | ||
| 3721 | goto out; | ||
| 3639 | if (ret) | 3722 | if (ret) |
| 3640 | space_info->full = 1; | 3723 | space_info->full = 1; |
| 3641 | else | 3724 | else |
| 3642 | ret = 1; | 3725 | ret = 1; |
| 3643 | 3726 | ||
| 3644 | space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; | 3727 | space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; |
| 3728 | out: | ||
| 3645 | space_info->chunk_alloc = 0; | 3729 | space_info->chunk_alloc = 0; |
| 3646 | spin_unlock(&space_info->lock); | 3730 | spin_unlock(&space_info->lock); |
| 3647 | out: | ||
| 3648 | mutex_unlock(&fs_info->chunk_mutex); | 3731 | mutex_unlock(&fs_info->chunk_mutex); |
| 3649 | return ret; | 3732 | return ret; |
| 3650 | } | 3733 | } |
| @@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root, | |||
| 3653 | struct btrfs_space_info *space_info, u64 bytes, | 3736 | struct btrfs_space_info *space_info, u64 bytes, |
| 3654 | enum btrfs_reserve_flush_enum flush) | 3737 | enum btrfs_reserve_flush_enum flush) |
| 3655 | { | 3738 | { |
| 3739 | struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; | ||
| 3656 | u64 profile = btrfs_get_alloc_profile(root, 0); | 3740 | u64 profile = btrfs_get_alloc_profile(root, 0); |
| 3741 | u64 rsv_size = 0; | ||
| 3657 | u64 avail; | 3742 | u64 avail; |
| 3658 | u64 used; | 3743 | u64 used; |
| 3744 | u64 to_add; | ||
| 3659 | 3745 | ||
| 3660 | used = space_info->bytes_used + space_info->bytes_reserved + | 3746 | used = space_info->bytes_used + space_info->bytes_reserved + |
| 3661 | space_info->bytes_pinned + space_info->bytes_readonly + | 3747 | space_info->bytes_pinned + space_info->bytes_readonly; |
| 3662 | space_info->bytes_may_use; | 3748 | |
| 3749 | spin_lock(&global_rsv->lock); | ||
| 3750 | rsv_size = global_rsv->size; | ||
| 3751 | spin_unlock(&global_rsv->lock); | ||
| 3752 | |||
| 3753 | /* | ||
| 3754 | * We only want to allow over committing if we have lots of actual space | ||
| 3755 | * free, but if we don't have enough space to handle the global reserve | ||
| 3756 | * space then we could end up having a real enospc problem when trying | ||
| 3757 | * to allocate a chunk or some other such important allocation. | ||
| 3758 | */ | ||
| 3759 | rsv_size <<= 1; | ||
| 3760 | if (used + rsv_size >= space_info->total_bytes) | ||
| 3761 | return 0; | ||
| 3762 | |||
| 3763 | used += space_info->bytes_may_use; | ||
| 3663 | 3764 | ||
| 3664 | spin_lock(&root->fs_info->free_chunk_lock); | 3765 | spin_lock(&root->fs_info->free_chunk_lock); |
| 3665 | avail = root->fs_info->free_chunk_space; | 3766 | avail = root->fs_info->free_chunk_space; |
| @@ -3667,28 +3768,60 @@ static int can_overcommit(struct btrfs_root *root, | |||
| 3667 | 3768 | ||
| 3668 | /* | 3769 | /* |
| 3669 | * If we have dup, raid1 or raid10 then only half of the free | 3770 | * If we have dup, raid1 or raid10 then only half of the free |
| 3670 | * space is actually useable. | 3771 | * space is actually useable. For raid56, the space info used |
| 3772 | * doesn't include the parity drive, so we don't have to | ||
| 3773 | * change the math | ||
| 3671 | */ | 3774 | */ |
| 3672 | if (profile & (BTRFS_BLOCK_GROUP_DUP | | 3775 | if (profile & (BTRFS_BLOCK_GROUP_DUP | |
| 3673 | BTRFS_BLOCK_GROUP_RAID1 | | 3776 | BTRFS_BLOCK_GROUP_RAID1 | |
| 3674 | BTRFS_BLOCK_GROUP_RAID10)) | 3777 | BTRFS_BLOCK_GROUP_RAID10)) |
| 3675 | avail >>= 1; | 3778 | avail >>= 1; |
| 3676 | 3779 | ||
| 3780 | to_add = space_info->total_bytes; | ||
| 3781 | |||
| 3677 | /* | 3782 | /* |
| 3678 | * If we aren't flushing all things, let us overcommit up to | 3783 | * If we aren't flushing all things, let us overcommit up to |
| 3679 | * 1/2th of the space. If we can flush, don't let us overcommit | 3784 | * 1/2th of the space. If we can flush, don't let us overcommit |
| 3680 | * too much, let it overcommit up to 1/8 of the space. | 3785 | * too much, let it overcommit up to 1/8 of the space. |
| 3681 | */ | 3786 | */ |
| 3682 | if (flush == BTRFS_RESERVE_FLUSH_ALL) | 3787 | if (flush == BTRFS_RESERVE_FLUSH_ALL) |
| 3683 | avail >>= 3; | 3788 | to_add >>= 3; |
| 3684 | else | 3789 | else |
| 3685 | avail >>= 1; | 3790 | to_add >>= 1; |
| 3686 | 3791 | ||
| 3687 | if (used + bytes < space_info->total_bytes + avail) | 3792 | /* |
| 3793 | * Limit the overcommit to the amount of free space we could possibly | ||
| 3794 | * allocate for chunks. | ||
| 3795 | */ | ||
| 3796 | to_add = min(avail, to_add); | ||
| 3797 | |||
| 3798 | if (used + bytes < space_info->total_bytes + to_add) | ||
| 3688 | return 1; | 3799 | return 1; |
| 3689 | return 0; | 3800 | return 0; |
| 3690 | } | 3801 | } |
| 3691 | 3802 | ||
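The reworked can_overcommit() adds two separate gates: refuse outright once used space plus twice the global reserve reaches total_bytes, and otherwise cap the overcommit allowance at min(free chunk space after the RAID halving, total/8 for FLUSH_ALL or total/2 otherwise). A worked numeric example with made-up figures (the bytes_may_use split between the two gates is glossed over):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t total    = 8ULL << 30;    /* 8 GiB metadata space    */
        uint64_t used     = 5ULL << 30;    /* used+reserved+pinned+ro */
        uint64_t rsv_size = 512ULL << 20;  /* global block reserve    */
        uint64_t avail    = 3ULL << 30;    /* free chunk space        */
        uint64_t bytes    = 256ULL << 20;  /* this reservation        */

        /* gate 1: keep headroom for twice the global reserve */
        if (used + (rsv_size << 1) >= total) {
            puts("no overcommit: reserve headroom gone");
            return 0;
        }

        /* gate 2: overcommit at most total/8 (FLUSH_ALL), and never
         * more than we could still turn into real chunks */
        uint64_t to_add = total >> 3;              /* 1 GiB */
        if (avail < to_add)
            to_add = avail;

        puts(used + bytes < total + to_add ? "overcommit ok"
                                           : "no overcommit");
        return 0;
    }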
| 3803 | void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, | ||
| 3804 | unsigned long nr_pages) | ||
| 3805 | { | ||
| 3806 | struct super_block *sb = root->fs_info->sb; | ||
| 3807 | int started; | ||
| 3808 | |||
| 3809 | /* If we can not start writeback, just sync all the delalloc file. */ | ||
| 3810 | started = try_to_writeback_inodes_sb_nr(sb, nr_pages, | ||
| 3811 | WB_REASON_FS_FREE_SPACE); | ||
| 3812 | if (!started) { | ||
| 3813 | /* | ||
| 3814 | * We needn't worry the filesystem going from r/w to r/o though | ||
| 3815 | * we don't acquire ->s_umount mutex, because the filesystem | ||
| 3816 | * should guarantee the delalloc inodes list be empty after | ||
| 3817 | * the filesystem is readonly(all dirty pages are written to | ||
| 3818 | * the disk). | ||
| 3819 | */ | ||
| 3820 | btrfs_start_delalloc_inodes(root, 0); | ||
| 3821 | btrfs_wait_ordered_extents(root, 0); | ||
| 3822 | } | ||
| 3823 | } | ||
| 3824 | |||
| 3692 | /* | 3825 | /* |
| 3693 | * shrink metadata reservation for delalloc | 3826 | * shrink metadata reservation for delalloc |
| 3694 | */ | 3827 | */ |
| @@ -3710,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, | |||
| 3710 | space_info = block_rsv->space_info; | 3843 | space_info = block_rsv->space_info; |
| 3711 | 3844 | ||
| 3712 | smp_mb(); | 3845 | smp_mb(); |
| 3713 | delalloc_bytes = root->fs_info->delalloc_bytes; | 3846 | delalloc_bytes = percpu_counter_sum_positive( |
| 3847 | &root->fs_info->delalloc_bytes); | ||
| 3714 | if (delalloc_bytes == 0) { | 3848 | if (delalloc_bytes == 0) { |
| 3715 | if (trans) | 3849 | if (trans) |
| 3716 | return; | 3850 | return; |
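fs_info->delalloc_bytes has become a struct percpu_counter elsewhere in this commit, so shrink_delalloc() now uses percpu_counter_sum_positive() to fold the per-CPU deltas into one non-negative total; reading only the shared count would miss bytes still batched on other CPUs. A toy model of what the summed read does (the kernel's lib/percpu_counter.c adds locking and batching thresholds):

    #include <stdint.h>
    #include <stdio.h>

    #define NR_CPUS 4

    struct pcpu_counter {
        int64_t count;              /* global part, updated in batches */
        int32_t pcpu[NR_CPUS];      /* per-cpu deltas not yet folded   */
    };

    static int64_t sum_positive(const struct pcpu_counter *c)
    {
        int64_t ret = c->count;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            ret += c->pcpu[cpu];
        return ret < 0 ? 0 : ret;   /* the _positive clamp             */
    }

    int main(void)
    {
        struct pcpu_counter delalloc = {
            .count = 1 << 20,                  /* 1 MiB already folded */
            .pcpu  = { 4096, -8192, 0, 512 }
        };
        /* exact total, including deltas a cheap read would miss */
        printf("%lld\n", (long long)sum_positive(&delalloc));
        return 0;
    }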
| @@ -3721,10 +3855,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, | |||
| 3721 | while (delalloc_bytes && loops < 3) { | 3855 | while (delalloc_bytes && loops < 3) { |
| 3722 | max_reclaim = min(delalloc_bytes, to_reclaim); | 3856 | max_reclaim = min(delalloc_bytes, to_reclaim); |
| 3723 | nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; | 3857 | nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; |
| 3724 | try_to_writeback_inodes_sb_nr(root->fs_info->sb, | 3858 | btrfs_writeback_inodes_sb_nr(root, nr_pages); |
| 3725 | nr_pages, | ||
| 3726 | WB_REASON_FS_FREE_SPACE); | ||
| 3727 | |||
| 3728 | /* | 3859 | /* |
| 3729 | * We need to wait for the async pages to actually start before | 3860 | * We need to wait for the async pages to actually start before |
| 3730 | * we do anything. | 3861 | * we do anything. |
| @@ -3752,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, | |||
| 3752 | break; | 3883 | break; |
| 3753 | } | 3884 | } |
| 3754 | smp_mb(); | 3885 | smp_mb(); |
| 3755 | delalloc_bytes = root->fs_info->delalloc_bytes; | 3886 | delalloc_bytes = percpu_counter_sum_positive( |
| 3887 | &root->fs_info->delalloc_bytes); | ||
| 3756 | } | 3888 | } |
| 3757 | } | 3889 | } |
| 3758 | 3890 | ||
| @@ -4016,6 +4148,15 @@ again: | |||
| 4016 | goto again; | 4148 | goto again; |
| 4017 | 4149 | ||
| 4018 | out: | 4150 | out: |
| 4151 | if (ret == -ENOSPC && | ||
| 4152 | unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { | ||
| 4153 | struct btrfs_block_rsv *global_rsv = | ||
| 4154 | &root->fs_info->global_block_rsv; | ||
| 4155 | |||
| 4156 | if (block_rsv != global_rsv && | ||
| 4157 | !block_rsv_use_bytes(global_rsv, orig_bytes)) | ||
| 4158 | ret = 0; | ||
| 4159 | } | ||
| 4019 | if (flushing) { | 4160 | if (flushing) { |
| 4020 | spin_lock(&space_info->lock); | 4161 | spin_lock(&space_info->lock); |
| 4021 | space_info->flush = 0; | 4162 | space_info->flush = 0; |
| @@ -4402,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode) | |||
| 4402 | btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); | 4543 | btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); |
| 4403 | } | 4544 | } |
| 4404 | 4545 | ||
| 4405 | int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, | 4546 | /* |
| 4406 | struct btrfs_pending_snapshot *pending) | 4547 | * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation |
| 4548 | * root: the root of the parent directory | ||
| 4549 | * rsv: block reservation | ||
| 4550 | * items: the number of items that we need do reservation | ||
| 4551 | * qgroup_reserved: used to return the reserved size in qgroup | ||
| 4552 | * | ||
| 4553 | * This function is used to reserve the space for snapshot/subvolume | ||
| 4554 | * creation and deletion. Those operations are different with the | ||
| 4555 | * common file/directory operations, they change two fs/file trees | ||
| 4556 | * and root tree, the number of items that the qgroup reserves is | ||
| 4557 | * different with the free space reservation. So we can not use | ||
| 4558 | * the space reseravtion mechanism in start_transaction(). | ||
| 4559 | */ | ||
| 4560 | int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, | ||
| 4561 | struct btrfs_block_rsv *rsv, | ||
| 4562 | int items, | ||
| 4563 | u64 *qgroup_reserved) | ||
| 4407 | { | 4564 | { |
| 4408 | struct btrfs_root *root = pending->root; | 4565 | u64 num_bytes; |
| 4409 | struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); | 4566 | int ret; |
| 4410 | struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; | 4567 | |
| 4411 | /* | 4568 | if (root->fs_info->quota_enabled) { |
| 4412 | * two for root back/forward refs, two for directory entries, | 4569 | /* One for parent inode, two for dir entries */ |
| 4413 | * one for root of the snapshot and one for parent inode. | 4570 | num_bytes = 3 * root->leafsize; |
| 4414 | */ | 4571 | ret = btrfs_qgroup_reserve(root, num_bytes); |
| 4415 | u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); | 4572 | if (ret) |
| 4416 | dst_rsv->space_info = src_rsv->space_info; | 4573 | return ret; |
| 4417 | return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); | 4574 | } else { |
| 4575 | num_bytes = 0; | ||
| 4576 | } | ||
| 4577 | |||
| 4578 | *qgroup_reserved = num_bytes; | ||
| 4579 | |||
| 4580 | num_bytes = btrfs_calc_trans_metadata_size(root, items); | ||
| 4581 | rsv->space_info = __find_space_info(root->fs_info, | ||
| 4582 | BTRFS_BLOCK_GROUP_METADATA); | ||
| 4583 | ret = btrfs_block_rsv_add(root, rsv, num_bytes, | ||
| 4584 | BTRFS_RESERVE_FLUSH_ALL); | ||
| 4585 | if (ret) { | ||
| 4586 | if (*qgroup_reserved) | ||
| 4587 | btrfs_qgroup_free(root, *qgroup_reserved); | ||
| 4588 | } | ||
| 4589 | |||
| 4590 | return ret; | ||
| 4591 | } | ||
| 4592 | |||
| 4593 | void btrfs_subvolume_release_metadata(struct btrfs_root *root, | ||
| 4594 | struct btrfs_block_rsv *rsv, | ||
| 4595 | u64 qgroup_reserved) | ||
| 4596 | { | ||
| 4597 | btrfs_block_rsv_release(root, rsv, (u64)-1); | ||
| 4598 | if (qgroup_reserved) | ||
| 4599 | btrfs_qgroup_free(root, qgroup_reserved); | ||
| 4418 | } | 4600 | } |
| 4419 | 4601 | ||
| 4420 | /** | 4602 | /** |
| @@ -4522,6 +4704,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
| 4522 | enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; | 4704 | enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; |
| 4523 | int ret = 0; | 4705 | int ret = 0; |
| 4524 | bool delalloc_lock = true; | 4706 | bool delalloc_lock = true; |
| 4707 | u64 to_free = 0; | ||
| 4708 | unsigned dropped; | ||
| 4525 | 4709 | ||
| 4526 | /* If we are a free space inode we need to not flush since we will be in | 4710 | /* If we are a free space inode we need to not flush since we will be in |
| 4527 | * the middle of a transaction commit. We also don't need the delalloc | 4711 | * the middle of a transaction commit. We also don't need the delalloc |
| @@ -4565,54 +4749,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
| 4565 | csum_bytes = BTRFS_I(inode)->csum_bytes; | 4749 | csum_bytes = BTRFS_I(inode)->csum_bytes; |
| 4566 | spin_unlock(&BTRFS_I(inode)->lock); | 4750 | spin_unlock(&BTRFS_I(inode)->lock); |
| 4567 | 4751 | ||
| 4568 | if (root->fs_info->quota_enabled) | 4752 | if (root->fs_info->quota_enabled) { |
| 4569 | ret = btrfs_qgroup_reserve(root, num_bytes + | 4753 | ret = btrfs_qgroup_reserve(root, num_bytes + |
| 4570 | nr_extents * root->leafsize); | 4754 | nr_extents * root->leafsize); |
| 4755 | if (ret) | ||
| 4756 | goto out_fail; | ||
| 4757 | } | ||
| 4571 | 4758 | ||
| 4572 | /* | 4759 | ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); |
| 4573 | * ret != 0 here means the qgroup reservation failed, we go straight to | 4760 | if (unlikely(ret)) { |
| 4574 | * the shared error handling then. | 4761 | if (root->fs_info->quota_enabled) |
| 4575 | */ | ||
| 4576 | if (ret == 0) | ||
| 4577 | ret = reserve_metadata_bytes(root, block_rsv, | ||
| 4578 | to_reserve, flush); | ||
| 4579 | |||
| 4580 | if (ret) { | ||
| 4581 | u64 to_free = 0; | ||
| 4582 | unsigned dropped; | ||
| 4583 | |||
| 4584 | spin_lock(&BTRFS_I(inode)->lock); | ||
| 4585 | dropped = drop_outstanding_extent(inode); | ||
| 4586 | /* | ||
| 4587 | * If the inodes csum_bytes is the same as the original | ||
| 4588 | * csum_bytes then we know we haven't raced with any free()ers | ||
| 4589 | * so we can just reduce our inodes csum bytes and carry on. | ||
| 4590 | * Otherwise we have to do the normal free thing to account for | ||
| 4591 | * the case that the free side didn't free up its reserve | ||
| 4592 | * because of this outstanding reservation. | ||
| 4593 | */ | ||
| 4594 | if (BTRFS_I(inode)->csum_bytes == csum_bytes) | ||
| 4595 | calc_csum_metadata_size(inode, num_bytes, 0); | ||
| 4596 | else | ||
| 4597 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
| 4598 | spin_unlock(&BTRFS_I(inode)->lock); | ||
| 4599 | if (dropped) | ||
| 4600 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | ||
| 4601 | |||
| 4602 | if (to_free) { | ||
| 4603 | btrfs_block_rsv_release(root, block_rsv, to_free); | ||
| 4604 | trace_btrfs_space_reservation(root->fs_info, | ||
| 4605 | "delalloc", | ||
| 4606 | btrfs_ino(inode), | ||
| 4607 | to_free, 0); | ||
| 4608 | } | ||
| 4609 | if (root->fs_info->quota_enabled) { | ||
| 4610 | btrfs_qgroup_free(root, num_bytes + | 4762 | btrfs_qgroup_free(root, num_bytes + |
| 4611 | nr_extents * root->leafsize); | 4763 | nr_extents * root->leafsize); |
| 4612 | } | 4764 | goto out_fail; |
| 4613 | if (delalloc_lock) | ||
| 4614 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | ||
| 4615 | return ret; | ||
| 4616 | } | 4765 | } |
| 4617 | 4766 | ||
| 4618 | spin_lock(&BTRFS_I(inode)->lock); | 4767 | spin_lock(&BTRFS_I(inode)->lock); |
| @@ -4633,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
| 4633 | block_rsv_add_bytes(block_rsv, to_reserve, 1); | 4782 | block_rsv_add_bytes(block_rsv, to_reserve, 1); |
| 4634 | 4783 | ||
| 4635 | return 0; | 4784 | return 0; |
| 4785 | |||
| 4786 | out_fail: | ||
| 4787 | spin_lock(&BTRFS_I(inode)->lock); | ||
| 4788 | dropped = drop_outstanding_extent(inode); | ||
| 4789 | /* | ||
| 4790 | * If the inodes csum_bytes is the same as the original | ||
| 4791 | * csum_bytes then we know we haven't raced with any free()ers | ||
| 4792 | * so we can just reduce our inodes csum bytes and carry on. | ||
| 4793 | * Otherwise we have to do the normal free thing to account for | ||
| 4794 | * the case that the free side didn't free up its reserve | ||
| 4795 | * because of this outstanding reservation. | ||
| 4796 | */ | ||
| 4797 | if (BTRFS_I(inode)->csum_bytes == csum_bytes) | ||
| 4798 | calc_csum_metadata_size(inode, num_bytes, 0); | ||
| 4799 | else | ||
| 4800 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
| 4801 | spin_unlock(&BTRFS_I(inode)->lock); | ||
| 4802 | if (dropped) | ||
| 4803 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | ||
| 4804 | |||
| 4805 | if (to_free) { | ||
| 4806 | btrfs_block_rsv_release(root, block_rsv, to_free); | ||
| 4807 | trace_btrfs_space_reservation(root->fs_info, "delalloc", | ||
| 4808 | btrfs_ino(inode), to_free, 0); | ||
| 4809 | } | ||
| 4810 | if (delalloc_lock) | ||
| 4811 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | ||
| 4812 | return ret; | ||
| 4636 | } | 4813 | } |
| 4637 | 4814 | ||
| 4638 | /** | 4815 | /** |
| @@ -4654,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) | |||
| 4654 | spin_lock(&BTRFS_I(inode)->lock); | 4831 | spin_lock(&BTRFS_I(inode)->lock); |
| 4655 | dropped = drop_outstanding_extent(inode); | 4832 | dropped = drop_outstanding_extent(inode); |
| 4656 | 4833 | ||
| 4657 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | 4834 | if (num_bytes) |
| 4835 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
| 4658 | spin_unlock(&BTRFS_I(inode)->lock); | 4836 | spin_unlock(&BTRFS_I(inode)->lock); |
| 4659 | if (dropped > 0) | 4837 | if (dropped > 0) |
| 4660 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | 4838 | to_free += btrfs_calc_trans_metadata_size(root, dropped); |
| @@ -4721,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) | |||
| 4721 | btrfs_free_reserved_data_space(inode, num_bytes); | 4899 | btrfs_free_reserved_data_space(inode, num_bytes); |
| 4722 | } | 4900 | } |
| 4723 | 4901 | ||
| 4724 | static int update_block_group(struct btrfs_trans_handle *trans, | 4902 | static int update_block_group(struct btrfs_root *root, |
| 4725 | struct btrfs_root *root, | ||
| 4726 | u64 bytenr, u64 num_bytes, int alloc) | 4903 | u64 bytenr, u64 num_bytes, int alloc) |
| 4727 | { | 4904 | { |
| 4728 | struct btrfs_block_group_cache *cache = NULL; | 4905 | struct btrfs_block_group_cache *cache = NULL; |
| @@ -4759,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, | |||
| 4759 | * space back to the block group, otherwise we will leak space. | 4936 | * space back to the block group, otherwise we will leak space. |
| 4760 | */ | 4937 | */ |
| 4761 | if (!alloc && cache->cached == BTRFS_CACHE_NO) | 4938 | if (!alloc && cache->cached == BTRFS_CACHE_NO) |
| 4762 | cache_block_group(cache, trans, NULL, 1); | 4939 | cache_block_group(cache, 1); |
| 4763 | 4940 | ||
| 4764 | byte_in_group = bytenr - cache->key.objectid; | 4941 | byte_in_group = bytenr - cache->key.objectid; |
| 4765 | WARN_ON(byte_in_group > cache->key.offset); | 4942 | WARN_ON(byte_in_group > cache->key.offset); |
| @@ -4809,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) | |||
| 4809 | struct btrfs_block_group_cache *cache; | 4986 | struct btrfs_block_group_cache *cache; |
| 4810 | u64 bytenr; | 4987 | u64 bytenr; |
| 4811 | 4988 | ||
| 4989 | spin_lock(&root->fs_info->block_group_cache_lock); | ||
| 4990 | bytenr = root->fs_info->first_logical_byte; | ||
| 4991 | spin_unlock(&root->fs_info->block_group_cache_lock); | ||
| 4992 | |||
| 4993 | if (bytenr < (u64)-1) | ||
| 4994 | return bytenr; | ||
| 4995 | |||
| 4812 | cache = btrfs_lookup_first_block_group(root->fs_info, search_start); | 4996 | cache = btrfs_lookup_first_block_group(root->fs_info, search_start); |
| 4813 | if (!cache) | 4997 | if (!cache) |
| 4814 | return 0; | 4998 | return 0; |
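first_logical_byte() now consults a cached value, fs_info->first_logical_byte, with (u64)-1 serving as the "not cached" sentinel; the block-group removal hunk later in this diff resets it to the sentinel when the cached group goes away. The shape of the fast path, in an illustrative standalone form:

    #include <stdint.h>

    #define UNCACHED ((uint64_t)-1)

    static uint64_t cached_first_byte = UNCACHED;

    static uint64_t slow_rbtree_lookup(void)  /* stands in for the old walk */
    {
        return 0;
    }

    static uint64_t first_logical_byte(void)
    {
        /* btrfs reads the cache under block_group_cache_lock */
        uint64_t bytenr = cached_first_byte;

        if (bytenr < UNCACHED)          /* any value but the sentinel */
            return bytenr;
        return slow_rbtree_lookup();    /* fall back to the rbtree    */
    }

    int main(void)
    {
        cached_first_byte = 1024 * 1024;   /* lowest block group start */
        return first_logical_byte() == 1024 * 1024 ? 0 : 1;
    }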
| @@ -4859,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root, | |||
| 4859 | /* | 5043 | /* |
| 4860 | * this function must be called within transaction | 5044 | * this function must be called within transaction |
| 4861 | */ | 5045 | */ |
| 4862 | int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, | 5046 | int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, |
| 4863 | struct btrfs_root *root, | ||
| 4864 | u64 bytenr, u64 num_bytes) | 5047 | u64 bytenr, u64 num_bytes) |
| 4865 | { | 5048 | { |
| 4866 | struct btrfs_block_group_cache *cache; | 5049 | struct btrfs_block_group_cache *cache; |
| @@ -4874,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, | |||
| 4874 | * to one because the slow code to read in the free extents does check | 5057 | * to one because the slow code to read in the free extents does check |
| 4875 | * the pinned extents. | 5058 | * the pinned extents. |
| 4876 | */ | 5059 | */ |
| 4877 | cache_block_group(cache, trans, root, 1); | 5060 | cache_block_group(cache, 1); |
| 4878 | 5061 | ||
| 4879 | pin_down_extent(root, cache, bytenr, num_bytes, 0); | 5062 | pin_down_extent(root, cache, bytenr, num_bytes, 0); |
| 4880 | 5063 | ||
| @@ -5271,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, | |||
| 5271 | } | 5454 | } |
| 5272 | } | 5455 | } |
| 5273 | 5456 | ||
| 5274 | ret = update_block_group(trans, root, bytenr, num_bytes, 0); | 5457 | ret = update_block_group(root, bytenr, num_bytes, 0); |
| 5275 | if (ret) { | 5458 | if (ret) { |
| 5276 | btrfs_abort_transaction(trans, extent_root, ret); | 5459 | btrfs_abort_transaction(trans, extent_root, ret); |
| 5277 | goto out; | 5460 | goto out; |
| @@ -5316,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, | |||
| 5316 | if (head->extent_op) { | 5499 | if (head->extent_op) { |
| 5317 | if (!head->must_insert_reserved) | 5500 | if (!head->must_insert_reserved) |
| 5318 | goto out; | 5501 | goto out; |
| 5319 | kfree(head->extent_op); | 5502 | btrfs_free_delayed_extent_op(head->extent_op); |
| 5320 | head->extent_op = NULL; | 5503 | head->extent_op = NULL; |
| 5321 | } | 5504 | } |
| 5322 | 5505 | ||
| @@ -5439,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
| 5439 | return ret; | 5622 | return ret; |
| 5440 | } | 5623 | } |
| 5441 | 5624 | ||
| 5442 | static u64 stripe_align(struct btrfs_root *root, u64 val) | 5625 | static u64 stripe_align(struct btrfs_root *root, |
| 5626 | struct btrfs_block_group_cache *cache, | ||
| 5627 | u64 val, u64 num_bytes) | ||
| 5443 | { | 5628 | { |
| 5444 | u64 mask = ((u64)root->stripesize - 1); | 5629 | u64 ret = ALIGN(val, root->stripesize); |
| 5445 | u64 ret = (val + mask) & ~mask; | ||
| 5446 | return ret; | 5630 | return ret; |
| 5447 | } | 5631 | } |
| 5448 | 5632 | ||
| @@ -5462,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, | |||
| 5462 | u64 num_bytes) | 5646 | u64 num_bytes) |
| 5463 | { | 5647 | { |
| 5464 | struct btrfs_caching_control *caching_ctl; | 5648 | struct btrfs_caching_control *caching_ctl; |
| 5465 | DEFINE_WAIT(wait); | ||
| 5466 | 5649 | ||
| 5467 | caching_ctl = get_caching_control(cache); | 5650 | caching_ctl = get_caching_control(cache); |
| 5468 | if (!caching_ctl) | 5651 | if (!caching_ctl) |
| @@ -5479,7 +5662,6 @@ static noinline int | |||
| 5479 | wait_block_group_cache_done(struct btrfs_block_group_cache *cache) | 5662 | wait_block_group_cache_done(struct btrfs_block_group_cache *cache) |
| 5480 | { | 5663 | { |
| 5481 | struct btrfs_caching_control *caching_ctl; | 5664 | struct btrfs_caching_control *caching_ctl; |
| 5482 | DEFINE_WAIT(wait); | ||
| 5483 | 5665 | ||
| 5484 | caching_ctl = get_caching_control(cache); | 5666 | caching_ctl = get_caching_control(cache); |
| 5485 | if (!caching_ctl) | 5667 | if (!caching_ctl) |
| @@ -5493,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) | |||
| 5493 | 5675 | ||
| 5494 | int __get_raid_index(u64 flags) | 5676 | int __get_raid_index(u64 flags) |
| 5495 | { | 5677 | { |
| 5496 | int index; | ||
| 5497 | |||
| 5498 | if (flags & BTRFS_BLOCK_GROUP_RAID10) | 5678 | if (flags & BTRFS_BLOCK_GROUP_RAID10) |
| 5499 | index = 0; | 5679 | return BTRFS_RAID_RAID10; |
| 5500 | else if (flags & BTRFS_BLOCK_GROUP_RAID1) | 5680 | else if (flags & BTRFS_BLOCK_GROUP_RAID1) |
| 5501 | index = 1; | 5681 | return BTRFS_RAID_RAID1; |
| 5502 | else if (flags & BTRFS_BLOCK_GROUP_DUP) | 5682 | else if (flags & BTRFS_BLOCK_GROUP_DUP) |
| 5503 | index = 2; | 5683 | return BTRFS_RAID_DUP; |
| 5504 | else if (flags & BTRFS_BLOCK_GROUP_RAID0) | 5684 | else if (flags & BTRFS_BLOCK_GROUP_RAID0) |
| 5505 | index = 3; | 5685 | return BTRFS_RAID_RAID0; |
| 5506 | else | 5686 | else if (flags & BTRFS_BLOCK_GROUP_RAID5) |
| 5507 | index = 4; | 5687 | return BTRFS_RAID_RAID5; |
| 5688 | else if (flags & BTRFS_BLOCK_GROUP_RAID6) | ||
| 5689 | return BTRFS_RAID_RAID6; | ||
| 5508 | 5690 | ||
| 5509 | return index; | 5691 | return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ |
| 5510 | } | 5692 | } |
| 5511 | 5693 | ||
| 5512 | static int get_block_group_index(struct btrfs_block_group_cache *cache) | 5694 | static int get_block_group_index(struct btrfs_block_group_cache *cache) |
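__get_raid_index() now returns named BTRFS_RAID_* constants instead of the bare 0-4, and the btrfs_can_relocate() hunk below swaps its magic numbers for the same names. The enum order assumed here matches the mapping visible in that hunk (RAID10=0, RAID1=1, DUP=2, RAID0=3); a small sketch with most flag checks elided:

    #include <assert.h>
    #include <stdint.h>

    enum btrfs_raid_types {        /* order assumed from this diff  */
        BTRFS_RAID_RAID10 = 0,
        BTRFS_RAID_RAID1,          /* 1 */
        BTRFS_RAID_DUP,            /* 2 */
        BTRFS_RAID_RAID0,          /* 3 */
        BTRFS_RAID_SINGLE,         /* 4 */
        BTRFS_RAID_RAID5,
        BTRFS_RAID_RAID6,
        BTRFS_NR_RAID_TYPES
    };

    #define BG_RAID10 (1ULL << 6)  /* block group flag bits         */
    #define BG_RAID6  (1ULL << 8)

    static int get_raid_index(uint64_t flags)
    {
        if (flags & BG_RAID10) return BTRFS_RAID_RAID10;
        if (flags & BG_RAID6)  return BTRFS_RAID_RAID6;
        /* ... remaining flag checks elided ... */
        return BTRFS_RAID_SINGLE;
    }

    int main(void)
    {
        assert(get_raid_index(BG_RAID10) == 0);  /* the old magic 0  */
        assert(get_raid_index(BG_RAID6) == BTRFS_RAID_RAID6);
        assert(get_raid_index(0) == BTRFS_RAID_SINGLE);
        return 0;
    }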
| @@ -5649,6 +5831,8 @@ search: | |||
| 5649 | if (!block_group_bits(block_group, data)) { | 5831 | if (!block_group_bits(block_group, data)) { |
| 5650 | u64 extra = BTRFS_BLOCK_GROUP_DUP | | 5832 | u64 extra = BTRFS_BLOCK_GROUP_DUP | |
| 5651 | BTRFS_BLOCK_GROUP_RAID1 | | 5833 | BTRFS_BLOCK_GROUP_RAID1 | |
| 5834 | BTRFS_BLOCK_GROUP_RAID5 | | ||
| 5835 | BTRFS_BLOCK_GROUP_RAID6 | | ||
| 5652 | BTRFS_BLOCK_GROUP_RAID10; | 5836 | BTRFS_BLOCK_GROUP_RAID10; |
| 5653 | 5837 | ||
| 5654 | /* | 5838 | /* |
| @@ -5664,8 +5848,7 @@ have_block_group: | |||
| 5664 | cached = block_group_cache_done(block_group); | 5848 | cached = block_group_cache_done(block_group); |
| 5665 | if (unlikely(!cached)) { | 5849 | if (unlikely(!cached)) { |
| 5666 | found_uncached_bg = true; | 5850 | found_uncached_bg = true; |
| 5667 | ret = cache_block_group(block_group, trans, | 5851 | ret = cache_block_group(block_group, 0); |
| 5668 | orig_root, 0); | ||
| 5669 | BUG_ON(ret < 0); | 5852 | BUG_ON(ret < 0); |
| 5670 | ret = 0; | 5853 | ret = 0; |
| 5671 | } | 5854 | } |
| @@ -5678,6 +5861,7 @@ have_block_group: | |||
| 5678 | * lets look there | 5861 | * lets look there |
| 5679 | */ | 5862 | */ |
| 5680 | if (last_ptr) { | 5863 | if (last_ptr) { |
| 5864 | unsigned long aligned_cluster; | ||
| 5681 | /* | 5865 | /* |
| 5682 | * the refill lock keeps out other | 5866 | * the refill lock keeps out other |
| 5683 | * people trying to start a new cluster | 5867 | * people trying to start a new cluster |
| @@ -5744,11 +5928,15 @@ refill_cluster: | |||
| 5744 | goto unclustered_alloc; | 5928 | goto unclustered_alloc; |
| 5745 | } | 5929 | } |
| 5746 | 5930 | ||
| 5931 | aligned_cluster = max_t(unsigned long, | ||
| 5932 | empty_cluster + empty_size, | ||
| 5933 | block_group->full_stripe_len); | ||
| 5934 | |||
| 5747 | /* allocate a cluster in this block group */ | 5935 | /* allocate a cluster in this block group */ |
| 5748 | ret = btrfs_find_space_cluster(trans, root, | 5936 | ret = btrfs_find_space_cluster(trans, root, |
| 5749 | block_group, last_ptr, | 5937 | block_group, last_ptr, |
| 5750 | search_start, num_bytes, | 5938 | search_start, num_bytes, |
| 5751 | empty_cluster + empty_size); | 5939 | aligned_cluster); |
| 5752 | if (ret == 0) { | 5940 | if (ret == 0) { |
| 5753 | /* | 5941 | /* |
| 5754 | * now pull our allocation out of this | 5942 | * now pull our allocation out of this |
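The aligned_cluster change above makes the cluster request at least one full stripe wide, so raid5/6 allocations do not force read-modify-write cycles on partial stripes. A worked example with an assumed geometry (4-device raid5, 64 KiB stripes, 64 KiB empty_cluster; none of these figures come from the diff):

    #include <stdio.h>

    #define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

    int main(void)
    {
        unsigned long stripe_len      = 64 * 1024;
        unsigned long nr_data_stripes = 3;              /* 4 devs - 1 parity */
        unsigned long full_stripe_len = stripe_len * nr_data_stripes;

        unsigned long empty_cluster = 64 * 1024;
        unsigned long empty_size    = 0;

        unsigned long aligned_cluster = max_t(unsigned long,
                                              empty_cluster + empty_size,
                                              full_stripe_len);

        /* 192 KiB: one full stripe rather than the bare 64 KiB */
        printf("cluster request: %lu KiB\n", aligned_cluster / 1024);
        return 0;
    }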
| @@ -5819,7 +6007,8 @@ unclustered_alloc: | |||
| 5819 | goto loop; | 6007 | goto loop; |
| 5820 | } | 6008 | } |
| 5821 | checks: | 6009 | checks: |
| 5822 | search_start = stripe_align(root, offset); | 6010 | search_start = stripe_align(root, used_block_group, |
| 6011 | offset, num_bytes); | ||
| 5823 | 6012 | ||
| 5824 | /* move on to the next group */ | 6013 | /* move on to the next group */ |
| 5825 | if (search_start + num_bytes > | 6014 | if (search_start + num_bytes > |
| @@ -5970,7 +6159,7 @@ again: | |||
| 5970 | if (ret == -ENOSPC) { | 6159 | if (ret == -ENOSPC) { |
| 5971 | if (!final_tried) { | 6160 | if (!final_tried) { |
| 5972 | num_bytes = num_bytes >> 1; | 6161 | num_bytes = num_bytes >> 1; |
| 5973 | num_bytes = num_bytes & ~(root->sectorsize - 1); | 6162 | num_bytes = round_down(num_bytes, root->sectorsize); |
| 5974 | num_bytes = max(num_bytes, min_alloc_size); | 6163 | num_bytes = max(num_bytes, min_alloc_size); |
| 5975 | if (num_bytes == min_alloc_size) | 6164 | if (num_bytes == min_alloc_size) |
| 5976 | final_tried = true; | 6165 | final_tried = true; |
| @@ -6094,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, | |||
| 6094 | btrfs_mark_buffer_dirty(path->nodes[0]); | 6283 | btrfs_mark_buffer_dirty(path->nodes[0]); |
| 6095 | btrfs_free_path(path); | 6284 | btrfs_free_path(path); |
| 6096 | 6285 | ||
| 6097 | ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); | 6286 | ret = update_block_group(root, ins->objectid, ins->offset, 1); |
| 6098 | if (ret) { /* -ENOENT, logic error */ | 6287 | if (ret) { /* -ENOENT, logic error */ |
| 6099 | printk(KERN_ERR "btrfs update block group failed for %llu " | 6288 | printk(KERN_ERR "btrfs update block group failed for %llu " |
| 6100 | "%llu\n", (unsigned long long)ins->objectid, | 6289 | "%llu\n", (unsigned long long)ins->objectid, |
| @@ -6158,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, | |||
| 6158 | btrfs_mark_buffer_dirty(leaf); | 6347 | btrfs_mark_buffer_dirty(leaf); |
| 6159 | btrfs_free_path(path); | 6348 | btrfs_free_path(path); |
| 6160 | 6349 | ||
| 6161 | ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); | 6350 | ret = update_block_group(root, ins->objectid, ins->offset, 1); |
| 6162 | if (ret) { /* -ENOENT, logic error */ | 6351 | if (ret) { /* -ENOENT, logic error */ |
| 6163 | printk(KERN_ERR "btrfs update block group failed for %llu " | 6352 | printk(KERN_ERR "btrfs update block group failed for %llu " |
| 6164 | "%llu\n", (unsigned long long)ins->objectid, | 6353 | "%llu\n", (unsigned long long)ins->objectid, |
| @@ -6201,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, | |||
| 6201 | u64 num_bytes = ins->offset; | 6390 | u64 num_bytes = ins->offset; |
| 6202 | 6391 | ||
| 6203 | block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); | 6392 | block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); |
| 6204 | cache_block_group(block_group, trans, NULL, 0); | 6393 | cache_block_group(block_group, 0); |
| 6205 | caching_ctl = get_caching_control(block_group); | 6394 | caching_ctl = get_caching_control(block_group); |
| 6206 | 6395 | ||
| 6207 | if (!caching_ctl) { | 6396 | if (!caching_ctl) { |
| @@ -6315,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans, | |||
| 6315 | if (!ret) | 6504 | if (!ret) |
| 6316 | return block_rsv; | 6505 | return block_rsv; |
| 6317 | if (ret && !block_rsv->failfast) { | 6506 | if (ret && !block_rsv->failfast) { |
| 6318 | static DEFINE_RATELIMIT_STATE(_rs, | 6507 | if (btrfs_test_opt(root, ENOSPC_DEBUG)) { |
| 6319 | DEFAULT_RATELIMIT_INTERVAL, | 6508 | static DEFINE_RATELIMIT_STATE(_rs, |
| 6320 | /*DEFAULT_RATELIMIT_BURST*/ 2); | 6509 | DEFAULT_RATELIMIT_INTERVAL * 10, |
| 6321 | if (__ratelimit(&_rs)) | 6510 | /*DEFAULT_RATELIMIT_BURST*/ 1); |
| 6322 | WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", | 6511 | if (__ratelimit(&_rs)) |
| 6323 | ret); | 6512 | WARN(1, KERN_DEBUG |
| 6513 | "btrfs: block rsv returned %d\n", ret); | ||
| 6514 | } | ||
| 6324 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, | 6515 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, |
| 6325 | BTRFS_RESERVE_NO_FLUSH); | 6516 | BTRFS_RESERVE_NO_FLUSH); |
| 6326 | if (!ret) { | 6517 | if (!ret) { |
| @@ -6386,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | |||
| 6386 | 6577 | ||
| 6387 | if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { | 6578 | if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { |
| 6388 | struct btrfs_delayed_extent_op *extent_op; | 6579 | struct btrfs_delayed_extent_op *extent_op; |
| 6389 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); | 6580 | extent_op = btrfs_alloc_delayed_extent_op(); |
| 6390 | BUG_ON(!extent_op); /* -ENOMEM */ | 6581 | BUG_ON(!extent_op); /* -ENOMEM */ |
| 6391 | if (key) | 6582 | if (key) |
| 6392 | memcpy(&extent_op->key, key, sizeof(extent_op->key)); | 6583 | memcpy(&extent_op->key, key, sizeof(extent_op->key)); |
| @@ -7189,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) | |||
| 7189 | root->fs_info->fs_devices->missing_devices; | 7380 | root->fs_info->fs_devices->missing_devices; |
| 7190 | 7381 | ||
| 7191 | stripped = BTRFS_BLOCK_GROUP_RAID0 | | 7382 | stripped = BTRFS_BLOCK_GROUP_RAID0 | |
| 7383 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | | ||
| 7192 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; | 7384 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; |
| 7193 | 7385 | ||
| 7194 | if (num_devices == 1) { | 7386 | if (num_devices == 1) { |
| @@ -7467,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) | |||
| 7467 | index = get_block_group_index(block_group); | 7659 | index = get_block_group_index(block_group); |
| 7468 | } | 7660 | } |
| 7469 | 7661 | ||
| 7470 | if (index == 0) { | 7662 | if (index == BTRFS_RAID_RAID10) { |
| 7471 | dev_min = 4; | 7663 | dev_min = 4; |
| 7472 | /* Divide by 2 */ | 7664 | /* Divide by 2 */ |
| 7473 | min_free >>= 1; | 7665 | min_free >>= 1; |
| 7474 | } else if (index == 1) { | 7666 | } else if (index == BTRFS_RAID_RAID1) { |
| 7475 | dev_min = 2; | 7667 | dev_min = 2; |
| 7476 | } else if (index == 2) { | 7668 | } else if (index == BTRFS_RAID_DUP) { |
| 7477 | /* Multiply by 2 */ | 7669 | /* Multiply by 2 */ |
| 7478 | min_free <<= 1; | 7670 | min_free <<= 1; |
| 7479 | } else if (index == 3) { | 7671 | } else if (index == BTRFS_RAID_RAID0) { |
| 7480 | dev_min = fs_devices->rw_devices; | 7672 | dev_min = fs_devices->rw_devices; |
| 7481 | do_div(min_free, dev_min); | 7673 | do_div(min_free, dev_min); |
| 7482 | } | 7674 | } |
| @@ -7637,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) | |||
| 7637 | space_info = list_entry(info->space_info.next, | 7829 | space_info = list_entry(info->space_info.next, |
| 7638 | struct btrfs_space_info, | 7830 | struct btrfs_space_info, |
| 7639 | list); | 7831 | list); |
| 7640 | if (space_info->bytes_pinned > 0 || | 7832 | if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { |
| 7641 | space_info->bytes_reserved > 0 || | 7833 | if (space_info->bytes_pinned > 0 || |
| 7642 | space_info->bytes_may_use > 0) { | 7834 | space_info->bytes_reserved > 0 || |
| 7643 | WARN_ON(1); | 7835 | space_info->bytes_may_use > 0) { |
| 7644 | dump_space_info(space_info, 0, 0); | 7836 | WARN_ON(1); |
| 7837 | dump_space_info(space_info, 0, 0); | ||
| 7838 | } | ||
| 7645 | } | 7839 | } |
| 7646 | list_del(&space_info->list); | 7840 | list_del(&space_info->list); |
| 7647 | kfree(space_info); | 7841 | kfree(space_info); |
| @@ -7740,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) | |||
| 7740 | btrfs_release_path(path); | 7934 | btrfs_release_path(path); |
| 7741 | cache->flags = btrfs_block_group_flags(&cache->item); | 7935 | cache->flags = btrfs_block_group_flags(&cache->item); |
| 7742 | cache->sectorsize = root->sectorsize; | 7936 | cache->sectorsize = root->sectorsize; |
| 7743 | 7937 | cache->full_stripe_len = btrfs_full_stripe_len(root, | |
| 7938 | &root->fs_info->mapping_tree, | ||
| 7939 | found_key.objectid); | ||
| 7744 | btrfs_init_free_space_ctl(cache); | 7940 | btrfs_init_free_space_ctl(cache); |
| 7745 | 7941 | ||
| 7746 | /* | 7942 | /* |
| @@ -7794,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) | |||
| 7794 | if (!(get_alloc_profile(root, space_info->flags) & | 7990 | if (!(get_alloc_profile(root, space_info->flags) & |
| 7795 | (BTRFS_BLOCK_GROUP_RAID10 | | 7991 | (BTRFS_BLOCK_GROUP_RAID10 | |
| 7796 | BTRFS_BLOCK_GROUP_RAID1 | | 7992 | BTRFS_BLOCK_GROUP_RAID1 | |
| 7993 | BTRFS_BLOCK_GROUP_RAID5 | | ||
| 7994 | BTRFS_BLOCK_GROUP_RAID6 | | ||
| 7797 | BTRFS_BLOCK_GROUP_DUP))) | 7995 | BTRFS_BLOCK_GROUP_DUP))) |
| 7798 | continue; | 7996 | continue; |
| 7799 | /* | 7997 | /* |
| @@ -7869,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, | |||
| 7869 | cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; | 8067 | cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; |
| 7870 | cache->sectorsize = root->sectorsize; | 8068 | cache->sectorsize = root->sectorsize; |
| 7871 | cache->fs_info = root->fs_info; | 8069 | cache->fs_info = root->fs_info; |
| 8070 | cache->full_stripe_len = btrfs_full_stripe_len(root, | ||
| 8071 | &root->fs_info->mapping_tree, | ||
| 8072 | chunk_offset); | ||
| 7872 | 8073 | ||
| 7873 | atomic_set(&cache->count, 1); | 8074 | atomic_set(&cache->count, 1); |
| 7874 | spin_lock_init(&cache->lock); | 8075 | spin_lock_init(&cache->lock); |
| @@ -7918,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) | |||
| 7918 | u64 extra_flags = chunk_to_extended(flags) & | 8119 | u64 extra_flags = chunk_to_extended(flags) & |
| 7919 | BTRFS_EXTENDED_PROFILE_MASK; | 8120 | BTRFS_EXTENDED_PROFILE_MASK; |
| 7920 | 8121 | ||
| 8122 | write_seqlock(&fs_info->profiles_lock); | ||
| 7921 | if (flags & BTRFS_BLOCK_GROUP_DATA) | 8123 | if (flags & BTRFS_BLOCK_GROUP_DATA) |
| 7922 | fs_info->avail_data_alloc_bits &= ~extra_flags; | 8124 | fs_info->avail_data_alloc_bits &= ~extra_flags; |
| 7923 | if (flags & BTRFS_BLOCK_GROUP_METADATA) | 8125 | if (flags & BTRFS_BLOCK_GROUP_METADATA) |
| 7924 | fs_info->avail_metadata_alloc_bits &= ~extra_flags; | 8126 | fs_info->avail_metadata_alloc_bits &= ~extra_flags; |
| 7925 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | 8127 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) |
| 7926 | fs_info->avail_system_alloc_bits &= ~extra_flags; | 8128 | fs_info->avail_system_alloc_bits &= ~extra_flags; |
| 8129 | write_sequnlock(&fs_info->profiles_lock); | ||
| 7927 | } | 8130 | } |
| 7928 | 8131 | ||
| 7929 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | 8132 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, |
| @@ -8022,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
| 8022 | spin_lock(&root->fs_info->block_group_cache_lock); | 8225 | spin_lock(&root->fs_info->block_group_cache_lock); |
| 8023 | rb_erase(&block_group->cache_node, | 8226 | rb_erase(&block_group->cache_node, |
| 8024 | &root->fs_info->block_group_cache_tree); | 8227 | &root->fs_info->block_group_cache_tree); |
| 8228 | |||
| 8229 | if (root->fs_info->first_logical_byte == block_group->key.objectid) | ||
| 8230 | root->fs_info->first_logical_byte = (u64)-1; | ||
| 8025 | spin_unlock(&root->fs_info->block_group_cache_lock); | 8231 | spin_unlock(&root->fs_info->block_group_cache_lock); |
| 8026 | 8232 | ||
| 8027 | down_write(&block_group->space_info->groups_sem); | 8233 | down_write(&block_group->space_info->groups_sem); |
| @@ -8144,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) | |||
| 8144 | 8350 | ||
| 8145 | if (end - start >= range->minlen) { | 8351 | if (end - start >= range->minlen) { |
| 8146 | if (!block_group_cache_done(cache)) { | 8352 | if (!block_group_cache_done(cache)) { |
| 8147 | ret = cache_block_group(cache, NULL, root, 0); | 8353 | ret = cache_block_group(cache, 0); |
| 8148 | if (!ret) | 8354 | if (!ret) |
| 8149 | wait_block_group_cache_done(cache); | 8355 | wait_block_group_cache_done(cache); |
| 8150 | } | 8356 | } |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1b319df29eee..f173c5af6461 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
| @@ -4,7 +4,6 @@ | |||
| 4 | #include <linux/mm.h> | 4 | #include <linux/mm.h> |
| 5 | #include <linux/pagemap.h> | 5 | #include <linux/pagemap.h> |
| 6 | #include <linux/page-flags.h> | 6 | #include <linux/page-flags.h> |
| 7 | #include <linux/module.h> | ||
| 8 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
| 9 | #include <linux/blkdev.h> | 8 | #include <linux/blkdev.h> |
| 10 | #include <linux/swap.h> | 9 | #include <linux/swap.h> |
| @@ -1834,7 +1833,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, | |||
| 1834 | */ | 1833 | */ |
| 1835 | static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) | 1834 | static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) |
| 1836 | { | 1835 | { |
| 1837 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 1836 | u64 start = page_offset(page); |
| 1838 | u64 end = start + PAGE_CACHE_SIZE - 1; | 1837 | u64 end = start + PAGE_CACHE_SIZE - 1; |
| 1839 | if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) | 1838 | if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) |
| 1840 | SetPageUptodate(page); | 1839 | SetPageUptodate(page); |
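The repeated open-coded shift is replaced by page_offset() here and throughout this file. For reference, the helper (paraphrased from include/linux/pagemap.h of this era) is the same computation behind a name:

    /* paraphrased from include/linux/pagemap.h */
    static inline loff_t page_offset(struct page *page)
    {
            return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
    }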
| @@ -1846,7 +1845,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) | |||
| 1846 | */ | 1845 | */ |
| 1847 | static void check_page_locked(struct extent_io_tree *tree, struct page *page) | 1846 | static void check_page_locked(struct extent_io_tree *tree, struct page *page) |
| 1848 | { | 1847 | { |
| 1849 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 1848 | u64 start = page_offset(page); |
| 1850 | u64 end = start + PAGE_CACHE_SIZE - 1; | 1849 | u64 end = start + PAGE_CACHE_SIZE - 1; |
| 1851 | if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) | 1850 | if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) |
| 1852 | unlock_page(page); | 1851 | unlock_page(page); |
| @@ -1895,13 +1894,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, | |||
| 1895 | if (ret) | 1894 | if (ret) |
| 1896 | err = ret; | 1895 | err = ret; |
| 1897 | 1896 | ||
| 1898 | if (did_repair) { | 1897 | ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, |
| 1899 | ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, | 1898 | rec->start + rec->len - 1, |
| 1900 | rec->start + rec->len - 1, | 1899 | EXTENT_DAMAGED, GFP_NOFS); |
| 1901 | EXTENT_DAMAGED, GFP_NOFS); | 1900 | if (ret && !err) |
| 1902 | if (ret && !err) | 1901 | err = ret; |
| 1903 | err = ret; | ||
| 1904 | } | ||
| 1905 | 1902 | ||
| 1906 | kfree(rec); | 1903 | kfree(rec); |
| 1907 | return err; | 1904 | return err; |
| @@ -1932,10 +1929,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, | |||
| 1932 | u64 map_length = 0; | 1929 | u64 map_length = 0; |
| 1933 | u64 sector; | 1930 | u64 sector; |
| 1934 | struct btrfs_bio *bbio = NULL; | 1931 | struct btrfs_bio *bbio = NULL; |
| 1932 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | ||
| 1935 | int ret; | 1933 | int ret; |
| 1936 | 1934 | ||
| 1937 | BUG_ON(!mirror_num); | 1935 | BUG_ON(!mirror_num); |
| 1938 | 1936 | ||
| 1937 | /* we can't repair anything in raid56 yet */ | ||
| 1938 | if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) | ||
| 1939 | return 0; | ||
| 1940 | |||
| 1939 | bio = bio_alloc(GFP_NOFS, 1); | 1941 | bio = bio_alloc(GFP_NOFS, 1); |
| 1940 | if (!bio) | 1942 | if (!bio) |
| 1941 | return -EIO; | 1943 | return -EIO; |
| @@ -1960,7 +1962,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, | |||
| 1960 | return -EIO; | 1962 | return -EIO; |
| 1961 | } | 1963 | } |
| 1962 | bio->bi_bdev = dev->bdev; | 1964 | bio->bi_bdev = dev->bdev; |
| 1963 | bio_add_page(bio, page, length, start-page_offset(page)); | 1965 | bio_add_page(bio, page, length, start - page_offset(page)); |
| 1964 | btrfsic_submit_bio(WRITE_SYNC, bio); | 1966 | btrfsic_submit_bio(WRITE_SYNC, bio); |
| 1965 | wait_for_completion(&compl); | 1967 | wait_for_completion(&compl); |
| 1966 | 1968 | ||
| @@ -2052,6 +2054,7 @@ static int clean_io_failure(u64 start, struct page *page) | |||
| 2052 | failrec->failed_mirror); | 2054 | failrec->failed_mirror); |
| 2053 | did_repair = !ret; | 2055 | did_repair = !ret; |
| 2054 | } | 2056 | } |
| 2057 | ret = 0; | ||
| 2055 | } | 2058 | } |
| 2056 | 2059 | ||
| 2057 | out: | 2060 | out: |
| @@ -2293,8 +2296,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) | |||
| 2293 | struct page *page = bvec->bv_page; | 2296 | struct page *page = bvec->bv_page; |
| 2294 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 2297 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
| 2295 | 2298 | ||
| 2296 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + | 2299 | start = page_offset(page) + bvec->bv_offset; |
| 2297 | bvec->bv_offset; | ||
| 2298 | end = start + bvec->bv_len - 1; | 2300 | end = start + bvec->bv_len - 1; |
| 2299 | 2301 | ||
| 2300 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) | 2302 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) |
| @@ -2353,8 +2355,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) | |||
| 2353 | (long int)bio->bi_bdev); | 2355 | (long int)bio->bi_bdev); |
| 2354 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 2356 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
| 2355 | 2357 | ||
| 2356 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + | 2358 | start = page_offset(page) + bvec->bv_offset; |
| 2357 | bvec->bv_offset; | ||
| 2358 | end = start + bvec->bv_len - 1; | 2359 | end = start + bvec->bv_len - 1; |
| 2359 | 2360 | ||
| 2360 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) | 2361 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) |
| @@ -2471,7 +2472,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, | |||
| 2471 | struct extent_io_tree *tree = bio->bi_private; | 2472 | struct extent_io_tree *tree = bio->bi_private; |
| 2472 | u64 start; | 2473 | u64 start; |
| 2473 | 2474 | ||
| 2474 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; | 2475 | start = page_offset(page) + bvec->bv_offset; |
| 2475 | 2476 | ||
| 2476 | bio->bi_private = NULL; | 2477 | bio->bi_private = NULL; |
| 2477 | 2478 | ||
| @@ -2489,13 +2490,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, | |||
| 2489 | return ret; | 2490 | return ret; |
| 2490 | } | 2491 | } |
| 2491 | 2492 | ||
| 2492 | static int merge_bio(struct extent_io_tree *tree, struct page *page, | 2493 | static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, |
| 2493 | unsigned long offset, size_t size, struct bio *bio, | 2494 | unsigned long offset, size_t size, struct bio *bio, |
| 2494 | unsigned long bio_flags) | 2495 | unsigned long bio_flags) |
| 2495 | { | 2496 | { |
| 2496 | int ret = 0; | 2497 | int ret = 0; |
| 2497 | if (tree->ops && tree->ops->merge_bio_hook) | 2498 | if (tree->ops && tree->ops->merge_bio_hook) |
| 2498 | ret = tree->ops->merge_bio_hook(page, offset, size, bio, | 2499 | ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, |
| 2499 | bio_flags); | 2500 | bio_flags); |
| 2500 | BUG_ON(ret < 0); | 2501 | BUG_ON(ret < 0); |
| 2501 | return ret; | 2502 | return ret; |
| @@ -2530,7 +2531,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, | |||
| 2530 | sector; | 2531 | sector; |
| 2531 | 2532 | ||
| 2532 | if (prev_bio_flags != bio_flags || !contig || | 2533 | if (prev_bio_flags != bio_flags || !contig || |
| 2533 | merge_bio(tree, page, offset, page_size, bio, bio_flags) || | 2534 | merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || |
| 2534 | bio_add_page(bio, page, page_size, offset) < page_size) { | 2535 | bio_add_page(bio, page, page_size, offset) < page_size) { |
| 2535 | ret = submit_one_bio(rw, bio, mirror_num, | 2536 | ret = submit_one_bio(rw, bio, mirror_num, |
| 2536 | prev_bio_flags); | 2537 | prev_bio_flags); |
| @@ -2595,7 +2596,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
| 2595 | unsigned long *bio_flags) | 2596 | unsigned long *bio_flags) |
| 2596 | { | 2597 | { |
| 2597 | struct inode *inode = page->mapping->host; | 2598 | struct inode *inode = page->mapping->host; |
| 2598 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 2599 | u64 start = page_offset(page); |
| 2599 | u64 page_end = start + PAGE_CACHE_SIZE - 1; | 2600 | u64 page_end = start + PAGE_CACHE_SIZE - 1; |
| 2600 | u64 end; | 2601 | u64 end; |
| 2601 | u64 cur = start; | 2602 | u64 cur = start; |
| @@ -2648,6 +2649,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
| 2648 | } | 2649 | } |
| 2649 | } | 2650 | } |
| 2650 | while (cur <= end) { | 2651 | while (cur <= end) { |
| 2652 | unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; | ||
| 2653 | |||
| 2651 | if (cur >= last_byte) { | 2654 | if (cur >= last_byte) { |
| 2652 | char *userpage; | 2655 | char *userpage; |
| 2653 | struct extent_state *cached = NULL; | 2656 | struct extent_state *cached = NULL; |
| @@ -2682,7 +2685,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
| 2682 | 2685 | ||
| 2683 | iosize = min(extent_map_end(em) - cur, end - cur + 1); | 2686 | iosize = min(extent_map_end(em) - cur, end - cur + 1); |
| 2684 | cur_end = min(extent_map_end(em) - 1, end); | 2687 | cur_end = min(extent_map_end(em) - 1, end); |
| 2685 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); | 2688 | iosize = ALIGN(iosize, blocksize); |
| 2686 | if (this_bio_flag & EXTENT_BIO_COMPRESSED) { | 2689 | if (this_bio_flag & EXTENT_BIO_COMPRESSED) { |
| 2687 | disk_io_size = em->block_len; | 2690 | disk_io_size = em->block_len; |
| 2688 | sector = em->block_start >> 9; | 2691 | sector = em->block_start >> 9; |
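ALIGN() replaces the open-coded round-up mask in this hunk and several below. Paraphrased from include/linux/kernel.h, it is the same arithmetic, valid when the alignment is a power of two:

    /* paraphrased from include/linux/kernel.h; 'a' must be a power of two */
    #define __ALIGN_KERNEL_MASK(x, mask)    (((x) + (mask)) & ~(mask))
    #define ALIGN(x, a)     __ALIGN_KERNEL_MASK((x), (typeof(x))(a) - 1)

    /* e.g. ALIGN(5000, 4096) == 8192, ALIGN(4096, 4096) == 4096 */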
| @@ -2735,26 +2738,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
| 2735 | continue; | 2738 | continue; |
| 2736 | } | 2739 | } |
| 2737 | 2740 | ||
| 2738 | ret = 0; | 2741 | pnr -= page->index; |
| 2739 | if (tree->ops && tree->ops->readpage_io_hook) { | 2742 | ret = submit_extent_page(READ, tree, page, |
| 2740 | ret = tree->ops->readpage_io_hook(page, cur, | ||
| 2741 | cur + iosize - 1); | ||
| 2742 | } | ||
| 2743 | if (!ret) { | ||
| 2744 | unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; | ||
| 2745 | pnr -= page->index; | ||
| 2746 | ret = submit_extent_page(READ, tree, page, | ||
| 2747 | sector, disk_io_size, pg_offset, | 2743 | sector, disk_io_size, pg_offset, |
| 2748 | bdev, bio, pnr, | 2744 | bdev, bio, pnr, |
| 2749 | end_bio_extent_readpage, mirror_num, | 2745 | end_bio_extent_readpage, mirror_num, |
| 2750 | *bio_flags, | 2746 | *bio_flags, |
| 2751 | this_bio_flag); | 2747 | this_bio_flag); |
| 2752 | if (!ret) { | 2748 | if (!ret) { |
| 2753 | nr++; | 2749 | nr++; |
| 2754 | *bio_flags = this_bio_flag; | 2750 | *bio_flags = this_bio_flag; |
| 2755 | } | 2751 | } else { |
| 2756 | } | ||
| 2757 | if (ret) { | ||
| 2758 | SetPageError(page); | 2752 | SetPageError(page); |
| 2759 | unlock_extent(tree, cur, cur + iosize - 1); | 2753 | unlock_extent(tree, cur, cur + iosize - 1); |
| 2760 | } | 2754 | } |
| @@ -2806,7 +2800,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
| 2806 | struct inode *inode = page->mapping->host; | 2800 | struct inode *inode = page->mapping->host; |
| 2807 | struct extent_page_data *epd = data; | 2801 | struct extent_page_data *epd = data; |
| 2808 | struct extent_io_tree *tree = epd->tree; | 2802 | struct extent_io_tree *tree = epd->tree; |
| 2809 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 2803 | u64 start = page_offset(page); |
| 2810 | u64 delalloc_start; | 2804 | u64 delalloc_start; |
| 2811 | u64 page_end = start + PAGE_CACHE_SIZE - 1; | 2805 | u64 page_end = start + PAGE_CACHE_SIZE - 1; |
| 2812 | u64 end; | 2806 | u64 end; |
| @@ -2982,7 +2976,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
| 2982 | BUG_ON(extent_map_end(em) <= cur); | 2976 | BUG_ON(extent_map_end(em) <= cur); |
| 2983 | BUG_ON(end < cur); | 2977 | BUG_ON(end < cur); |
| 2984 | iosize = min(extent_map_end(em) - cur, end - cur + 1); | 2978 | iosize = min(extent_map_end(em) - cur, end - cur + 1); |
| 2985 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); | 2979 | iosize = ALIGN(iosize, blocksize); |
| 2986 | sector = (em->block_start + extent_offset) >> 9; | 2980 | sector = (em->block_start + extent_offset) >> 9; |
| 2987 | bdev = em->bdev; | 2981 | bdev = em->bdev; |
| 2988 | block_start = em->block_start; | 2982 | block_start = em->block_start; |
| @@ -3124,12 +3118,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, | |||
| 3124 | set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); | 3118 | set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); |
| 3125 | spin_unlock(&eb->refs_lock); | 3119 | spin_unlock(&eb->refs_lock); |
| 3126 | btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); | 3120 | btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); |
| 3127 | spin_lock(&fs_info->delalloc_lock); | 3121 | __percpu_counter_add(&fs_info->dirty_metadata_bytes, |
| 3128 | if (fs_info->dirty_metadata_bytes >= eb->len) | 3122 | -eb->len, |
| 3129 | fs_info->dirty_metadata_bytes -= eb->len; | 3123 | fs_info->dirty_metadata_batch); |
| 3130 | else | ||
| 3131 | WARN_ON(1); | ||
| 3132 | spin_unlock(&fs_info->delalloc_lock); | ||
| 3133 | ret = 1; | 3124 | ret = 1; |
| 3134 | } else { | 3125 | } else { |
| 3135 | spin_unlock(&eb->refs_lock); | 3126 | spin_unlock(&eb->refs_lock); |
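This trades the delalloc_lock-protected u64 for a percpu counter; dirty_metadata_batch bounds how far a CPU-local delta may drift before being folded into the shared count. A sketch of the pattern under the 3.8-era API, with the init call assumed (it is not part of this hunk):

    /* setup, presumably in open_ctree(); count starts at 0 */
    percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);

    /* hot path: touches only a per-CPU delta until it exceeds the batch */
    __percpu_counter_add(&fs_info->dirty_metadata_bytes, -eb->len,
                         fs_info->dirty_metadata_batch);

    /* slow path: exact non-negative sum across all CPUs */
    s64 dirty = percpu_counter_sum_positive(&fs_info->dirty_metadata_bytes);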
| @@ -3446,15 +3437,9 @@ retry: | |||
| 3446 | * swizzled back from swapper_space to tmpfs file | 3437 | * swizzled back from swapper_space to tmpfs file |
| 3447 | * mapping | 3438 | * mapping |
| 3448 | */ | 3439 | */ |
| 3449 | if (tree->ops && | 3440 | if (!trylock_page(page)) { |
| 3450 | tree->ops->write_cache_pages_lock_hook) { | 3441 | flush_fn(data); |
| 3451 | tree->ops->write_cache_pages_lock_hook(page, | 3442 | lock_page(page); |
| 3452 | data, flush_fn); | ||
| 3453 | } else { | ||
| 3454 | if (!trylock_page(page)) { | ||
| 3455 | flush_fn(data); | ||
| 3456 | lock_page(page); | ||
| 3457 | } | ||
| 3458 | } | 3443 | } |
| 3459 | 3444 | ||
| 3460 | if (unlikely(page->mapping != mapping)) { | 3445 | if (unlikely(page->mapping != mapping)) { |
| @@ -3674,11 +3659,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, | |||
| 3674 | struct page *page, unsigned long offset) | 3659 | struct page *page, unsigned long offset) |
| 3675 | { | 3660 | { |
| 3676 | struct extent_state *cached_state = NULL; | 3661 | struct extent_state *cached_state = NULL; |
| 3677 | u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); | 3662 | u64 start = page_offset(page); |
| 3678 | u64 end = start + PAGE_CACHE_SIZE - 1; | 3663 | u64 end = start + PAGE_CACHE_SIZE - 1; |
| 3679 | size_t blocksize = page->mapping->host->i_sb->s_blocksize; | 3664 | size_t blocksize = page->mapping->host->i_sb->s_blocksize; |
| 3680 | 3665 | ||
| 3681 | start += (offset + blocksize - 1) & ~(blocksize - 1); | 3666 | start += ALIGN(offset, blocksize); |
| 3682 | if (start > end) | 3667 | if (start > end) |
| 3683 | return 0; | 3668 | return 0; |
| 3684 | 3669 | ||
| @@ -3700,7 +3685,7 @@ int try_release_extent_state(struct extent_map_tree *map, | |||
| 3700 | struct extent_io_tree *tree, struct page *page, | 3685 | struct extent_io_tree *tree, struct page *page, |
| 3701 | gfp_t mask) | 3686 | gfp_t mask) |
| 3702 | { | 3687 | { |
| 3703 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 3688 | u64 start = page_offset(page); |
| 3704 | u64 end = start + PAGE_CACHE_SIZE - 1; | 3689 | u64 end = start + PAGE_CACHE_SIZE - 1; |
| 3705 | int ret = 1; | 3690 | int ret = 1; |
| 3706 | 3691 | ||
| @@ -3739,7 +3724,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, | |||
| 3739 | gfp_t mask) | 3724 | gfp_t mask) |
| 3740 | { | 3725 | { |
| 3741 | struct extent_map *em; | 3726 | struct extent_map *em; |
| 3742 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 3727 | u64 start = page_offset(page); |
| 3743 | u64 end = start + PAGE_CACHE_SIZE - 1; | 3728 | u64 end = start + PAGE_CACHE_SIZE - 1; |
| 3744 | 3729 | ||
| 3745 | if ((mask & __GFP_WAIT) && | 3730 | if ((mask & __GFP_WAIT) && |
| @@ -3797,7 +3782,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, | |||
| 3797 | len = last - offset; | 3782 | len = last - offset; |
| 3798 | if (len == 0) | 3783 | if (len == 0) |
| 3799 | break; | 3784 | break; |
| 3800 | len = (len + sectorsize - 1) & ~(sectorsize - 1); | 3785 | len = ALIGN(len, sectorsize); |
| 3801 | em = get_extent(inode, NULL, 0, offset, len, 0); | 3786 | em = get_extent(inode, NULL, 0, offset, len, 0); |
| 3802 | if (IS_ERR_OR_NULL(em)) | 3787 | if (IS_ERR_OR_NULL(em)) |
| 3803 | return em; | 3788 | return em; |
| @@ -3995,8 +3980,6 @@ static void __free_extent_buffer(struct extent_buffer *eb) | |||
| 3995 | list_del(&eb->leak_list); | 3980 | list_del(&eb->leak_list); |
| 3996 | spin_unlock_irqrestore(&leak_lock, flags); | 3981 | spin_unlock_irqrestore(&leak_lock, flags); |
| 3997 | #endif | 3982 | #endif |
| 3998 | if (eb->pages && eb->pages != eb->inline_pages) | ||
| 3999 | kfree(eb->pages); | ||
| 4000 | kmem_cache_free(extent_buffer_cache, eb); | 3983 | kmem_cache_free(extent_buffer_cache, eb); |
| 4001 | } | 3984 | } |
| 4002 | 3985 | ||
| @@ -4037,19 +4020,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, | |||
| 4037 | atomic_set(&eb->refs, 1); | 4020 | atomic_set(&eb->refs, 1); |
| 4038 | atomic_set(&eb->io_pages, 0); | 4021 | atomic_set(&eb->io_pages, 0); |
| 4039 | 4022 | ||
| 4040 | if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { | 4023 | /* |
| 4041 | struct page **pages; | 4024 | * Sanity checks, currently the maximum is 64k covered by 16x 4k pages |
| 4042 | int num_pages = (len + PAGE_CACHE_SIZE - 1) >> | 4025 | */ |
| 4043 | PAGE_CACHE_SHIFT; | 4026 | BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE |
| 4044 | pages = kzalloc(num_pages, mask); | 4027 | > MAX_INLINE_EXTENT_BUFFER_SIZE); |
| 4045 | if (!pages) { | 4028 | BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); |
| 4046 | __free_extent_buffer(eb); | ||
| 4047 | return NULL; | ||
| 4048 | } | ||
| 4049 | eb->pages = pages; | ||
| 4050 | } else { | ||
| 4051 | eb->pages = eb->inline_pages; | ||
| 4052 | } | ||
| 4053 | 4029 | ||
| 4054 | return eb; | 4030 | return eb; |
| 4055 | } | 4031 | } |
| @@ -4180,6 +4156,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) | |||
| 4180 | 4156 | ||
| 4181 | static void check_buffer_tree_ref(struct extent_buffer *eb) | 4157 | static void check_buffer_tree_ref(struct extent_buffer *eb) |
| 4182 | { | 4158 | { |
| 4159 | int refs; | ||
| 4183 | /* the ref bit is tricky. We have to make sure it is set | 4160 | /* the ref bit is tricky. We have to make sure it is set |
| 4184 | * if we have the buffer dirty. Otherwise the | 4161 | * if we have the buffer dirty. Otherwise the |
| 4185 | * code to free a buffer can end up dropping a dirty | 4162 | * code to free a buffer can end up dropping a dirty |
| @@ -4200,6 +4177,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) | |||
| 4200 | * So bump the ref count first, then set the bit. If someone | 4177 | * So bump the ref count first, then set the bit. If someone |
| 4201 | * beat us to it, drop the ref we added. | 4178 | * beat us to it, drop the ref we added. |
| 4202 | */ | 4179 | */ |
| 4180 | refs = atomic_read(&eb->refs); | ||
| 4181 | if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) | ||
| 4182 | return; | ||
| 4183 | |||
| 4203 | spin_lock(&eb->refs_lock); | 4184 | spin_lock(&eb->refs_lock); |
| 4204 | if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) | 4185 | if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) |
| 4205 | atomic_inc(&eb->refs); | 4186 | atomic_inc(&eb->refs); |
| @@ -4401,9 +4382,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask) | |||
| 4401 | 4382 | ||
| 4402 | void free_extent_buffer(struct extent_buffer *eb) | 4383 | void free_extent_buffer(struct extent_buffer *eb) |
| 4403 | { | 4384 | { |
| 4385 | int refs; | ||
| 4386 | int old; | ||
| 4404 | if (!eb) | 4387 | if (!eb) |
| 4405 | return; | 4388 | return; |
| 4406 | 4389 | ||
| 4390 | while (1) { | ||
| 4391 | refs = atomic_read(&eb->refs); | ||
| 4392 | if (refs <= 3) | ||
| 4393 | break; | ||
| 4394 | old = atomic_cmpxchg(&eb->refs, refs, refs - 1); | ||
| 4395 | if (old == refs) | ||
| 4396 | return; | ||
| 4397 | } | ||
| 4398 | |||
| 4407 | spin_lock(&eb->refs_lock); | 4399 | spin_lock(&eb->refs_lock); |
| 4408 | if (atomic_read(&eb->refs) == 2 && | 4400 | if (atomic_read(&eb->refs) == 2 && |
| 4409 | test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) | 4401 | test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) |
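The new loop in free_extent_buffer() drops a reference without taking refs_lock whenever the count is comfortably above the release threshold, and falls back to the locked path only near it. The same compare-and-swap idiom in isolation (a hypothetical helper, not part of the patch):

    static bool fast_ref_put(atomic_t *refs, int threshold)
    {
            int cur = atomic_read(refs);

            while (cur > threshold) {
                    int old = atomic_cmpxchg(refs, cur, cur - 1);

                    if (old == cur)
                            return true;    /* decremented locklessly */
                    cur = old;              /* lost a race; retry */
            }
            return false;                   /* caller takes refs_lock */
    }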
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2eacfabd3263..6068a1985560 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h | |||
| @@ -72,10 +72,9 @@ struct extent_io_ops { | |||
| 72 | int (*writepage_start_hook)(struct page *page, u64 start, u64 end); | 72 | int (*writepage_start_hook)(struct page *page, u64 start, u64 end); |
| 73 | int (*writepage_io_hook)(struct page *page, u64 start, u64 end); | 73 | int (*writepage_io_hook)(struct page *page, u64 start, u64 end); |
| 74 | extent_submit_bio_hook_t *submit_bio_hook; | 74 | extent_submit_bio_hook_t *submit_bio_hook; |
| 75 | int (*merge_bio_hook)(struct page *page, unsigned long offset, | 75 | int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, |
| 76 | size_t size, struct bio *bio, | 76 | size_t size, struct bio *bio, |
| 77 | unsigned long bio_flags); | 77 | unsigned long bio_flags); |
| 78 | int (*readpage_io_hook)(struct page *page, u64 start, u64 end); | ||
| 79 | int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); | 78 | int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); |
| 80 | int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, | 79 | int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, |
| 81 | struct extent_state *state, int mirror); | 80 | struct extent_state *state, int mirror); |
| @@ -90,8 +89,6 @@ struct extent_io_ops { | |||
| 90 | struct extent_state *other); | 89 | struct extent_state *other); |
| 91 | void (*split_extent_hook)(struct inode *inode, | 90 | void (*split_extent_hook)(struct inode *inode, |
| 92 | struct extent_state *orig, u64 split); | 91 | struct extent_state *orig, u64 split); |
| 93 | int (*write_cache_pages_lock_hook)(struct page *page, void *data, | ||
| 94 | void (*flush_fn)(void *)); | ||
| 95 | }; | 92 | }; |
| 96 | 93 | ||
| 97 | struct extent_io_tree { | 94 | struct extent_io_tree { |
| @@ -161,8 +158,7 @@ struct extent_buffer { | |||
| 161 | */ | 158 | */ |
| 162 | wait_queue_head_t read_lock_wq; | 159 | wait_queue_head_t read_lock_wq; |
| 163 | wait_queue_head_t lock_wq; | 160 | wait_queue_head_t lock_wq; |
| 164 | struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; | 161 | struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; |
| 165 | struct page **pages; | ||
| 166 | }; | 162 | }; |
| 167 | 163 | ||
| 168 | static inline void extent_set_compress_type(unsigned long *bio_flags, | 164 | static inline void extent_set_compress_type(unsigned long *bio_flags, |
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index fdb7a8db3b57..2834ca5768ea 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c | |||
| @@ -1,6 +1,5 @@ | |||
| 1 | #include <linux/err.h> | 1 | #include <linux/err.h> |
| 2 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
| 3 | #include <linux/module.h> | ||
| 4 | #include <linux/spinlock.h> | 3 | #include <linux/spinlock.h> |
| 5 | #include <linux/hardirq.h> | 4 | #include <linux/hardirq.h> |
| 6 | #include "ctree.h" | 5 | #include "ctree.h" |
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 94aa53b38721..ec160202be3e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c | |||
| @@ -684,6 +684,24 @@ out: | |||
| 684 | return ret; | 684 | return ret; |
| 685 | } | 685 | } |
| 686 | 686 | ||
| 687 | static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums, | ||
| 688 | struct btrfs_sector_sum *sector_sum, | ||
| 689 | u64 total_bytes, u64 sectorsize) | ||
| 690 | { | ||
| 691 | u64 tmp = sectorsize; | ||
| 692 | u64 next_sector = sector_sum->bytenr; | ||
| 693 | struct btrfs_sector_sum *next = sector_sum + 1; | ||
| 694 | |||
| 695 | while ((tmp + total_bytes) < sums->len) { | ||
| 696 | if (next_sector + sectorsize != next->bytenr) | ||
| 697 | break; | ||
| 698 | tmp += sectorsize; | ||
| 699 | next_sector = next->bytenr; | ||
| 700 | next++; | ||
| 701 | } | ||
| 702 | return tmp; | ||
| 703 | } | ||
| 704 | |||
| 687 | int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, | 705 | int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, |
| 688 | struct btrfs_root *root, | 706 | struct btrfs_root *root, |
| 689 | struct btrfs_ordered_sum *sums) | 707 | struct btrfs_ordered_sum *sums) |
| @@ -789,20 +807,32 @@ again: | |||
| 789 | goto insert; | 807 | goto insert; |
| 790 | } | 808 | } |
| 791 | 809 | ||
| 792 | if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / | 810 | if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / |
| 793 | csum_size) { | 811 | csum_size) { |
| 794 | u32 diff = (csum_offset + 1) * csum_size; | 812 | int extend_nr; |
| 813 | u64 tmp; | ||
| 814 | u32 diff; | ||
| 815 | u32 free_space; | ||
| 795 | 816 | ||
| 796 | /* | 817 | if (btrfs_leaf_free_space(root, leaf) < |
| 797 | * is the item big enough already? we dropped our lock | 818 | sizeof(struct btrfs_item) + csum_size * 2) |
| 798 | * before and need to recheck | 819 | goto insert; |
| 799 | */ | 820 | |
| 800 | if (diff < btrfs_item_size_nr(leaf, path->slots[0])) | 821 | free_space = btrfs_leaf_free_space(root, leaf) - |
| 801 | goto csum; | 822 | sizeof(struct btrfs_item) - csum_size; |
| 823 | tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, | ||
| 824 | root->sectorsize); | ||
| 825 | tmp >>= root->fs_info->sb->s_blocksize_bits; | ||
| 826 | WARN_ON(tmp < 1); | ||
| 827 | |||
| 828 | extend_nr = max_t(int, 1, (int)tmp); | ||
| 829 | diff = (csum_offset + extend_nr) * csum_size; | ||
| 830 | diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size); | ||
| 802 | 831 | ||
| 803 | diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); | 832 | diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); |
| 804 | if (diff != csum_size) | 833 | diff = min(free_space, diff); |
| 805 | goto insert; | 834 | diff /= csum_size; |
| 835 | diff *= csum_size; | ||
| 806 | 836 | ||
| 807 | btrfs_extend_item(trans, root, path, diff); | 837 | btrfs_extend_item(trans, root, path, diff); |
| 808 | goto csum; | 838 | goto csum; |
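A worked example of the new extension math, under assumed values: crc32c checksums (csum_size = 4), 4K sectors (s_blocksize_bits = 12), csum_offset = 100, 32 contiguous sectors of pending checksums, 200 bytes free in the leaf, and sizeof(struct btrfs_item) = 25:

    tmp        = btrfs_sector_sum_left(...) >> 12           = 32
    extend_nr  = max(1, 32)                                 = 32
    diff       = (csum_offset + extend_nr) * csum_size
               = (100 + 32) * 4                             = 528
    diff       = min(528, MAX_CSUM_ITEMS * 4)               = 528 (no cap)
    diff      -= item_size (100 * 4)                        = 128
    free_space = 200 - 25 - 4                               = 171
    diff       = min(171, 128), rounded down to csum_size   = 128

So the item grows by 128 bytes, room for 32 more checksums, where the old code would only ever extend by a single csum_size.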
| @@ -812,19 +842,14 @@ insert: | |||
| 812 | btrfs_release_path(path); | 842 | btrfs_release_path(path); |
| 813 | csum_offset = 0; | 843 | csum_offset = 0; |
| 814 | if (found_next) { | 844 | if (found_next) { |
| 815 | u64 tmp = total_bytes + root->sectorsize; | 845 | u64 tmp; |
| 816 | u64 next_sector = sector_sum->bytenr; | ||
| 817 | struct btrfs_sector_sum *next = sector_sum + 1; | ||
| 818 | 846 | ||
| 819 | while (tmp < sums->len) { | 847 | tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, |
| 820 | if (next_sector + root->sectorsize != next->bytenr) | 848 | root->sectorsize); |
| 821 | break; | ||
| 822 | tmp += root->sectorsize; | ||
| 823 | next_sector = next->bytenr; | ||
| 824 | next++; | ||
| 825 | } | ||
| 826 | tmp = min(tmp, next_offset - file_key.offset); | ||
| 827 | tmp >>= root->fs_info->sb->s_blocksize_bits; | 849 | tmp >>= root->fs_info->sb->s_blocksize_bits; |
| 850 | tmp = min(tmp, (next_offset - file_key.offset) >> | ||
| 851 | root->fs_info->sb->s_blocksize_bits); | ||
| 852 | |||
| 828 | tmp = max((u64)1, tmp); | 853 | tmp = max((u64)1, tmp); |
| 829 | tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); | 854 | tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); |
| 830 | ins_size = csum_size * tmp; | 855 | ins_size = csum_size * tmp; |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4b241fe9d2fe..af1d0605a5c1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
| @@ -30,11 +30,11 @@ | |||
| 30 | #include <linux/statfs.h> | 30 | #include <linux/statfs.h> |
| 31 | #include <linux/compat.h> | 31 | #include <linux/compat.h> |
| 32 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
| 33 | #include <linux/btrfs.h> | ||
| 33 | #include "ctree.h" | 34 | #include "ctree.h" |
| 34 | #include "disk-io.h" | 35 | #include "disk-io.h" |
| 35 | #include "transaction.h" | 36 | #include "transaction.h" |
| 36 | #include "btrfs_inode.h" | 37 | #include "btrfs_inode.h" |
| 37 | #include "ioctl.h" | ||
| 38 | #include "print-tree.h" | 38 | #include "print-tree.h" |
| 39 | #include "tree-log.h" | 39 | #include "tree-log.h" |
| 40 | #include "locking.h" | 40 | #include "locking.h" |
| @@ -374,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) | |||
| 374 | 374 | ||
| 375 | atomic_inc(&fs_info->defrag_running); | 375 | atomic_inc(&fs_info->defrag_running); |
| 376 | while(1) { | 376 | while(1) { |
| 377 | /* Pause the auto defragger. */ | ||
| 378 | if (test_bit(BTRFS_FS_STATE_REMOUNTING, | ||
| 379 | &fs_info->fs_state)) | ||
| 380 | break; | ||
| 381 | |||
| 377 | if (!__need_auto_defrag(fs_info->tree_root)) | 382 | if (!__need_auto_defrag(fs_info->tree_root)) |
| 378 | break; | 383 | break; |
| 379 | 384 | ||
| @@ -505,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, | |||
| 505 | loff_t isize = i_size_read(inode); | 510 | loff_t isize = i_size_read(inode); |
| 506 | 511 | ||
| 507 | start_pos = pos & ~((u64)root->sectorsize - 1); | 512 | start_pos = pos & ~((u64)root->sectorsize - 1); |
| 508 | num_bytes = (write_bytes + pos - start_pos + | 513 | num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); |
| 509 | root->sectorsize - 1) & ~((u64)root->sectorsize - 1); | ||
| 510 | 514 | ||
| 511 | end_of_last_block = start_pos + num_bytes - 1; | 515 | end_of_last_block = start_pos + num_bytes - 1; |
| 512 | err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, | 516 | err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, |
| @@ -1544,7 +1548,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
| 1544 | * although we have opened a file as writable, we have | 1548 | * although we have opened a file as writable, we have |
| 1545 | * to stop this write operation to ensure FS consistency. | 1549 | * to stop this write operation to ensure FS consistency. |
| 1546 | */ | 1550 | */ |
| 1547 | if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 1551 | if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { |
| 1548 | mutex_unlock(&inode->i_mutex); | 1552 | mutex_unlock(&inode->i_mutex); |
| 1549 | err = -EROFS; | 1553 | err = -EROFS; |
| 1550 | goto out; | 1554 | goto out; |
| @@ -1627,7 +1631,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp) | |||
| 1627 | */ | 1631 | */ |
| 1628 | if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, | 1632 | if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, |
| 1629 | &BTRFS_I(inode)->runtime_flags)) { | 1633 | &BTRFS_I(inode)->runtime_flags)) { |
| 1630 | btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); | 1634 | struct btrfs_trans_handle *trans; |
| 1635 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
| 1636 | |||
| 1637 | /* | ||
| 1638 | * We need to block on a committing transaction to keep us from | ||
| 1639 | * throwing an ordered operation onto the list and causing | ||
| 1640 | * something like sync to deadlock trying to flush out this | ||
| 1641 | * inode. | ||
| 1642 | */ | ||
| 1643 | trans = btrfs_start_transaction(root, 0); | ||
| 1644 | if (IS_ERR(trans)) | ||
| 1645 | return PTR_ERR(trans); | ||
| 1646 | btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); | ||
| 1647 | btrfs_end_transaction(trans, root); | ||
| 1631 | if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) | 1648 | if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) |
| 1632 | filemap_flush(inode->i_mapping); | 1649 | filemap_flush(inode->i_mapping); |
| 1633 | } | 1650 | } |
| @@ -1654,16 +1671,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 1654 | struct btrfs_root *root = BTRFS_I(inode)->root; | 1671 | struct btrfs_root *root = BTRFS_I(inode)->root; |
| 1655 | int ret = 0; | 1672 | int ret = 0; |
| 1656 | struct btrfs_trans_handle *trans; | 1673 | struct btrfs_trans_handle *trans; |
| 1674 | bool full_sync = 0; | ||
| 1657 | 1675 | ||
| 1658 | trace_btrfs_sync_file(file, datasync); | 1676 | trace_btrfs_sync_file(file, datasync); |
| 1659 | 1677 | ||
| 1660 | /* | 1678 | /* |
| 1661 | * We write the dirty pages in the range and wait until they complete | 1679 | * We write the dirty pages in the range and wait until they complete |
| 1662 | * out of the ->i_mutex. If so, we can flush the dirty pages by | 1680 | * out of the ->i_mutex. If so, we can flush the dirty pages by |
| 1663 | * multiple tasks in parallel, which improves performance. | 1681 | * multiple tasks in parallel, which improves performance. See |
| 1682 | * btrfs_wait_ordered_range for an explanation of the ASYNC check. | ||
| 1664 | */ | 1683 | */ |
| 1665 | atomic_inc(&BTRFS_I(inode)->sync_writers); | 1684 | atomic_inc(&BTRFS_I(inode)->sync_writers); |
| 1666 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | 1685 | ret = filemap_fdatawrite_range(inode->i_mapping, start, end); |
| 1686 | if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, | ||
| 1687 | &BTRFS_I(inode)->runtime_flags)) | ||
| 1688 | ret = filemap_fdatawrite_range(inode->i_mapping, start, end); | ||
| 1667 | atomic_dec(&BTRFS_I(inode)->sync_writers); | 1689 | atomic_dec(&BTRFS_I(inode)->sync_writers); |
| 1668 | if (ret) | 1690 | if (ret) |
| 1669 | return ret; | 1691 | return ret; |
| @@ -1675,7 +1697,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 1675 | * range being left. | 1697 | * range being left. |
| 1676 | */ | 1698 | */ |
| 1677 | atomic_inc(&root->log_batch); | 1699 | atomic_inc(&root->log_batch); |
| 1678 | btrfs_wait_ordered_range(inode, start, end - start + 1); | 1700 | full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, |
| 1701 | &BTRFS_I(inode)->runtime_flags); | ||
| 1702 | if (full_sync) | ||
| 1703 | btrfs_wait_ordered_range(inode, start, end - start + 1); | ||
| 1679 | atomic_inc(&root->log_batch); | 1704 | atomic_inc(&root->log_batch); |
| 1680 | 1705 | ||
| 1681 | /* | 1706 | /* |
| @@ -1742,13 +1767,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 1742 | 1767 | ||
| 1743 | if (ret != BTRFS_NO_LOG_SYNC) { | 1768 | if (ret != BTRFS_NO_LOG_SYNC) { |
| 1744 | if (ret > 0) { | 1769 | if (ret > 0) { |
| 1770 | /* | ||
| 1771 | * If we didn't already wait for ordered extents we need | ||
| 1772 | * to do that now. | ||
| 1773 | */ | ||
| 1774 | if (!full_sync) | ||
| 1775 | btrfs_wait_ordered_range(inode, start, | ||
| 1776 | end - start + 1); | ||
| 1745 | ret = btrfs_commit_transaction(trans, root); | 1777 | ret = btrfs_commit_transaction(trans, root); |
| 1746 | } else { | 1778 | } else { |
| 1747 | ret = btrfs_sync_log(trans, root); | 1779 | ret = btrfs_sync_log(trans, root); |
| 1748 | if (ret == 0) | 1780 | if (ret == 0) { |
| 1749 | ret = btrfs_end_transaction(trans, root); | 1781 | ret = btrfs_end_transaction(trans, root); |
| 1750 | else | 1782 | } else { |
| 1783 | if (!full_sync) | ||
| 1784 | btrfs_wait_ordered_range(inode, start, | ||
| 1785 | end - | ||
| 1786 | start + 1); | ||
| 1751 | ret = btrfs_commit_transaction(trans, root); | 1787 | ret = btrfs_commit_transaction(trans, root); |
| 1788 | } | ||
| 1752 | } | 1789 | } |
| 1753 | } else { | 1790 | } else { |
| 1754 | ret = btrfs_end_transaction(trans, root); | 1791 | ret = btrfs_end_transaction(trans, root); |
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0be7a8742a43..1f84fc09c1a8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
| @@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) | |||
| 1356 | u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; | 1356 | u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; |
| 1357 | int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); | 1357 | int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); |
| 1358 | 1358 | ||
| 1359 | max_bitmaps = max(max_bitmaps, 1); | ||
| 1360 | |||
| 1359 | BUG_ON(ctl->total_bitmaps > max_bitmaps); | 1361 | BUG_ON(ctl->total_bitmaps > max_bitmaps); |
| 1360 | 1362 | ||
| 1361 | /* | 1363 | /* |
| @@ -1463,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, | |||
| 1463 | } | 1465 | } |
| 1464 | 1466 | ||
| 1465 | static struct btrfs_free_space * | 1467 | static struct btrfs_free_space * |
| 1466 | find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) | 1468 | find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, |
| 1469 | unsigned long align) | ||
| 1467 | { | 1470 | { |
| 1468 | struct btrfs_free_space *entry; | 1471 | struct btrfs_free_space *entry; |
| 1469 | struct rb_node *node; | 1472 | struct rb_node *node; |
| 1473 | u64 ctl_off; | ||
| 1474 | u64 tmp; | ||
| 1475 | u64 align_off; | ||
| 1470 | int ret; | 1476 | int ret; |
| 1471 | 1477 | ||
| 1472 | if (!ctl->free_space_offset.rb_node) | 1478 | if (!ctl->free_space_offset.rb_node) |
| @@ -1481,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) | |||
| 1481 | if (entry->bytes < *bytes) | 1487 | if (entry->bytes < *bytes) |
| 1482 | continue; | 1488 | continue; |
| 1483 | 1489 | ||
| 1490 | /* make sure the space returned is big enough | ||
| 1491 | * to match our requested alignment | ||
| 1492 | */ | ||
| 1493 | if (*bytes >= align) { | ||
| 1494 | ctl_off = entry->offset - ctl->start; | ||
| 1495 | tmp = ctl_off + align - 1; | ||
| 1496 | do_div(tmp, align); | ||
| 1497 | tmp = tmp * align + ctl->start; | ||
| 1498 | align_off = tmp - entry->offset; | ||
| 1499 | } else { | ||
| 1500 | align_off = 0; | ||
| 1501 | tmp = entry->offset; | ||
| 1502 | } | ||
| 1503 | |||
| 1504 | if (entry->bytes < *bytes + align_off) | ||
| 1505 | continue; | ||
| 1506 | |||
| 1484 | if (entry->bitmap) { | 1507 | if (entry->bitmap) { |
| 1485 | ret = search_bitmap(ctl, entry, offset, bytes); | 1508 | ret = search_bitmap(ctl, entry, &tmp, bytes); |
| 1486 | if (!ret) | 1509 | if (!ret) { |
| 1510 | *offset = tmp; | ||
| 1487 | return entry; | 1511 | return entry; |
| 1512 | } | ||
| 1488 | continue; | 1513 | continue; |
| 1489 | } | 1514 | } |
| 1490 | 1515 | ||
| 1491 | *offset = entry->offset; | 1516 | *offset = tmp; |
| 1492 | *bytes = entry->bytes; | 1517 | *bytes = entry->bytes - align_off; |
| 1493 | return entry; | 1518 | return entry; |
| 1494 | } | 1519 | } |
| 1495 | 1520 | ||
| @@ -1636,10 +1661,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, | |||
| 1636 | } | 1661 | } |
| 1637 | 1662 | ||
| 1638 | /* | 1663 | /* |
| 1639 | * some block groups are so tiny they can't be enveloped by a bitmap, so | 1664 | * The original block groups from mkfs can be really small, like 8 |
| 1640 | * don't even bother to create a bitmap for this | 1665 | * megabytes, so don't bother with a bitmap for those entries. However |
| 1666 | * some block groups can be smaller than what a bitmap would cover but | ||
| 1667 | * are still large enough that they could overflow the 32k memory limit, | ||
| 1668 | * so still allow those block groups to have a bitmap | ||
| 1669 | * entry. | ||
| 1641 | */ | 1670 | */ |
| 1642 | if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) | 1671 | if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) |
| 1643 | return false; | 1672 | return false; |
| 1644 | 1673 | ||
| 1645 | return true; | 1674 | return true; |
| @@ -2095,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, | |||
| 2095 | struct btrfs_free_space *entry = NULL; | 2124 | struct btrfs_free_space *entry = NULL; |
| 2096 | u64 bytes_search = bytes + empty_size; | 2125 | u64 bytes_search = bytes + empty_size; |
| 2097 | u64 ret = 0; | 2126 | u64 ret = 0; |
| 2127 | u64 align_gap = 0; | ||
| 2128 | u64 align_gap_len = 0; | ||
| 2098 | 2129 | ||
| 2099 | spin_lock(&ctl->tree_lock); | 2130 | spin_lock(&ctl->tree_lock); |
| 2100 | entry = find_free_space(ctl, &offset, &bytes_search); | 2131 | entry = find_free_space(ctl, &offset, &bytes_search, |
| 2132 | block_group->full_stripe_len); | ||
| 2101 | if (!entry) | 2133 | if (!entry) |
| 2102 | goto out; | 2134 | goto out; |
| 2103 | 2135 | ||
| @@ -2107,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, | |||
| 2107 | if (!entry->bytes) | 2139 | if (!entry->bytes) |
| 2108 | free_bitmap(ctl, entry); | 2140 | free_bitmap(ctl, entry); |
| 2109 | } else { | 2141 | } else { |
| 2142 | |||
| 2110 | unlink_free_space(ctl, entry); | 2143 | unlink_free_space(ctl, entry); |
| 2111 | entry->offset += bytes; | 2144 | align_gap_len = offset - entry->offset; |
| 2112 | entry->bytes -= bytes; | 2145 | align_gap = entry->offset; |
| 2146 | |||
| 2147 | entry->offset = offset + bytes; | ||
| 2148 | WARN_ON(entry->bytes < bytes + align_gap_len); | ||
| 2149 | |||
| 2150 | entry->bytes -= bytes + align_gap_len; | ||
| 2113 | if (!entry->bytes) | 2151 | if (!entry->bytes) |
| 2114 | kmem_cache_free(btrfs_free_space_cachep, entry); | 2152 | kmem_cache_free(btrfs_free_space_cachep, entry); |
| 2115 | else | 2153 | else |
| @@ -2119,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, | |||
| 2119 | out: | 2157 | out: |
| 2120 | spin_unlock(&ctl->tree_lock); | 2158 | spin_unlock(&ctl->tree_lock); |
| 2121 | 2159 | ||
| 2160 | if (align_gap_len) | ||
| 2161 | __btrfs_add_free_space(ctl, align_gap, align_gap_len); | ||
| 2122 | return ret; | 2162 | return ret; |
| 2123 | } | 2163 | } |
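With alignment in play, btrfs_find_space_for_alloc() may now carve its allocation out of the middle of a free-space entry, so the bytes skipped over for alignment are handed back rather than leaked. A worked example with assumed numbers (ctl->start = 0, full_stripe_len = 2048):

    entry covers [1000, 5096), request = 2048 bytes
    aligned offset    = 2048   (first 2048-aligned offset >= 1000)
    align_gap         = 1000   (the entry's old offset)
    align_gap_len     = 1048   (2048 - 1000)
    entry after carve = [4096, 5096)

    the [1000, 2048) gap goes back through __btrfs_add_free_space()
    once tree_lock is dropped.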
| 2124 | 2164 | ||
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 55c07b650378..c226daefd65d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
| @@ -39,12 +39,13 @@ | |||
| 39 | #include <linux/slab.h> | 39 | #include <linux/slab.h> |
| 40 | #include <linux/ratelimit.h> | 40 | #include <linux/ratelimit.h> |
| 41 | #include <linux/mount.h> | 41 | #include <linux/mount.h> |
| 42 | #include <linux/btrfs.h> | ||
| 43 | #include <linux/blkdev.h> | ||
| 42 | #include "compat.h" | 44 | #include "compat.h" |
| 43 | #include "ctree.h" | 45 | #include "ctree.h" |
| 44 | #include "disk-io.h" | 46 | #include "disk-io.h" |
| 45 | #include "transaction.h" | 47 | #include "transaction.h" |
| 46 | #include "btrfs_inode.h" | 48 | #include "btrfs_inode.h" |
| 47 | #include "ioctl.h" | ||
| 48 | #include "print-tree.h" | 49 | #include "print-tree.h" |
| 49 | #include "ordered-data.h" | 50 | #include "ordered-data.h" |
| 50 | #include "xattr.h" | 51 | #include "xattr.h" |
| @@ -54,6 +55,7 @@ | |||
| 54 | #include "locking.h" | 55 | #include "locking.h" |
| 55 | #include "free-space-cache.h" | 56 | #include "free-space-cache.h" |
| 56 | #include "inode-map.h" | 57 | #include "inode-map.h" |
| 58 | #include "backref.h" | ||
| 57 | 59 | ||
| 58 | struct btrfs_iget_args { | 60 | struct btrfs_iget_args { |
| 59 | u64 ino; | 61 | u64 ino; |
| @@ -231,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, | |||
| 231 | u64 isize = i_size_read(inode); | 233 | u64 isize = i_size_read(inode); |
| 232 | u64 actual_end = min(end + 1, isize); | 234 | u64 actual_end = min(end + 1, isize); |
| 233 | u64 inline_len = actual_end - start; | 235 | u64 inline_len = actual_end - start; |
| 234 | u64 aligned_end = (end + root->sectorsize - 1) & | 236 | u64 aligned_end = ALIGN(end, root->sectorsize); |
| 235 | ~((u64)root->sectorsize - 1); | ||
| 236 | u64 data_len = inline_len; | 237 | u64 data_len = inline_len; |
| 237 | int ret; | 238 | int ret; |
| 238 | 239 | ||
| @@ -265,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, | |||
| 265 | return 1; | 266 | return 1; |
| 266 | } | 267 | } |
| 267 | 268 | ||
| 269 | set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); | ||
| 268 | btrfs_delalloc_release_metadata(inode, end + 1 - start); | 270 | btrfs_delalloc_release_metadata(inode, end + 1 - start); |
| 269 | btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); | 271 | btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); |
| 270 | return 0; | 272 | return 0; |
| @@ -389,7 +391,7 @@ again: | |||
| 389 | * a compressed extent to 128k. | 391 | * a compressed extent to 128k. |
| 390 | */ | 392 | */ |
| 391 | total_compressed = min(total_compressed, max_uncompressed); | 393 | total_compressed = min(total_compressed, max_uncompressed); |
| 392 | num_bytes = (end - start + blocksize) & ~(blocksize - 1); | 394 | num_bytes = ALIGN(end - start + 1, blocksize); |
| 393 | num_bytes = max(blocksize, num_bytes); | 395 | num_bytes = max(blocksize, num_bytes); |
| 394 | total_in = 0; | 396 | total_in = 0; |
| 395 | ret = 0; | 397 | ret = 0; |
| @@ -488,15 +490,13 @@ cont: | |||
| 488 | * up to a block size boundary so the allocator does sane | 490 | * up to a block size boundary so the allocator does sane |
| 489 | * things | 491 | * things |
| 490 | */ | 492 | */ |
| 491 | total_compressed = (total_compressed + blocksize - 1) & | 493 | total_compressed = ALIGN(total_compressed, blocksize); |
| 492 | ~(blocksize - 1); | ||
| 493 | 494 | ||
| 494 | /* | 495 | /* |
| 495 | * one last check to make sure the compression is really a | 496 | * one last check to make sure the compression is really a |
| 496 | * win, compare the page count read with the blocks on disk | 497 | * win, compare the page count read with the blocks on disk |
| 497 | */ | 498 | */ |
| 498 | total_in = (total_in + PAGE_CACHE_SIZE - 1) & | 499 | total_in = ALIGN(total_in, PAGE_CACHE_SIZE); |
| 499 | ~(PAGE_CACHE_SIZE - 1); | ||
| 500 | if (total_compressed >= total_in) { | 500 | if (total_compressed >= total_in) { |
| 501 | will_compress = 0; | 501 | will_compress = 0; |
| 502 | } else { | 502 | } else { |
| @@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode, | |||
| 608 | if (list_empty(&async_cow->extents)) | 608 | if (list_empty(&async_cow->extents)) |
| 609 | return 0; | 609 | return 0; |
| 610 | 610 | ||
| 611 | 611 | again: | |
| 612 | while (!list_empty(&async_cow->extents)) { | 612 | while (!list_empty(&async_cow->extents)) { |
| 613 | async_extent = list_entry(async_cow->extents.next, | 613 | async_extent = list_entry(async_cow->extents.next, |
| 614 | struct async_extent, list); | 614 | struct async_extent, list); |
| @@ -648,6 +648,8 @@ retry: | |||
| 648 | async_extent->ram_size - 1, | 648 | async_extent->ram_size - 1, |
| 649 | btrfs_get_extent, | 649 | btrfs_get_extent, |
| 650 | WB_SYNC_ALL); | 650 | WB_SYNC_ALL); |
| 651 | else if (ret) | ||
| 652 | unlock_page(async_cow->locked_page); | ||
| 651 | kfree(async_extent); | 653 | kfree(async_extent); |
| 652 | cond_resched(); | 654 | cond_resched(); |
| 653 | continue; | 655 | continue; |
| @@ -672,6 +674,7 @@ retry: | |||
| 672 | 674 | ||
| 673 | if (ret) { | 675 | if (ret) { |
| 674 | int i; | 676 | int i; |
| 677 | |||
| 675 | for (i = 0; i < async_extent->nr_pages; i++) { | 678 | for (i = 0; i < async_extent->nr_pages; i++) { |
| 676 | WARN_ON(async_extent->pages[i]->mapping); | 679 | WARN_ON(async_extent->pages[i]->mapping); |
| 677 | page_cache_release(async_extent->pages[i]); | 680 | page_cache_release(async_extent->pages[i]); |
| @@ -679,12 +682,10 @@ retry: | |||
| 679 | kfree(async_extent->pages); | 682 | kfree(async_extent->pages); |
| 680 | async_extent->nr_pages = 0; | 683 | async_extent->nr_pages = 0; |
| 681 | async_extent->pages = NULL; | 684 | async_extent->pages = NULL; |
| 682 | unlock_extent(io_tree, async_extent->start, | 685 | |
| 683 | async_extent->start + | ||
| 684 | async_extent->ram_size - 1); | ||
| 685 | if (ret == -ENOSPC) | 686 | if (ret == -ENOSPC) |
| 686 | goto retry; | 687 | goto retry; |
| 687 | goto out_free; /* JDM: Requeue? */ | 688 | goto out_free; |
| 688 | } | 689 | } |
| 689 | 690 | ||
| 690 | /* | 691 | /* |
| @@ -696,10 +697,13 @@ retry: | |||
| 696 | async_extent->ram_size - 1, 0); | 697 | async_extent->ram_size - 1, 0); |
| 697 | 698 | ||
| 698 | em = alloc_extent_map(); | 699 | em = alloc_extent_map(); |
| 699 | BUG_ON(!em); /* -ENOMEM */ | 700 | if (!em) |
| 701 | goto out_free_reserve; | ||
| 700 | em->start = async_extent->start; | 702 | em->start = async_extent->start; |
| 701 | em->len = async_extent->ram_size; | 703 | em->len = async_extent->ram_size; |
| 702 | em->orig_start = em->start; | 704 | em->orig_start = em->start; |
| 705 | em->mod_start = em->start; | ||
| 706 | em->mod_len = em->len; | ||
| 703 | 707 | ||
| 704 | em->block_start = ins.objectid; | 708 | em->block_start = ins.objectid; |
| 705 | em->block_len = ins.offset; | 709 | em->block_len = ins.offset; |
| @@ -726,6 +730,9 @@ retry: | |||
| 726 | async_extent->ram_size - 1, 0); | 730 | async_extent->ram_size - 1, 0); |
| 727 | } | 731 | } |
| 728 | 732 | ||
| 733 | if (ret) | ||
| 734 | goto out_free_reserve; | ||
| 735 | |||
| 729 | ret = btrfs_add_ordered_extent_compress(inode, | 736 | ret = btrfs_add_ordered_extent_compress(inode, |
| 730 | async_extent->start, | 737 | async_extent->start, |
| 731 | ins.objectid, | 738 | ins.objectid, |
| @@ -733,7 +740,8 @@ retry: | |||
| 733 | ins.offset, | 740 | ins.offset, |
| 734 | BTRFS_ORDERED_COMPRESSED, | 741 | BTRFS_ORDERED_COMPRESSED, |
| 735 | async_extent->compress_type); | 742 | async_extent->compress_type); |
| 736 | BUG_ON(ret); /* -ENOMEM */ | 743 | if (ret) |
| 744 | goto out_free_reserve; | ||
| 737 | 745 | ||
| 738 | /* | 746 | /* |
| 739 | * clear dirty, set writeback and unlock the pages. | 747 | * clear dirty, set writeback and unlock the pages. |
| @@ -754,18 +762,30 @@ retry: | |||
| 754 | ins.objectid, | 762 | ins.objectid, |
| 755 | ins.offset, async_extent->pages, | 763 | ins.offset, async_extent->pages, |
| 756 | async_extent->nr_pages); | 764 | async_extent->nr_pages); |
| 757 | |||
| 758 | BUG_ON(ret); /* -ENOMEM */ | ||
| 759 | alloc_hint = ins.objectid + ins.offset; | 765 | alloc_hint = ins.objectid + ins.offset; |
| 760 | kfree(async_extent); | 766 | kfree(async_extent); |
| 767 | if (ret) | ||
| 768 | goto out; | ||
| 761 | cond_resched(); | 769 | cond_resched(); |
| 762 | } | 770 | } |
| 763 | ret = 0; | 771 | ret = 0; |
| 764 | out: | 772 | out: |
| 765 | return ret; | 773 | return ret; |
| 774 | out_free_reserve: | ||
| 775 | btrfs_free_reserved_extent(root, ins.objectid, ins.offset); | ||
| 766 | out_free: | 776 | out_free: |
| 777 | extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, | ||
| 778 | async_extent->start, | ||
| 779 | async_extent->start + | ||
| 780 | async_extent->ram_size - 1, | ||
| 781 | NULL, EXTENT_CLEAR_UNLOCK_PAGE | | ||
| 782 | EXTENT_CLEAR_UNLOCK | | ||
| 783 | EXTENT_CLEAR_DELALLOC | | ||
| 784 | EXTENT_CLEAR_DIRTY | | ||
| 785 | EXTENT_SET_WRITEBACK | | ||
| 786 | EXTENT_END_WRITEBACK); | ||
| 767 | kfree(async_extent); | 787 | kfree(async_extent); |
| 768 | goto out; | 788 | goto again; |
| 769 | } | 789 | } |
| 770 | 790 | ||
| 771 | static u64 get_extent_allocation_hint(struct inode *inode, u64 start, | 791 | static u64 get_extent_allocation_hint(struct inode *inode, u64 start, |
| @@ -834,7 +854,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, | |||
| 834 | 854 | ||
| 835 | BUG_ON(btrfs_is_free_space_inode(inode)); | 855 | BUG_ON(btrfs_is_free_space_inode(inode)); |
| 836 | 856 | ||
| 837 | num_bytes = (end - start + blocksize) & ~(blocksize - 1); | 857 | num_bytes = ALIGN(end - start + 1, blocksize); |
| 838 | num_bytes = max(blocksize, num_bytes); | 858 | num_bytes = max(blocksize, num_bytes); |
| 839 | disk_num_bytes = num_bytes; | 859 | disk_num_bytes = num_bytes; |
| 840 | 860 | ||
| @@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, | |||
| 892 | em->orig_start = em->start; | 912 | em->orig_start = em->start; |
| 893 | ram_size = ins.offset; | 913 | ram_size = ins.offset; |
| 894 | em->len = ins.offset; | 914 | em->len = ins.offset; |
| 915 | em->mod_start = em->start; | ||
| 916 | em->mod_len = em->len; | ||
| 895 | 917 | ||
| 896 | em->block_start = ins.objectid; | 918 | em->block_start = ins.objectid; |
| 897 | em->block_len = ins.offset; | 919 | em->block_len = ins.offset; |
| @@ -1338,6 +1360,8 @@ out_check: | |||
| 1338 | em->block_start = disk_bytenr; | 1360 | em->block_start = disk_bytenr; |
| 1339 | em->orig_block_len = disk_num_bytes; | 1361 | em->orig_block_len = disk_num_bytes; |
| 1340 | em->bdev = root->fs_info->fs_devices->latest_bdev; | 1362 | em->bdev = root->fs_info->fs_devices->latest_bdev; |
| 1363 | em->mod_start = em->start; | ||
| 1364 | em->mod_len = em->len; | ||
| 1341 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | 1365 | set_bit(EXTENT_FLAG_PINNED, &em->flags); |
| 1342 | set_bit(EXTENT_FLAG_FILLING, &em->flags); | 1366 | set_bit(EXTENT_FLAG_FILLING, &em->flags); |
| 1343 | em->generation = -1; | 1367 | em->generation = -1; |
| @@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode, | |||
| 1508 | spin_unlock(&BTRFS_I(inode)->lock); | 1532 | spin_unlock(&BTRFS_I(inode)->lock); |
| 1509 | } | 1533 | } |
| 1510 | 1534 | ||
| 1511 | spin_lock(&root->fs_info->delalloc_lock); | 1535 | __percpu_counter_add(&root->fs_info->delalloc_bytes, len, |
| 1536 | root->fs_info->delalloc_batch); | ||
| 1537 | spin_lock(&BTRFS_I(inode)->lock); | ||
| 1512 | BTRFS_I(inode)->delalloc_bytes += len; | 1538 | BTRFS_I(inode)->delalloc_bytes += len; |
| 1513 | root->fs_info->delalloc_bytes += len; | 1539 | if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, |
| 1514 | if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { | 1540 | &BTRFS_I(inode)->runtime_flags)) { |
| 1515 | list_add_tail(&BTRFS_I(inode)->delalloc_inodes, | 1541 | spin_lock(&root->fs_info->delalloc_lock); |
| 1516 | &root->fs_info->delalloc_inodes); | 1542 | if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { |
| 1543 | list_add_tail(&BTRFS_I(inode)->delalloc_inodes, | ||
| 1544 | &root->fs_info->delalloc_inodes); | ||
| 1545 | set_bit(BTRFS_INODE_IN_DELALLOC_LIST, | ||
| 1546 | &BTRFS_I(inode)->runtime_flags); | ||
| 1547 | } | ||
| 1548 | spin_unlock(&root->fs_info->delalloc_lock); | ||
| 1517 | } | 1549 | } |
| 1518 | spin_unlock(&root->fs_info->delalloc_lock); | 1550 | spin_unlock(&BTRFS_I(inode)->lock); |
| 1519 | } | 1551 | } |
| 1520 | } | 1552 | } |
| 1521 | 1553 | ||
| @@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode, | |||
| 1550 | && do_list) | 1582 | && do_list) |
| 1551 | btrfs_free_reserved_data_space(inode, len); | 1583 | btrfs_free_reserved_data_space(inode, len); |
| 1552 | 1584 | ||
| 1553 | spin_lock(&root->fs_info->delalloc_lock); | 1585 | __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, |
| 1554 | root->fs_info->delalloc_bytes -= len; | 1586 | root->fs_info->delalloc_batch); |
| 1587 | spin_lock(&BTRFS_I(inode)->lock); | ||
| 1555 | BTRFS_I(inode)->delalloc_bytes -= len; | 1588 | BTRFS_I(inode)->delalloc_bytes -= len; |
| 1556 | |||
| 1557 | if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && | 1589 | if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && |
| 1558 | !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { | 1590 | test_bit(BTRFS_INODE_IN_DELALLOC_LIST, |
| 1559 | list_del_init(&BTRFS_I(inode)->delalloc_inodes); | 1591 | &BTRFS_I(inode)->runtime_flags)) { |
| 1592 | spin_lock(&root->fs_info->delalloc_lock); | ||
| 1593 | if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { | ||
| 1594 | list_del_init(&BTRFS_I(inode)->delalloc_inodes); | ||
| 1595 | clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, | ||
| 1596 | &BTRFS_I(inode)->runtime_flags); | ||
| 1597 | } | ||
| 1598 | spin_unlock(&root->fs_info->delalloc_lock); | ||
| 1560 | } | 1599 | } |
| 1561 | spin_unlock(&root->fs_info->delalloc_lock); | 1600 | spin_unlock(&BTRFS_I(inode)->lock); |
| 1562 | } | 1601 | } |
| 1563 | } | 1602 | } |
| 1564 | 1603 | ||
| @@ -1566,7 +1605,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, | |||
| 1566 | * extent_io.c merge_bio_hook, this must check the chunk tree to make sure | 1605 | * extent_io.c merge_bio_hook, this must check the chunk tree to make sure |
| 1567 | * we don't create bios that span stripes or chunks | 1606 | * we don't create bios that span stripes or chunks |
| 1568 | */ | 1607 | */ |
| 1569 | int btrfs_merge_bio_hook(struct page *page, unsigned long offset, | 1608 | int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, |
| 1570 | size_t size, struct bio *bio, | 1609 | size_t size, struct bio *bio, |
| 1571 | unsigned long bio_flags) | 1610 | unsigned long bio_flags) |
| 1572 | { | 1611 | { |
| @@ -1581,7 +1620,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, | |||
| 1581 | 1620 | ||
| 1582 | length = bio->bi_size; | 1621 | length = bio->bi_size; |
| 1583 | map_length = length; | 1622 | map_length = length; |
| 1584 | ret = btrfs_map_block(root->fs_info, READ, logical, | 1623 | ret = btrfs_map_block(root->fs_info, rw, logical, |
| 1585 | &map_length, NULL, 0); | 1624 | &map_length, NULL, 0); |
| 1586 | /* Will always return 0 with map_multi == NULL */ | 1625 | /* Will always return 0 with map_multi == NULL */ |
| 1587 | BUG_ON(ret < 0); | 1626 | BUG_ON(ret < 0); |
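merge_bio_hook now forwards the caller's rw flags instead of hard-coding READ: with RAID5/6 the length that can be mapped contiguously at a logical address may differ between reads and writes (writes can be bounded by full-stripe geometry), so the merge decision has to ask for the right direction. A hedged sketch of the check the hook performs (field and function names per this kernel series):

    /*
     * Sketch: would growing this bio by 'size' bytes cross a
     * stripe/chunk boundary for the given direction?  Returns
     * nonzero if the new page must go into a separate bio.
     */
    static int bio_would_span(struct btrfs_fs_info *fs_info, int rw,
                              u64 logical, struct bio *bio, size_t size)
    {
            u64 length = bio->bi_size;
            u64 map_length = length;

            if (btrfs_map_block(fs_info, rw, logical, &map_length,
                                NULL, 0) < 0)
                    return 1;       /* be conservative on error */
            return map_length < length + size;
    }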
| @@ -1892,6 +1931,640 @@ out: | |||
| 1892 | return ret; | 1931 | return ret; |
| 1893 | } | 1932 | } |
| 1894 | 1933 | ||
| 1934 | /* snapshot-aware defrag */ | ||
| 1935 | struct sa_defrag_extent_backref { | ||
| 1936 | struct rb_node node; | ||
| 1937 | struct old_sa_defrag_extent *old; | ||
| 1938 | u64 root_id; | ||
| 1939 | u64 inum; | ||
| 1940 | u64 file_pos; | ||
| 1941 | u64 extent_offset; | ||
| 1942 | u64 num_bytes; | ||
| 1943 | u64 generation; | ||
| 1944 | }; | ||
| 1945 | |||
| 1946 | struct old_sa_defrag_extent { | ||
| 1947 | struct list_head list; | ||
| 1948 | struct new_sa_defrag_extent *new; | ||
| 1949 | |||
| 1950 | u64 extent_offset; | ||
| 1951 | u64 bytenr; | ||
| 1952 | u64 offset; | ||
| 1953 | u64 len; | ||
| 1954 | int count; | ||
| 1955 | }; | ||
| 1956 | |||
| 1957 | struct new_sa_defrag_extent { | ||
| 1958 | struct rb_root root; | ||
| 1959 | struct list_head head; | ||
| 1960 | struct btrfs_path *path; | ||
| 1961 | struct inode *inode; | ||
| 1962 | u64 file_pos; | ||
| 1963 | u64 len; | ||
| 1964 | u64 bytenr; | ||
| 1965 | u64 disk_len; | ||
| 1966 | u8 compress_type; | ||
| 1967 | }; | ||
| 1968 | |||
| 1969 | static int backref_comp(struct sa_defrag_extent_backref *b1, | ||
| 1970 | struct sa_defrag_extent_backref *b2) | ||
| 1971 | { | ||
| 1972 | if (b1->root_id < b2->root_id) | ||
| 1973 | return -1; | ||
| 1974 | else if (b1->root_id > b2->root_id) | ||
| 1975 | return 1; | ||
| 1976 | |||
| 1977 | if (b1->inum < b2->inum) | ||
| 1978 | return -1; | ||
| 1979 | else if (b1->inum > b2->inum) | ||
| 1980 | return 1; | ||
| 1981 | |||
| 1982 | if (b1->file_pos < b2->file_pos) | ||
| 1983 | return -1; | ||
| 1984 | else if (b1->file_pos > b2->file_pos) | ||
| 1985 | return 1; | ||
| 1986 | |||
| 1987 | /* | ||
| 1988 | * [------------------------------] ===> (a range of space) | ||
| 1989 | * |<--->| |<---->| =============> (fs/file tree A) | ||
| 1990 | * |<---------------------------->| ===> (fs/file tree B) | ||
| 1991 | * | ||
| 1992 | * A range of space can refer to two file extents in one tree while | ||
| 1993 | * refer to only one file extent in another tree. | ||
| 1994 | * | ||
| 1995 | * So we may process a disk offset more than once (two extents in A) | ||
| 1996 | * and land on the same extent (one extent in B), then insert two | ||
| 1997 | * identical backrefs (both referring to the extent in B). | ||
| 1998 | */ | ||
| 1999 | return 0; | ||
| 2000 | } | ||
| 2001 | |||
| 2002 | static void backref_insert(struct rb_root *root, | ||
| 2003 | struct sa_defrag_extent_backref *backref) | ||
| 2004 | { | ||
| 2005 | struct rb_node **p = &root->rb_node; | ||
| 2006 | struct rb_node *parent = NULL; | ||
| 2007 | struct sa_defrag_extent_backref *entry; | ||
| 2008 | int ret; | ||
| 2009 | |||
| 2010 | while (*p) { | ||
| 2011 | parent = *p; | ||
| 2012 | entry = rb_entry(parent, struct sa_defrag_extent_backref, node); | ||
| 2013 | |||
| 2014 | ret = backref_comp(backref, entry); | ||
| 2015 | if (ret < 0) | ||
| 2016 | p = &(*p)->rb_left; | ||
| 2017 | else | ||
| 2018 | p = &(*p)->rb_right; | ||
| 2019 | } | ||
| 2020 | |||
| 2021 | rb_link_node(&backref->node, parent, p); | ||
| 2022 | rb_insert_color(&backref->node, root); | ||
| 2023 | } | ||
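backref_comp orders backrefs by (root_id, inum, file_pos) only, and the diagram above explains why true duplicates can occur. backref_insert tolerates them: an equal key compares as "not less", so the walk descends right and the duplicate is linked as a new leaf next to its twin. A generic sketch of this duplicates-allowed rbtree insert (standard <linux/rbtree.h> usage):

    #include <linux/rbtree.h>

    struct item {
            struct rb_node node;
            u64 key;
    };

    /* equal keys always go right, so duplicates end up adjacent in
     * an in-order traversal rather than being rejected */
    static void item_insert(struct rb_root *root, struct item *ins)
    {
            struct rb_node **p = &root->rb_node;
            struct rb_node *parent = NULL;

            while (*p) {
                    struct item *cur;

                    parent = *p;
                    cur = rb_entry(parent, struct item, node);
                    if (ins->key < cur->key)
                            p = &(*p)->rb_left;
                    else
                            p = &(*p)->rb_right;
            }
            rb_link_node(&ins->node, parent, p);
            rb_insert_color(&ins->node, root);
    }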
| 2024 | |||
| 2025 | /* | ||
| 2026 | * Note the backref might have changed, in which case we just return 0. | ||
| 2027 | */ | ||
| 2028 | static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, | ||
| 2029 | void *ctx) | ||
| 2030 | { | ||
| 2031 | struct btrfs_file_extent_item *extent; | ||
| 2032 | struct btrfs_fs_info *fs_info; | ||
| 2033 | struct old_sa_defrag_extent *old = ctx; | ||
| 2034 | struct new_sa_defrag_extent *new = old->new; | ||
| 2035 | struct btrfs_path *path = new->path; | ||
| 2036 | struct btrfs_key key; | ||
| 2037 | struct btrfs_root *root; | ||
| 2038 | struct sa_defrag_extent_backref *backref; | ||
| 2039 | struct extent_buffer *leaf; | ||
| 2040 | struct inode *inode = new->inode; | ||
| 2041 | int slot; | ||
| 2042 | int ret; | ||
| 2043 | u64 extent_offset; | ||
| 2044 | u64 num_bytes; | ||
| 2045 | |||
| 2046 | if (BTRFS_I(inode)->root->root_key.objectid == root_id && | ||
| 2047 | inum == btrfs_ino(inode)) | ||
| 2048 | return 0; | ||
| 2049 | |||
| 2050 | key.objectid = root_id; | ||
| 2051 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
| 2052 | key.offset = (u64)-1; | ||
| 2053 | |||
| 2054 | fs_info = BTRFS_I(inode)->root->fs_info; | ||
| 2055 | root = btrfs_read_fs_root_no_name(fs_info, &key); | ||
| 2056 | if (IS_ERR(root)) { | ||
| 2057 | if (PTR_ERR(root) == -ENOENT) | ||
| 2058 | return 0; | ||
| 2059 | WARN_ON(1); | ||
| 2060 | pr_debug("inum=%llu, offset=%llu, root_id=%llu\n", | ||
| 2061 | inum, offset, root_id); | ||
| 2062 | return PTR_ERR(root); | ||
| 2063 | } | ||
| 2064 | |||
| 2065 | key.objectid = inum; | ||
| 2066 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
| 2067 | if (offset > (u64)-1 << 32) | ||
| 2068 | key.offset = 0; | ||
| 2069 | else | ||
| 2070 | key.offset = offset; | ||
| 2071 | |||
| 2072 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
| 2073 | if (ret < 0) { | ||
| 2074 | WARN_ON(1); | ||
| 2075 | return ret; | ||
| 2076 | } | ||
| 2077 | |||
| 2078 | while (1) { | ||
| 2079 | cond_resched(); | ||
| 2080 | |||
| 2081 | leaf = path->nodes[0]; | ||
| 2082 | slot = path->slots[0]; | ||
| 2083 | |||
| 2084 | if (slot >= btrfs_header_nritems(leaf)) { | ||
| 2085 | ret = btrfs_next_leaf(root, path); | ||
| 2086 | if (ret < 0) { | ||
| 2087 | goto out; | ||
| 2088 | } else if (ret > 0) { | ||
| 2089 | ret = 0; | ||
| 2090 | goto out; | ||
| 2091 | } | ||
| 2092 | continue; | ||
| 2093 | } | ||
| 2094 | |||
| 2095 | path->slots[0]++; | ||
| 2096 | |||
| 2097 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
| 2098 | |||
| 2099 | if (key.objectid > inum) | ||
| 2100 | goto out; | ||
| 2101 | |||
| 2102 | if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) | ||
| 2103 | continue; | ||
| 2104 | |||
| 2105 | extent = btrfs_item_ptr(leaf, slot, | ||
| 2106 | struct btrfs_file_extent_item); | ||
| 2107 | |||
| 2108 | if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) | ||
| 2109 | continue; | ||
| 2110 | |||
| 2111 | extent_offset = btrfs_file_extent_offset(leaf, extent); | ||
| 2112 | if (key.offset - extent_offset != offset) | ||
| 2113 | continue; | ||
| 2114 | |||
| 2115 | num_bytes = btrfs_file_extent_num_bytes(leaf, extent); | ||
| 2116 | if (extent_offset >= old->extent_offset + old->offset + | ||
| 2117 | old->len || extent_offset + num_bytes <= | ||
| 2118 | old->extent_offset + old->offset) | ||
| 2119 | continue; | ||
| 2120 | |||
| 2121 | break; | ||
| 2122 | } | ||
| 2123 | |||
| 2124 | backref = kmalloc(sizeof(*backref), GFP_NOFS); | ||
| 2125 | if (!backref) { | ||
| 2126 | ret = -ENOMEM; | ||
| 2127 | goto out; | ||
| 2128 | } | ||
| 2129 | |||
| 2130 | backref->root_id = root_id; | ||
| 2131 | backref->inum = inum; | ||
| 2132 | backref->file_pos = offset + extent_offset; | ||
| 2133 | backref->num_bytes = num_bytes; | ||
| 2134 | backref->extent_offset = extent_offset; | ||
| 2135 | backref->generation = btrfs_file_extent_generation(leaf, extent); | ||
| 2136 | backref->old = old; | ||
| 2137 | backref_insert(&new->root, backref); | ||
| 2138 | old->count++; | ||
| 2139 | out: | ||
| 2140 | btrfs_release_path(path); | ||
| 2141 | WARN_ON(ret); | ||
| 2142 | return ret; | ||
| 2143 | } | ||
| 2144 | |||
| 2145 | static noinline bool record_extent_backrefs(struct btrfs_path *path, | ||
| 2146 | struct new_sa_defrag_extent *new) | ||
| 2147 | { | ||
| 2148 | struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; | ||
| 2149 | struct old_sa_defrag_extent *old, *tmp; | ||
| 2150 | int ret; | ||
| 2151 | |||
| 2152 | new->path = path; | ||
| 2153 | |||
| 2154 | list_for_each_entry_safe(old, tmp, &new->head, list) { | ||
| 2155 | ret = iterate_inodes_from_logical(old->bytenr, fs_info, | ||
| 2156 | path, record_one_backref, | ||
| 2157 | old); | ||
| 2158 | BUG_ON(ret < 0 && ret != -ENOENT); | ||
| 2159 | |||
| 2160 | /* no backref to be processed for this extent */ | ||
| 2161 | if (!old->count) { | ||
| 2162 | list_del(&old->list); | ||
| 2163 | kfree(old); | ||
| 2164 | } | ||
| 2165 | } | ||
| 2166 | |||
| 2167 | if (list_empty(&new->head)) | ||
| 2168 | return false; | ||
| 2169 | |||
| 2170 | return true; | ||
| 2171 | } | ||
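record_extent_backrefs resolves who references each old extent by handing record_one_backref to iterate_inodes_from_logical, which walks every (inode, offset, root) triple referring to the given disk bytenr and invokes the callback once per triple. The contract, per backref.h of this series, is roughly:

    /* return 0 to continue the walk, nonzero to abort it */
    typedef int (*iterate_extent_inodes_t)(u64 inum, u64 offset,
                                           u64 root, void *ctx);

    /* sketch of the per-old-extent call made above; 'old' doubles
     * as the ctx cookie so the callback can find its extent */
    static int record_backrefs_for(struct old_sa_defrag_extent *old,
                                   struct btrfs_fs_info *fs_info,
                                   struct btrfs_path *path)
    {
            return iterate_inodes_from_logical(old->bytenr, fs_info,
                                               path, record_one_backref,
                                               old);
    }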
| 2172 | |||
| 2173 | static int relink_is_mergable(struct extent_buffer *leaf, | ||
| 2174 | struct btrfs_file_extent_item *fi, | ||
| 2175 | u64 disk_bytenr) | ||
| 2176 | { | ||
| 2177 | if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr) | ||
| 2178 | return 0; | ||
| 2179 | |||
| 2180 | if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) | ||
| 2181 | return 0; | ||
| 2182 | |||
| 2183 | if (btrfs_file_extent_compression(leaf, fi) || | ||
| 2184 | btrfs_file_extent_encryption(leaf, fi) || | ||
| 2185 | btrfs_file_extent_other_encoding(leaf, fi)) | ||
| 2186 | return 0; | ||
| 2187 | |||
| 2188 | return 1; | ||
| 2189 | } | ||
| 2190 | |||
| 2191 | /* | ||
| 2192 | * Note the backref might have changed, in which case we just return 0. | ||
| 2193 | */ | ||
| 2194 | static noinline int relink_extent_backref(struct btrfs_path *path, | ||
| 2195 | struct sa_defrag_extent_backref *prev, | ||
| 2196 | struct sa_defrag_extent_backref *backref) | ||
| 2197 | { | ||
| 2198 | struct btrfs_file_extent_item *extent; | ||
| 2199 | struct btrfs_file_extent_item *item; | ||
| 2200 | struct btrfs_ordered_extent *ordered; | ||
| 2201 | struct btrfs_trans_handle *trans; | ||
| 2202 | struct btrfs_fs_info *fs_info; | ||
| 2203 | struct btrfs_root *root; | ||
| 2204 | struct btrfs_key key; | ||
| 2205 | struct extent_buffer *leaf; | ||
| 2206 | struct old_sa_defrag_extent *old = backref->old; | ||
| 2207 | struct new_sa_defrag_extent *new = old->new; | ||
| 2208 | struct inode *src_inode = new->inode; | ||
| 2209 | struct inode *inode; | ||
| 2210 | struct extent_state *cached = NULL; | ||
| 2211 | int ret = 0; | ||
| 2212 | u64 start; | ||
| 2213 | u64 len; | ||
| 2214 | u64 lock_start; | ||
| 2215 | u64 lock_end; | ||
| 2216 | bool merge = false; | ||
| 2217 | int index; | ||
| 2218 | |||
| 2219 | if (prev && prev->root_id == backref->root_id && | ||
| 2220 | prev->inum == backref->inum && | ||
| 2221 | prev->file_pos + prev->num_bytes == backref->file_pos) | ||
| 2222 | merge = true; | ||
| 2223 | |||
| 2224 | /* step 1: get root */ | ||
| 2225 | key.objectid = backref->root_id; | ||
| 2226 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
| 2227 | key.offset = (u64)-1; | ||
| 2228 | |||
| 2229 | fs_info = BTRFS_I(src_inode)->root->fs_info; | ||
| 2230 | index = srcu_read_lock(&fs_info->subvol_srcu); | ||
| 2231 | |||
| 2232 | root = btrfs_read_fs_root_no_name(fs_info, &key); | ||
| 2233 | if (IS_ERR(root)) { | ||
| 2234 | srcu_read_unlock(&fs_info->subvol_srcu, index); | ||
| 2235 | if (PTR_ERR(root) == -ENOENT) | ||
| 2236 | return 0; | ||
| 2237 | return PTR_ERR(root); | ||
| 2238 | } | ||
| 2239 | if (btrfs_root_refs(&root->root_item) == 0) { | ||
| 2240 | srcu_read_unlock(&fs_info->subvol_srcu, index); | ||
| 2241 | /* treat ENOENT as 0 */ | ||
| 2242 | return 0; | ||
| 2243 | } | ||
| 2244 | |||
| 2245 | /* step 2: get inode */ | ||
| 2246 | key.objectid = backref->inum; | ||
| 2247 | key.type = BTRFS_INODE_ITEM_KEY; | ||
| 2248 | key.offset = 0; | ||
| 2249 | |||
| 2250 | inode = btrfs_iget(fs_info->sb, &key, root, NULL); | ||
| 2251 | if (IS_ERR(inode)) { | ||
| 2252 | srcu_read_unlock(&fs_info->subvol_srcu, index); | ||
| 2253 | return 0; | ||
| 2254 | } | ||
| 2255 | |||
| 2256 | srcu_read_unlock(&fs_info->subvol_srcu, index); | ||
| 2257 | |||
| 2258 | /* step 3: relink backref */ | ||
| 2259 | lock_start = backref->file_pos; | ||
| 2260 | lock_end = backref->file_pos + backref->num_bytes - 1; | ||
| 2261 | lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, | ||
| 2262 | 0, &cached); | ||
| 2263 | |||
| 2264 | ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); | ||
| 2265 | if (ordered) { | ||
| 2266 | btrfs_put_ordered_extent(ordered); | ||
| 2267 | goto out_unlock; | ||
| 2268 | } | ||
| 2269 | |||
| 2270 | trans = btrfs_join_transaction(root); | ||
| 2271 | if (IS_ERR(trans)) { | ||
| 2272 | ret = PTR_ERR(trans); | ||
| 2273 | goto out_unlock; | ||
| 2274 | } | ||
| 2275 | |||
| 2276 | key.objectid = backref->inum; | ||
| 2277 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
| 2278 | key.offset = backref->file_pos; | ||
| 2279 | |||
| 2280 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
| 2281 | if (ret < 0) { | ||
| 2282 | goto out_free_path; | ||
| 2283 | } else if (ret > 0) { | ||
| 2284 | ret = 0; | ||
| 2285 | goto out_free_path; | ||
| 2286 | } | ||
| 2287 | |||
| 2288 | extent = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
| 2289 | struct btrfs_file_extent_item); | ||
| 2290 | |||
| 2291 | if (btrfs_file_extent_generation(path->nodes[0], extent) != | ||
| 2292 | backref->generation) | ||
| 2293 | goto out_free_path; | ||
| 2294 | |||
| 2295 | btrfs_release_path(path); | ||
| 2296 | |||
| 2297 | start = backref->file_pos; | ||
| 2298 | if (backref->extent_offset < old->extent_offset + old->offset) | ||
| 2299 | start += old->extent_offset + old->offset - | ||
| 2300 | backref->extent_offset; | ||
| 2301 | |||
| 2302 | len = min(backref->extent_offset + backref->num_bytes, | ||
| 2303 | old->extent_offset + old->offset + old->len); | ||
| 2304 | len -= max(backref->extent_offset, old->extent_offset + old->offset); | ||
| 2305 | |||
| 2306 | ret = btrfs_drop_extents(trans, root, inode, start, | ||
| 2307 | start + len, 1); | ||
| 2308 | if (ret) | ||
| 2309 | goto out_free_path; | ||
| 2310 | again: | ||
| 2311 | key.objectid = btrfs_ino(inode); | ||
| 2312 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
| 2313 | key.offset = start; | ||
| 2314 | |||
| 2315 | if (merge) { | ||
| 2316 | struct btrfs_file_extent_item *fi; | ||
| 2317 | u64 extent_len; | ||
| 2318 | struct btrfs_key found_key; | ||
| 2319 | |||
| 2320 | ret = btrfs_search_slot(trans, root, &key, path, 1, 1); | ||
| 2321 | if (ret < 0) | ||
| 2322 | goto out_free_path; | ||
| 2323 | |||
| 2324 | path->slots[0]--; | ||
| 2325 | leaf = path->nodes[0]; | ||
| 2326 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
| 2327 | |||
| 2328 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
| 2329 | struct btrfs_file_extent_item); | ||
| 2330 | extent_len = btrfs_file_extent_num_bytes(leaf, fi); | ||
| 2331 | |||
| 2332 | if (relink_is_mergable(leaf, fi, new->bytenr) && | ||
| 2333 | extent_len + found_key.offset == start) { | ||
| 2334 | btrfs_set_file_extent_num_bytes(leaf, fi, | ||
| 2335 | extent_len + len); | ||
| 2336 | btrfs_mark_buffer_dirty(leaf); | ||
| 2337 | inode_add_bytes(inode, len); | ||
| 2338 | |||
| 2339 | ret = 1; | ||
| 2340 | goto out_free_path; | ||
| 2341 | } else { | ||
| 2342 | merge = false; | ||
| 2343 | btrfs_release_path(path); | ||
| 2344 | goto again; | ||
| 2345 | } | ||
| 2346 | } | ||
| 2347 | |||
| 2348 | ret = btrfs_insert_empty_item(trans, root, path, &key, | ||
| 2349 | sizeof(*extent)); | ||
| 2350 | if (ret) { | ||
| 2351 | btrfs_abort_transaction(trans, root, ret); | ||
| 2352 | goto out_free_path; | ||
| 2353 | } | ||
| 2354 | |||
| 2355 | leaf = path->nodes[0]; | ||
| 2356 | item = btrfs_item_ptr(leaf, path->slots[0], | ||
| 2357 | struct btrfs_file_extent_item); | ||
| 2358 | btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); | ||
| 2359 | btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); | ||
| 2360 | btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); | ||
| 2361 | btrfs_set_file_extent_num_bytes(leaf, item, len); | ||
| 2362 | btrfs_set_file_extent_ram_bytes(leaf, item, new->len); | ||
| 2363 | btrfs_set_file_extent_generation(leaf, item, trans->transid); | ||
| 2364 | btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); | ||
| 2365 | btrfs_set_file_extent_compression(leaf, item, new->compress_type); | ||
| 2366 | btrfs_set_file_extent_encryption(leaf, item, 0); | ||
| 2367 | btrfs_set_file_extent_other_encoding(leaf, item, 0); | ||
| 2368 | |||
| 2369 | btrfs_mark_buffer_dirty(leaf); | ||
| 2370 | inode_add_bytes(inode, len); | ||
| 2371 | |||
| 2372 | ret = btrfs_inc_extent_ref(trans, root, new->bytenr, | ||
| 2373 | new->disk_len, 0, | ||
| 2374 | backref->root_id, backref->inum, | ||
| 2375 | new->file_pos, 0); /* start - extent_offset */ | ||
| 2376 | if (ret) { | ||
| 2377 | btrfs_abort_transaction(trans, root, ret); | ||
| 2378 | goto out_free_path; | ||
| 2379 | } | ||
| 2380 | |||
| 2381 | ret = 1; | ||
| 2382 | out_free_path: | ||
| 2383 | btrfs_release_path(path); | ||
| 2384 | btrfs_end_transaction(trans, root); | ||
| 2385 | out_unlock: | ||
| 2386 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, | ||
| 2387 | &cached, GFP_NOFS); | ||
| 2388 | iput(inode); | ||
| 2389 | return ret; | ||
| 2390 | } | ||
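The start/len arithmetic above clips a backref's file range to the part of the old extent that the defrag actually rewrote, so only that window is dropped and relinked. A worked example with illustrative numbers:

    /*
     * Illustrative values (k = KiB):
     *   backref->file_pos = 100k, backref->extent_offset = 0k,
     *   backref->num_bytes = 64k  -> backref covers extent bytes [0k, 64k)
     *   old->extent_offset = 16k, old->offset = 0k, old->len = 32k
     *                             -> old range covers extent bytes [16k, 48k)
     *
     * start = 100k + (16k + 0k - 0k)              = 116k
     * len   = min(0k + 64k, 16k + 0k + 32k)
     *         - max(0k, 16k + 0k) = 48k - 16k     = 32k
     *
     * so only file bytes [116k, 148k) of this inode are relinked.
     */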
| 2391 | |||
| 2392 | static void relink_file_extents(struct new_sa_defrag_extent *new) | ||
| 2393 | { | ||
| 2394 | struct btrfs_path *path; | ||
| 2395 | struct old_sa_defrag_extent *old, *tmp; | ||
| 2396 | struct sa_defrag_extent_backref *backref; | ||
| 2397 | struct sa_defrag_extent_backref *prev = NULL; | ||
| 2398 | struct inode *inode; | ||
| 2399 | struct btrfs_root *root; | ||
| 2400 | struct rb_node *node; | ||
| 2401 | int ret; | ||
| 2402 | |||
| 2403 | inode = new->inode; | ||
| 2404 | root = BTRFS_I(inode)->root; | ||
| 2405 | |||
| 2406 | path = btrfs_alloc_path(); | ||
| 2407 | if (!path) | ||
| 2408 | return; | ||
| 2409 | |||
| 2410 | if (!record_extent_backrefs(path, new)) { | ||
| 2411 | btrfs_free_path(path); | ||
| 2412 | goto out; | ||
| 2413 | } | ||
| 2414 | btrfs_release_path(path); | ||
| 2415 | |||
| 2416 | while (1) { | ||
| 2417 | node = rb_first(&new->root); | ||
| 2418 | if (!node) | ||
| 2419 | break; | ||
| 2420 | rb_erase(node, &new->root); | ||
| 2421 | |||
| 2422 | backref = rb_entry(node, struct sa_defrag_extent_backref, node); | ||
| 2423 | |||
| 2424 | ret = relink_extent_backref(path, prev, backref); | ||
| 2425 | WARN_ON(ret < 0); | ||
| 2426 | |||
| 2427 | kfree(prev); | ||
| 2428 | |||
| 2429 | if (ret == 1) | ||
| 2430 | prev = backref; | ||
| 2431 | else | ||
| 2432 | prev = NULL; | ||
| 2433 | cond_resched(); | ||
| 2434 | } | ||
| 2435 | kfree(prev); | ||
| 2436 | |||
| 2437 | btrfs_free_path(path); | ||
| 2438 | |||
| 2439 | list_for_each_entry_safe(old, tmp, &new->head, list) { | ||
| 2440 | list_del(&old->list); | ||
| 2441 | kfree(old); | ||
| 2442 | } | ||
| 2443 | out: | ||
| 2444 | atomic_dec(&root->fs_info->defrag_running); | ||
| 2445 | wake_up(&root->fs_info->transaction_wait); | ||
| 2446 | |||
| 2447 | kfree(new); | ||
| 2448 | } | ||
| 2449 | |||
| 2450 | static struct new_sa_defrag_extent * | ||
| 2451 | record_old_file_extents(struct inode *inode, | ||
| 2452 | struct btrfs_ordered_extent *ordered) | ||
| 2453 | { | ||
| 2454 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
| 2455 | struct btrfs_path *path; | ||
| 2456 | struct btrfs_key key; | ||
| 2457 | struct old_sa_defrag_extent *old, *tmp; | ||
| 2458 | struct new_sa_defrag_extent *new; | ||
| 2459 | int ret; | ||
| 2460 | |||
| 2461 | new = kmalloc(sizeof(*new), GFP_NOFS); | ||
| 2462 | if (!new) | ||
| 2463 | return NULL; | ||
| 2464 | |||
| 2465 | new->inode = inode; | ||
| 2466 | new->file_pos = ordered->file_offset; | ||
| 2467 | new->len = ordered->len; | ||
| 2468 | new->bytenr = ordered->start; | ||
| 2469 | new->disk_len = ordered->disk_len; | ||
| 2470 | new->compress_type = ordered->compress_type; | ||
| 2471 | new->root = RB_ROOT; | ||
| 2472 | INIT_LIST_HEAD(&new->head); | ||
| 2473 | |||
| 2474 | path = btrfs_alloc_path(); | ||
| 2475 | if (!path) | ||
| 2476 | goto out_kfree; | ||
| 2477 | |||
| 2478 | key.objectid = btrfs_ino(inode); | ||
| 2479 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
| 2480 | key.offset = new->file_pos; | ||
| 2481 | |||
| 2482 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
| 2483 | if (ret < 0) | ||
| 2484 | goto out_free_path; | ||
| 2485 | if (ret > 0 && path->slots[0] > 0) | ||
| 2486 | path->slots[0]--; | ||
| 2487 | |||
| 2488 | /* find out all the old extents for the file range */ | ||
| 2489 | while (1) { | ||
| 2490 | struct btrfs_file_extent_item *extent; | ||
| 2491 | struct extent_buffer *l; | ||
| 2492 | int slot; | ||
| 2493 | u64 num_bytes; | ||
| 2494 | u64 offset; | ||
| 2495 | u64 end; | ||
| 2496 | u64 disk_bytenr; | ||
| 2497 | u64 extent_offset; | ||
| 2498 | |||
| 2499 | l = path->nodes[0]; | ||
| 2500 | slot = path->slots[0]; | ||
| 2501 | |||
| 2502 | if (slot >= btrfs_header_nritems(l)) { | ||
| 2503 | ret = btrfs_next_leaf(root, path); | ||
| 2504 | if (ret < 0) | ||
| 2505 | goto out_free_list; | ||
| 2506 | else if (ret > 0) | ||
| 2507 | break; | ||
| 2508 | continue; | ||
| 2509 | } | ||
| 2510 | |||
| 2511 | btrfs_item_key_to_cpu(l, &key, slot); | ||
| 2512 | |||
| 2513 | if (key.objectid != btrfs_ino(inode)) | ||
| 2514 | break; | ||
| 2515 | if (key.type != BTRFS_EXTENT_DATA_KEY) | ||
| 2516 | break; | ||
| 2517 | if (key.offset >= new->file_pos + new->len) | ||
| 2518 | break; | ||
| 2519 | |||
| 2520 | extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); | ||
| 2521 | |||
| 2522 | num_bytes = btrfs_file_extent_num_bytes(l, extent); | ||
| 2523 | if (key.offset + num_bytes < new->file_pos) | ||
| 2524 | goto next; | ||
| 2525 | |||
| 2526 | disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); | ||
| 2527 | if (!disk_bytenr) | ||
| 2528 | goto next; | ||
| 2529 | |||
| 2530 | extent_offset = btrfs_file_extent_offset(l, extent); | ||
| 2531 | |||
| 2532 | old = kmalloc(sizeof(*old), GFP_NOFS); | ||
| 2533 | if (!old) | ||
| 2534 | goto out_free_list; | ||
| 2535 | |||
| 2536 | offset = max(new->file_pos, key.offset); | ||
| 2537 | end = min(new->file_pos + new->len, key.offset + num_bytes); | ||
| 2538 | |||
| 2539 | old->bytenr = disk_bytenr; | ||
| 2540 | old->extent_offset = extent_offset; | ||
| 2541 | old->offset = offset - key.offset; | ||
| 2542 | old->len = end - offset; | ||
| 2543 | old->new = new; | ||
| 2544 | old->count = 0; | ||
| 2545 | list_add_tail(&old->list, &new->head); | ||
| 2546 | next: | ||
| 2547 | path->slots[0]++; | ||
| 2548 | cond_resched(); | ||
| 2549 | } | ||
| 2550 | |||
| 2551 | btrfs_free_path(path); | ||
| 2552 | atomic_inc(&root->fs_info->defrag_running); | ||
| 2553 | |||
| 2554 | return new; | ||
| 2555 | |||
| 2556 | out_free_list: | ||
| 2557 | list_for_each_entry_safe(old, tmp, &new->head, list) { | ||
| 2558 | list_del(&old->list); | ||
| 2559 | kfree(old); | ||
| 2560 | } | ||
| 2561 | out_free_path: | ||
| 2562 | btrfs_free_path(path); | ||
| 2563 | out_kfree: | ||
| 2564 | kfree(new); | ||
| 2565 | return NULL; | ||
| 2566 | } | ||
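record_old_file_extents opens with a common btrfs search idiom: look up (ino, EXTENT_DATA, file_pos) and, when there is no exact match (ret > 0), step back one slot, since the extent item covering file_pos may be keyed at an earlier offset. A minimal sketch of the idiom in isolation:

    /* position 'path' at the extent item that may cover 'pos' */
    static int find_covering_extent(struct btrfs_root *root,
                                    struct btrfs_path *path,
                                    u64 ino, u64 pos)
    {
            struct btrfs_key key;
            int ret;

            key.objectid = ino;
            key.type = BTRFS_EXTENT_DATA_KEY;
            key.offset = pos;

            ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
            if (ret < 0)
                    return ret;
            /* no exact key: the previous item may still overlap pos */
            if (ret > 0 && path->slots[0] > 0)
                    path->slots[0]--;
            return 0;
    }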
| 2567 | |||
| 1895 | /* | 2568 | /* |
| 1896 | * helper function for btrfs_finish_ordered_io, this | 2569 | * helper function for btrfs_finish_ordered_io, this |
| 1897 | * just reads in some of the csum leaves to prime them into ram | 2570 | * just reads in some of the csum leaves to prime them into ram |
| @@ -1909,6 +2582,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) | |||
| 1909 | struct btrfs_trans_handle *trans = NULL; | 2582 | struct btrfs_trans_handle *trans = NULL; |
| 1910 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | 2583 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; |
| 1911 | struct extent_state *cached_state = NULL; | 2584 | struct extent_state *cached_state = NULL; |
| 2585 | struct new_sa_defrag_extent *new = NULL; | ||
| 1912 | int compress_type = 0; | 2586 | int compress_type = 0; |
| 1913 | int ret; | 2587 | int ret; |
| 1914 | bool nolock; | 2588 | bool nolock; |
| @@ -1943,6 +2617,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) | |||
| 1943 | ordered_extent->file_offset + ordered_extent->len - 1, | 2617 | ordered_extent->file_offset + ordered_extent->len - 1, |
| 1944 | 0, &cached_state); | 2618 | 0, &cached_state); |
| 1945 | 2619 | ||
| 2620 | ret = test_range_bit(io_tree, ordered_extent->file_offset, | ||
| 2621 | ordered_extent->file_offset + ordered_extent->len - 1, | ||
| 2622 | EXTENT_DEFRAG, 1, cached_state); | ||
| 2623 | if (ret) { | ||
| 2624 | u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); | ||
| 2625 | if (last_snapshot >= BTRFS_I(inode)->generation) | ||
| 2626 | /* the inode is shared */ | ||
| 2627 | new = record_old_file_extents(inode, ordered_extent); | ||
| 2628 | |||
| 2629 | clear_extent_bit(io_tree, ordered_extent->file_offset, | ||
| 2630 | ordered_extent->file_offset + ordered_extent->len - 1, | ||
| 2631 | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); | ||
| 2632 | } | ||
| 2633 | |||
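This block is the trigger for snapshot-aware defrag: if the completed range still carries EXTENT_DEFRAG and the root's last snapshot is at least as new as the inode's generation, the pre-defrag extents may be shared with a snapshot and get recorded for later relinking. A worked reading of the condition (numbers are illustrative):

    /*
     * last_snapshot = 120 (root last snapshotted at transid 120)
     * generation    = 100 (inode generation, transid 100)
     * 120 >= 100 -> a snapshot may still reference the old extents,
     *               so record them before the defragged data lands.
     *
     * If instead generation were 130, the inode postdates every
     * snapshot of this root, nothing can share its extents, and
     * the relink machinery is skipped entirely.
     */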
| 1946 | if (nolock) | 2634 | if (nolock) |
| 1947 | trans = btrfs_join_transaction_nolock(root); | 2635 | trans = btrfs_join_transaction_nolock(root); |
| 1948 | else | 2636 | else |
| @@ -2001,17 +2689,33 @@ out: | |||
| 2001 | if (trans) | 2689 | if (trans) |
| 2002 | btrfs_end_transaction(trans, root); | 2690 | btrfs_end_transaction(trans, root); |
| 2003 | 2691 | ||
| 2004 | if (ret) | 2692 | if (ret) { |
| 2005 | clear_extent_uptodate(io_tree, ordered_extent->file_offset, | 2693 | clear_extent_uptodate(io_tree, ordered_extent->file_offset, |
| 2006 | ordered_extent->file_offset + | 2694 | ordered_extent->file_offset + |
| 2007 | ordered_extent->len - 1, NULL, GFP_NOFS); | 2695 | ordered_extent->len - 1, NULL, GFP_NOFS); |
| 2008 | 2696 | ||
| 2697 | /* | ||
| 2698 | * If the ordered extent had an IOERR or something else went | ||
| 2699 | * wrong, we need to return the space for this ordered extent | ||
| 2700 | * back to the allocator. | ||
| 2701 | */ | ||
| 2702 | if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && | ||
| 2703 | !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) | ||
| 2704 | btrfs_free_reserved_extent(root, ordered_extent->start, | ||
| 2705 | ordered_extent->disk_len); | ||
| 2706 | } | ||
| 2707 | |||
| 2708 | |||
| 2009 | /* | 2709 | /* |
| 2010 | * This needs to be done to make sure anybody waiting knows we are done | 2710 | * This needs to be done to make sure anybody waiting knows we are done |
| 2011 | * updating everything for this ordered extent. | 2711 | * updating everything for this ordered extent. |
| 2012 | */ | 2712 | */ |
| 2013 | btrfs_remove_ordered_extent(inode, ordered_extent); | 2713 | btrfs_remove_ordered_extent(inode, ordered_extent); |
| 2014 | 2714 | ||
| 2715 | /* for snapshot-aware defrag */ | ||
| 2716 | if (new) | ||
| 2717 | relink_file_extents(new); | ||
| 2718 | |||
| 2015 | /* once for us */ | 2719 | /* once for us */ |
| 2016 | btrfs_put_ordered_extent(ordered_extent); | 2720 | btrfs_put_ordered_extent(ordered_extent); |
| 2017 | /* once for the tree */ | 2721 | /* once for the tree */ |
| @@ -2062,7 +2766,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, | |||
| 2062 | static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, | 2766 | static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, |
| 2063 | struct extent_state *state, int mirror) | 2767 | struct extent_state *state, int mirror) |
| 2064 | { | 2768 | { |
| 2065 | size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); | 2769 | size_t offset = start - page_offset(page); |
| 2066 | struct inode *inode = page->mapping->host; | 2770 | struct inode *inode = page->mapping->host; |
| 2067 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | 2771 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; |
| 2068 | char *kaddr; | 2772 | char *kaddr; |
| @@ -2167,11 +2871,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) | |||
| 2167 | } | 2871 | } |
| 2168 | } | 2872 | } |
| 2169 | 2873 | ||
| 2170 | enum btrfs_orphan_cleanup_state { | ||
| 2171 | ORPHAN_CLEANUP_STARTED = 1, | ||
| 2172 | ORPHAN_CLEANUP_DONE = 2, | ||
| 2173 | }; | ||
| 2174 | |||
| 2175 | /* | 2874 | /* |
| 2176 | * This is called in transaction commit time. If there are no orphan | 2875 | * This is called in transaction commit time. If there are no orphan |
| 2177 | * files in the subvolume, it removes orphan item and frees block_rsv | 2876 | * files in the subvolume, it removes orphan item and frees block_rsv |
| @@ -2469,6 +3168,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
| 2469 | */ | 3168 | */ |
| 2470 | set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, | 3169 | set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, |
| 2471 | &BTRFS_I(inode)->runtime_flags); | 3170 | &BTRFS_I(inode)->runtime_flags); |
| 3171 | atomic_inc(&root->orphan_inodes); | ||
| 2472 | 3172 | ||
| 2473 | /* if we have links, this was a truncate, lets do that */ | 3173 | /* if we have links, this was a truncate, lets do that */ |
| 2474 | if (inode->i_nlink) { | 3174 | if (inode->i_nlink) { |
| @@ -2491,6 +3191,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
| 2491 | goto out; | 3191 | goto out; |
| 2492 | 3192 | ||
| 2493 | ret = btrfs_truncate(inode); | 3193 | ret = btrfs_truncate(inode); |
| 3194 | if (ret) | ||
| 3195 | btrfs_orphan_del(NULL, inode); | ||
| 2494 | } else { | 3196 | } else { |
| 2495 | nr_unlink++; | 3197 | nr_unlink++; |
| 2496 | } | 3198 | } |
| @@ -2709,34 +3411,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, | |||
| 2709 | struct btrfs_inode_item *item, | 3411 | struct btrfs_inode_item *item, |
| 2710 | struct inode *inode) | 3412 | struct inode *inode) |
| 2711 | { | 3413 | { |
| 2712 | btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); | 3414 | struct btrfs_map_token token; |
| 2713 | btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); | 3415 | |
| 2714 | btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); | 3416 | btrfs_init_map_token(&token); |
| 2715 | btrfs_set_inode_mode(leaf, item, inode->i_mode); | 3417 | |
| 2716 | btrfs_set_inode_nlink(leaf, item, inode->i_nlink); | 3418 | btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); |
| 3419 | btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); | ||
| 3420 | btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, | ||
| 3421 | &token); | ||
| 3422 | btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); | ||
| 3423 | btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); | ||
| 2717 | 3424 | ||
| 2718 | btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), | 3425 | btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), |
| 2719 | inode->i_atime.tv_sec); | 3426 | inode->i_atime.tv_sec, &token); |
| 2720 | btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), | 3427 | btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), |
| 2721 | inode->i_atime.tv_nsec); | 3428 | inode->i_atime.tv_nsec, &token); |
| 2722 | 3429 | ||
| 2723 | btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), | 3430 | btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), |
| 2724 | inode->i_mtime.tv_sec); | 3431 | inode->i_mtime.tv_sec, &token); |
| 2725 | btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), | 3432 | btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), |
| 2726 | inode->i_mtime.tv_nsec); | 3433 | inode->i_mtime.tv_nsec, &token); |
| 2727 | 3434 | ||
| 2728 | btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), | 3435 | btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), |
| 2729 | inode->i_ctime.tv_sec); | 3436 | inode->i_ctime.tv_sec, &token); |
| 2730 | btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), | 3437 | btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), |
| 2731 | inode->i_ctime.tv_nsec); | 3438 | inode->i_ctime.tv_nsec, &token); |
| 2732 | 3439 | ||
| 2733 | btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); | 3440 | btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), |
| 2734 | btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); | 3441 | &token); |
| 2735 | btrfs_set_inode_sequence(leaf, item, inode->i_version); | 3442 | btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, |
| 2736 | btrfs_set_inode_transid(leaf, item, trans->transid); | 3443 | &token); |
| 2737 | btrfs_set_inode_rdev(leaf, item, inode->i_rdev); | 3444 | btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); |
| 2738 | btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); | 3445 | btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); |
| 2739 | btrfs_set_inode_block_group(leaf, item, 0); | 3446 | btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); |
| 3447 | btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); | ||
| 3448 | btrfs_set_token_inode_block_group(leaf, item, 0, &token); | ||
| 2740 | } | 3449 | } |
| 2741 | 3450 | ||
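fill_inode_item switches all of its setters to the _token variants. A btrfs_map_token caches the extent-buffer page mapping across consecutive calls, so writing a dozen adjacent fields of one inode item costs one mapping lookup instead of one per field. A simplified, hypothetical illustration of the idea (the real structure and revalidation logic live in ctree.h; map_buffer_page is a stand-in name):

    struct token_sketch {
            struct extent_buffer *eb;   /* buffer the cache is valid for */
            char *kaddr;                /* cached mapping, NULL if cold */
    };

    static void set_field_cached(struct token_sketch *tok,
                                 struct extent_buffer *eb,
                                 unsigned long off, u32 val)
    {
            /*
             * Simplified: the real code also revalidates when 'off'
             * falls outside the currently mapped page.
             */
            if (tok->eb != eb || !tok->kaddr)
                    tok->kaddr = map_buffer_page(eb, off); /* stand-in */
            put_unaligned_le32(val, tok->kaddr + off);
    }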
| 2742 | /* | 3451 | /* |
| @@ -3304,7 +4013,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
| 3304 | u64 extent_num_bytes = 0; | 4013 | u64 extent_num_bytes = 0; |
| 3305 | u64 extent_offset = 0; | 4014 | u64 extent_offset = 0; |
| 3306 | u64 item_end = 0; | 4015 | u64 item_end = 0; |
| 3307 | u64 mask = root->sectorsize - 1; | ||
| 3308 | u32 found_type = (u8)-1; | 4016 | u32 found_type = (u8)-1; |
| 3309 | int found_extent; | 4017 | int found_extent; |
| 3310 | int del_item; | 4018 | int del_item; |
| @@ -3328,7 +4036,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
| 3328 | * extent just the way it is. | 4036 | * extent just the way it is. |
| 3329 | */ | 4037 | */ |
| 3330 | if (root->ref_cows || root == root->fs_info->tree_root) | 4038 | if (root->ref_cows || root == root->fs_info->tree_root) |
| 3331 | btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); | 4039 | btrfs_drop_extent_cache(inode, ALIGN(new_size, |
| 4040 | root->sectorsize), (u64)-1, 0); | ||
| 3332 | 4041 | ||
| 3333 | /* | 4042 | /* |
| 3334 | * This function is also used to drop the items in the log tree before | 4043 | * This function is also used to drop the items in the log tree before |
| @@ -3407,10 +4116,9 @@ search_again: | |||
| 3407 | if (!del_item) { | 4116 | if (!del_item) { |
| 3408 | u64 orig_num_bytes = | 4117 | u64 orig_num_bytes = |
| 3409 | btrfs_file_extent_num_bytes(leaf, fi); | 4118 | btrfs_file_extent_num_bytes(leaf, fi); |
| 3410 | extent_num_bytes = new_size - | 4119 | extent_num_bytes = ALIGN(new_size - |
| 3411 | found_key.offset + root->sectorsize - 1; | 4120 | found_key.offset, |
| 3412 | extent_num_bytes = extent_num_bytes & | 4121 | root->sectorsize); |
| 3413 | ~((u64)root->sectorsize - 1); | ||
| 3414 | btrfs_set_file_extent_num_bytes(leaf, fi, | 4122 | btrfs_set_file_extent_num_bytes(leaf, fi, |
| 3415 | extent_num_bytes); | 4123 | extent_num_bytes); |
| 3416 | num_dec = (orig_num_bytes - | 4124 | num_dec = (orig_num_bytes - |
| @@ -3646,9 +4354,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
| 3646 | struct extent_map *em = NULL; | 4354 | struct extent_map *em = NULL; |
| 3647 | struct extent_state *cached_state = NULL; | 4355 | struct extent_state *cached_state = NULL; |
| 3648 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | 4356 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; |
| 3649 | u64 mask = root->sectorsize - 1; | 4357 | u64 hole_start = ALIGN(oldsize, root->sectorsize); |
| 3650 | u64 hole_start = (oldsize + mask) & ~mask; | 4358 | u64 block_end = ALIGN(size, root->sectorsize); |
| 3651 | u64 block_end = (size + mask) & ~mask; | ||
| 3652 | u64 last_byte; | 4359 | u64 last_byte; |
| 3653 | u64 cur_offset; | 4360 | u64 cur_offset; |
| 3654 | u64 hole_size; | 4361 | u64 hole_size; |
| @@ -3681,7 +4388,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
| 3681 | break; | 4388 | break; |
| 3682 | } | 4389 | } |
| 3683 | last_byte = min(extent_map_end(em), block_end); | 4390 | last_byte = min(extent_map_end(em), block_end); |
| 3684 | last_byte = (last_byte + mask) & ~mask; | 4391 | last_byte = ALIGN(last_byte, root->sectorsize); |
| 3685 | if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { | 4392 | if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { |
| 3686 | struct extent_map *hole_em; | 4393 | struct extent_map *hole_em; |
| 3687 | hole_size = last_byte - cur_offset; | 4394 | hole_size = last_byte - cur_offset; |
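The hunks in this region replace open-coded round-up-to-sectorsize masking with the kernel's ALIGN() macro; for power-of-two alignments the two forms are bit-identical. A worked example:

    #include <linux/kernel.h>       /* ALIGN() */

    /*
     * For power-of-two 'a': ALIGN(x, a) == (x + a - 1) & ~(a - 1).
     * With sectorsize = 4096:
     *   ALIGN(0, 4096)    == 0
     *   ALIGN(1, 4096)    == 4096
     *   ALIGN(4096, 4096) == 4096
     *   ALIGN(4097, 4096) == 8192
     */
    static inline u64 round_up_to_sector(u64 x, u64 sectorsize)
    {
            return ALIGN(x, sectorsize);    /* == (x + mask) & ~mask */
    }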
| @@ -3832,6 +4539,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) | |||
| 3832 | 4539 | ||
| 3833 | /* we don't support swapfiles, so vmtruncate shouldn't fail */ | 4540 | /* we don't support swapfiles, so vmtruncate shouldn't fail */ |
| 3834 | truncate_setsize(inode, newsize); | 4541 | truncate_setsize(inode, newsize); |
| 4542 | |||
| 4543 | /* Disable nonlocked read DIO to avoid the endless truncate */ | ||
| 4544 | btrfs_inode_block_unlocked_dio(inode); | ||
| 4545 | inode_dio_wait(inode); | ||
| 4546 | btrfs_inode_resume_unlocked_dio(inode); | ||
| 4547 | |||
| 3835 | ret = btrfs_truncate(inode); | 4548 | ret = btrfs_truncate(inode); |
| 3836 | if (ret && inode->i_nlink) | 4549 | if (ret && inode->i_nlink) |
| 3837 | btrfs_orphan_del(NULL, inode); | 4550 | btrfs_orphan_del(NULL, inode); |
| @@ -3904,6 +4617,12 @@ void btrfs_evict_inode(struct inode *inode) | |||
| 3904 | goto no_delete; | 4617 | goto no_delete; |
| 3905 | } | 4618 | } |
| 3906 | 4619 | ||
| 4620 | ret = btrfs_commit_inode_delayed_inode(inode); | ||
| 4621 | if (ret) { | ||
| 4622 | btrfs_orphan_del(NULL, inode); | ||
| 4623 | goto no_delete; | ||
| 4624 | } | ||
| 4625 | |||
| 3907 | rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); | 4626 | rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); |
| 3908 | if (!rsv) { | 4627 | if (!rsv) { |
| 3909 | btrfs_orphan_del(NULL, inode); | 4628 | btrfs_orphan_del(NULL, inode); |
| @@ -3941,7 +4660,7 @@ void btrfs_evict_inode(struct inode *inode) | |||
| 3941 | goto no_delete; | 4660 | goto no_delete; |
| 3942 | } | 4661 | } |
| 3943 | 4662 | ||
| 3944 | trans = btrfs_start_transaction_lflush(root, 1); | 4663 | trans = btrfs_join_transaction(root); |
| 3945 | if (IS_ERR(trans)) { | 4664 | if (IS_ERR(trans)) { |
| 3946 | btrfs_orphan_del(NULL, inode); | 4665 | btrfs_orphan_del(NULL, inode); |
| 3947 | btrfs_free_block_rsv(root, rsv); | 4666 | btrfs_free_block_rsv(root, rsv); |
| @@ -3955,9 +4674,6 @@ void btrfs_evict_inode(struct inode *inode) | |||
| 3955 | break; | 4674 | break; |
| 3956 | 4675 | ||
| 3957 | trans->block_rsv = &root->fs_info->trans_block_rsv; | 4676 | trans->block_rsv = &root->fs_info->trans_block_rsv; |
| 3958 | ret = btrfs_update_inode(trans, root, inode); | ||
| 3959 | BUG_ON(ret); | ||
| 3960 | |||
| 3961 | btrfs_end_transaction(trans, root); | 4677 | btrfs_end_transaction(trans, root); |
| 3962 | trans = NULL; | 4678 | trans = NULL; |
| 3963 | btrfs_btree_balance_dirty(root); | 4679 | btrfs_btree_balance_dirty(root); |
| @@ -4854,7 +5570,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
| 4854 | if (btrfs_test_opt(root, NODATASUM)) | 5570 | if (btrfs_test_opt(root, NODATASUM)) |
| 4855 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; | 5571 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; |
| 4856 | if (btrfs_test_opt(root, NODATACOW)) | 5572 | if (btrfs_test_opt(root, NODATACOW)) |
| 4857 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; | 5573 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | |
| 5574 | BTRFS_INODE_NODATASUM; | ||
| 4858 | } | 5575 | } |
| 4859 | 5576 | ||
| 4860 | insert_inode_hash(inode); | 5577 | insert_inode_hash(inode); |
| @@ -5006,12 +5723,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, | |||
| 5006 | goto out_unlock; | 5723 | goto out_unlock; |
| 5007 | } | 5724 | } |
| 5008 | 5725 | ||
| 5009 | err = btrfs_update_inode(trans, root, inode); | ||
| 5010 | if (err) { | ||
| 5011 | drop_inode = 1; | ||
| 5012 | goto out_unlock; | ||
| 5013 | } | ||
| 5014 | |||
| 5015 | /* | 5726 | /* |
| 5016 | * If the active LSM wants to access the inode during | 5727 | * If the active LSM wants to access the inode during |
| 5017 | * d_instantiate it needs these. Smack checks to see | 5728 | * d_instantiate it needs these. Smack checks to see |
| @@ -5396,8 +6107,7 @@ again: | |||
| 5396 | } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { | 6107 | } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { |
| 5397 | size_t size; | 6108 | size_t size; |
| 5398 | size = btrfs_file_extent_inline_len(leaf, item); | 6109 | size = btrfs_file_extent_inline_len(leaf, item); |
| 5399 | extent_end = (extent_start + size + root->sectorsize - 1) & | 6110 | extent_end = ALIGN(extent_start + size, root->sectorsize); |
| 5400 | ~((u64)root->sectorsize - 1); | ||
| 5401 | } | 6111 | } |
| 5402 | 6112 | ||
| 5403 | if (start >= extent_end) { | 6113 | if (start >= extent_end) { |
| @@ -5469,8 +6179,7 @@ again: | |||
| 5469 | copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, | 6179 | copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, |
| 5470 | size - extent_offset); | 6180 | size - extent_offset); |
| 5471 | em->start = extent_start + extent_offset; | 6181 | em->start = extent_start + extent_offset; |
| 5472 | em->len = (copy_size + root->sectorsize - 1) & | 6182 | em->len = ALIGN(copy_size, root->sectorsize); |
| 5473 | ~((u64)root->sectorsize - 1); | ||
| 5474 | em->orig_block_len = em->len; | 6183 | em->orig_block_len = em->len; |
| 5475 | em->orig_start = em->start; | 6184 | em->orig_start = em->start; |
| 5476 | if (compress_type) { | 6185 | if (compress_type) { |
| @@ -5949,6 +6658,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, | |||
| 5949 | 6658 | ||
| 5950 | em->start = start; | 6659 | em->start = start; |
| 5951 | em->orig_start = orig_start; | 6660 | em->orig_start = orig_start; |
| 6661 | em->mod_start = start; | ||
| 6662 | em->mod_len = len; | ||
| 5952 | em->len = len; | 6663 | em->len = len; |
| 5953 | em->block_len = block_len; | 6664 | em->block_len = block_len; |
| 5954 | em->block_start = block_start; | 6665 | em->block_start = block_start; |
| @@ -5990,16 +6701,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | |||
| 5990 | u64 len = bh_result->b_size; | 6701 | u64 len = bh_result->b_size; |
| 5991 | struct btrfs_trans_handle *trans; | 6702 | struct btrfs_trans_handle *trans; |
| 5992 | int unlock_bits = EXTENT_LOCKED; | 6703 | int unlock_bits = EXTENT_LOCKED; |
| 5993 | int ret; | 6704 | int ret = 0; |
| 5994 | 6705 | ||
| 5995 | if (create) { | 6706 | if (create) |
| 5996 | ret = btrfs_delalloc_reserve_space(inode, len); | ||
| 5997 | if (ret) | ||
| 5998 | return ret; | ||
| 5999 | unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; | 6707 | unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; |
| 6000 | } else { | 6708 | else |
| 6001 | len = min_t(u64, len, root->sectorsize); | 6709 | len = min_t(u64, len, root->sectorsize); |
| 6002 | } | ||
| 6003 | 6710 | ||
| 6004 | lockstart = start; | 6711 | lockstart = start; |
| 6005 | lockend = start + len - 1; | 6712 | lockend = start + len - 1; |
| @@ -6011,14 +6718,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | |||
| 6011 | if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) | 6718 | if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) |
| 6012 | return -ENOTBLK; | 6719 | return -ENOTBLK; |
| 6013 | 6720 | ||
| 6014 | if (create) { | ||
| 6015 | ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, | ||
| 6016 | lockend, EXTENT_DELALLOC, NULL, | ||
| 6017 | &cached_state, GFP_NOFS); | ||
| 6018 | if (ret) | ||
| 6019 | goto unlock_err; | ||
| 6020 | } | ||
| 6021 | |||
| 6022 | em = btrfs_get_extent(inode, NULL, 0, start, len, 0); | 6721 | em = btrfs_get_extent(inode, NULL, 0, start, len, 0); |
| 6023 | if (IS_ERR(em)) { | 6722 | if (IS_ERR(em)) { |
| 6024 | ret = PTR_ERR(em); | 6723 | ret = PTR_ERR(em); |
| @@ -6050,7 +6749,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | |||
| 6050 | if (!create && (em->block_start == EXTENT_MAP_HOLE || | 6749 | if (!create && (em->block_start == EXTENT_MAP_HOLE || |
| 6051 | test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { | 6750 | test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { |
| 6052 | free_extent_map(em); | 6751 | free_extent_map(em); |
| 6053 | ret = 0; | ||
| 6054 | goto unlock_err; | 6752 | goto unlock_err; |
| 6055 | } | 6753 | } |
| 6056 | 6754 | ||
| @@ -6148,6 +6846,15 @@ unlock: | |||
| 6148 | */ | 6846 | */ |
| 6149 | if (start + len > i_size_read(inode)) | 6847 | if (start + len > i_size_read(inode)) |
| 6150 | i_size_write(inode, start + len); | 6848 | i_size_write(inode, start + len); |
| 6849 | |||
| 6850 | spin_lock(&BTRFS_I(inode)->lock); | ||
| 6851 | BTRFS_I(inode)->outstanding_extents++; | ||
| 6852 | spin_unlock(&BTRFS_I(inode)->lock); | ||
| 6853 | |||
| 6854 | ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, | ||
| 6855 | lockstart + len - 1, EXTENT_DELALLOC, NULL, | ||
| 6856 | &cached_state, GFP_NOFS); | ||
| 6857 | BUG_ON(ret); | ||
| 6151 | } | 6858 | } |
| 6152 | 6859 | ||
| 6153 | /* | 6860 | /* |
| @@ -6156,24 +6863,9 @@ unlock: | |||
| 6156 | * aren't using if there is any left over space. | 6863 | * aren't using if there is any left over space. |
| 6157 | */ | 6864 | */ |
| 6158 | if (lockstart < lockend) { | 6865 | if (lockstart < lockend) { |
| 6159 | if (create && len < lockend - lockstart) { | 6866 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, |
| 6160 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, | 6867 | lockend, unlock_bits, 1, 0, |
| 6161 | lockstart + len - 1, | 6868 | &cached_state, GFP_NOFS); |
| 6162 | unlock_bits | EXTENT_DEFRAG, 1, 0, | ||
| 6163 | &cached_state, GFP_NOFS); | ||
| 6164 | /* | ||
| 6165 | * Beside unlock, we also need to cleanup reserved space | ||
| 6166 | * for the left range by attaching EXTENT_DO_ACCOUNTING. | ||
| 6167 | */ | ||
| 6168 | clear_extent_bit(&BTRFS_I(inode)->io_tree, | ||
| 6169 | lockstart + len, lockend, | ||
| 6170 | unlock_bits | EXTENT_DO_ACCOUNTING | | ||
| 6171 | EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS); | ||
| 6172 | } else { | ||
| 6173 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, | ||
| 6174 | lockend, unlock_bits, 1, 0, | ||
| 6175 | &cached_state, GFP_NOFS); | ||
| 6176 | } | ||
| 6177 | } else { | 6869 | } else { |
| 6178 | free_extent_state(cached_state); | 6870 | free_extent_state(cached_state); |
| 6179 | } | 6871 | } |
| @@ -6183,9 +6875,6 @@ unlock: | |||
| 6183 | return 0; | 6875 | return 0; |
| 6184 | 6876 | ||
| 6185 | unlock_err: | 6877 | unlock_err: |
| 6186 | if (create) | ||
| 6187 | unlock_bits |= EXTENT_DO_ACCOUNTING; | ||
| 6188 | |||
| 6189 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, | 6878 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, |
| 6190 | unlock_bits, 1, 0, &cached_state, GFP_NOFS); | 6879 | unlock_bits, 1, 0, &cached_state, GFP_NOFS); |
| 6191 | return ret; | 6880 | return ret; |
| @@ -6426,19 +7115,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, | |||
| 6426 | int async_submit = 0; | 7115 | int async_submit = 0; |
| 6427 | 7116 | ||
| 6428 | map_length = orig_bio->bi_size; | 7117 | map_length = orig_bio->bi_size; |
| 6429 | ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, | 7118 | ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, |
| 6430 | &map_length, NULL, 0); | 7119 | &map_length, NULL, 0); |
| 6431 | if (ret) { | 7120 | if (ret) { |
| 6432 | bio_put(orig_bio); | 7121 | bio_put(orig_bio); |
| 6433 | return -EIO; | 7122 | return -EIO; |
| 6434 | } | 7123 | } |
| 6435 | |||
| 6436 | if (map_length >= orig_bio->bi_size) { | 7124 | if (map_length >= orig_bio->bi_size) { |
| 6437 | bio = orig_bio; | 7125 | bio = orig_bio; |
| 6438 | goto submit; | 7126 | goto submit; |
| 6439 | } | 7127 | } |
| 6440 | 7128 | ||
| 6441 | async_submit = 1; | 7129 | /* async crcs make it difficult to collect full stripe writes. */ |
| 7130 | if (btrfs_get_alloc_profile(root, 1) & | ||
| 7131 | (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) | ||
| 7132 | async_submit = 0; | ||
| 7133 | else | ||
| 7134 | async_submit = 1; | ||
| 7135 | |||
| 6442 | bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); | 7136 | bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); |
| 6443 | if (!bio) | 7137 | if (!bio) |
| 6444 | return -ENOMEM; | 7138 | return -ENOMEM; |
| @@ -6480,7 +7174,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, | |||
| 6480 | bio->bi_end_io = btrfs_end_dio_bio; | 7174 | bio->bi_end_io = btrfs_end_dio_bio; |
| 6481 | 7175 | ||
| 6482 | map_length = orig_bio->bi_size; | 7176 | map_length = orig_bio->bi_size; |
| 6483 | ret = btrfs_map_block(root->fs_info, READ, | 7177 | ret = btrfs_map_block(root->fs_info, rw, |
| 6484 | start_sector << 9, | 7178 | start_sector << 9, |
| 6485 | &map_length, NULL, 0); | 7179 | &map_length, NULL, 0); |
| 6486 | if (ret) { | 7180 | if (ret) { |
| @@ -6623,15 +7317,60 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, | |||
| 6623 | { | 7317 | { |
| 6624 | struct file *file = iocb->ki_filp; | 7318 | struct file *file = iocb->ki_filp; |
| 6625 | struct inode *inode = file->f_mapping->host; | 7319 | struct inode *inode = file->f_mapping->host; |
| 7320 | size_t count = 0; | ||
| 7321 | int flags = 0; | ||
| 7322 | bool wakeup = true; | ||
| 7323 | bool relock = false; | ||
| 7324 | ssize_t ret; | ||
| 6626 | 7325 | ||
| 6627 | if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, | 7326 | if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, |
| 6628 | offset, nr_segs)) | 7327 | offset, nr_segs)) |
| 6629 | return 0; | 7328 | return 0; |
| 6630 | 7329 | ||
| 6631 | return __blockdev_direct_IO(rw, iocb, inode, | 7330 | atomic_inc(&inode->i_dio_count); |
| 6632 | BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, | 7331 | smp_mb__after_atomic_inc(); |
| 6633 | iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, | 7332 | |
| 6634 | btrfs_submit_direct, 0); | 7333 | if (rw & WRITE) { |
| 7334 | count = iov_length(iov, nr_segs); | ||
| 7335 | /* | ||
| 7336 | * If the write DIO is beyond the EOF, we need to update | ||
| 7337 | * the isize, and that update is protected by i_mutex, so | ||
| 7338 | * we cannot unlock the i_mutex in this case. | ||
| 7339 | */ | ||
| 7340 | if (offset + count <= inode->i_size) { | ||
| 7341 | mutex_unlock(&inode->i_mutex); | ||
| 7342 | relock = true; | ||
| 7343 | } | ||
| 7344 | ret = btrfs_delalloc_reserve_space(inode, count); | ||
| 7345 | if (ret) | ||
| 7346 | goto out; | ||
| 7347 | } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, | ||
| 7348 | &BTRFS_I(inode)->runtime_flags))) { | ||
| 7349 | inode_dio_done(inode); | ||
| 7350 | flags = DIO_LOCKING | DIO_SKIP_HOLES; | ||
| 7351 | wakeup = false; | ||
| 7352 | } | ||
| 7353 | |||
| 7354 | ret = __blockdev_direct_IO(rw, iocb, inode, | ||
| 7355 | BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, | ||
| 7356 | iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, | ||
| 7357 | btrfs_submit_direct, flags); | ||
| 7358 | if (rw & WRITE) { | ||
| 7359 | if (ret < 0 && ret != -EIOCBQUEUED) | ||
| 7360 | btrfs_delalloc_release_space(inode, count); | ||
| 7361 | else if (ret >= 0 && (size_t)ret < count) | ||
| 7362 | btrfs_delalloc_release_space(inode, | ||
| 7363 | count - (size_t)ret); | ||
| 7364 | else | ||
| 7365 | btrfs_delalloc_release_metadata(inode, 0); | ||
| 7366 | } | ||
| 7367 | out: | ||
| 7368 | if (wakeup) | ||
| 7369 | inode_dio_done(inode); | ||
| 7370 | if (relock) | ||
| 7371 | mutex_lock(&inode->i_mutex); | ||
| 7372 | |||
| 7373 | return ret; | ||
| 6635 | } | 7374 | } |
| 6636 | 7375 | ||
| 6637 | #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) | 7376 | #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) |
| @@ -6735,8 +7474,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) | |||
| 6735 | return; | 7474 | return; |
| 6736 | } | 7475 | } |
| 6737 | lock_extent_bits(tree, page_start, page_end, 0, &cached_state); | 7476 | lock_extent_bits(tree, page_start, page_end, 0, &cached_state); |
| 6738 | ordered = btrfs_lookup_ordered_extent(inode, | 7477 | ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); |
| 6739 | page_offset(page)); | ||
| 6740 | if (ordered) { | 7478 | if (ordered) { |
| 6741 | /* | 7479 | /* |
| 6742 | * IO on this page will never be started, so we need | 7480 | * IO on this page will never be started, so we need |
| @@ -7216,8 +7954,9 @@ int btrfs_drop_inode(struct inode *inode) | |||
| 7216 | { | 7954 | { |
| 7217 | struct btrfs_root *root = BTRFS_I(inode)->root; | 7955 | struct btrfs_root *root = BTRFS_I(inode)->root; |
| 7218 | 7956 | ||
| 7957 | /* the snap/subvol tree is being deleted */ | ||
| 7219 | if (btrfs_root_refs(&root->root_item) == 0 && | 7958 | if (btrfs_root_refs(&root->root_item) == 0 && |
| 7220 | !btrfs_is_free_space_inode(inode)) | 7959 | root != root->fs_info->tree_root) |
| 7221 | return 1; | 7960 | return 1; |
| 7222 | else | 7961 | else |
| 7223 | return generic_drop_inode(inode); | 7962 | return generic_drop_inode(inode); |
| @@ -7299,40 +8038,22 @@ fail: | |||
| 7299 | static int btrfs_getattr(struct vfsmount *mnt, | 8038 | static int btrfs_getattr(struct vfsmount *mnt, |
| 7300 | struct dentry *dentry, struct kstat *stat) | 8039 | struct dentry *dentry, struct kstat *stat) |
| 7301 | { | 8040 | { |
| 8041 | u64 delalloc_bytes; | ||
| 7302 | struct inode *inode = dentry->d_inode; | 8042 | struct inode *inode = dentry->d_inode; |
| 7303 | u32 blocksize = inode->i_sb->s_blocksize; | 8043 | u32 blocksize = inode->i_sb->s_blocksize; |
| 7304 | 8044 | ||
| 7305 | generic_fillattr(inode, stat); | 8045 | generic_fillattr(inode, stat); |
| 7306 | stat->dev = BTRFS_I(inode)->root->anon_dev; | 8046 | stat->dev = BTRFS_I(inode)->root->anon_dev; |
| 7307 | stat->blksize = PAGE_CACHE_SIZE; | 8047 | stat->blksize = PAGE_CACHE_SIZE; |
| 8048 | |||
| 8049 | spin_lock(&BTRFS_I(inode)->lock); | ||
| 8050 | delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; | ||
| 8051 | spin_unlock(&BTRFS_I(inode)->lock); | ||
| 7308 | stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + | 8052 | stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + |
| 7309 | ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; | 8053 | ALIGN(delalloc_bytes, blocksize)) >> 9; |
| 7310 | return 0; | 8054 | return 0; |
| 7311 | } | 8055 | } |
| 7312 | 8056 | ||
| 7313 | /* | ||
| 7314 | * If a file is moved, it will inherit the cow and compression flags of the new | ||
| 7315 | * directory. | ||
| 7316 | */ | ||
| 7317 | static void fixup_inode_flags(struct inode *dir, struct inode *inode) | ||
| 7318 | { | ||
| 7319 | struct btrfs_inode *b_dir = BTRFS_I(dir); | ||
| 7320 | struct btrfs_inode *b_inode = BTRFS_I(inode); | ||
| 7321 | |||
| 7322 | if (b_dir->flags & BTRFS_INODE_NODATACOW) | ||
| 7323 | b_inode->flags |= BTRFS_INODE_NODATACOW; | ||
| 7324 | else | ||
| 7325 | b_inode->flags &= ~BTRFS_INODE_NODATACOW; | ||
| 7326 | |||
| 7327 | if (b_dir->flags & BTRFS_INODE_COMPRESS) { | ||
| 7328 | b_inode->flags |= BTRFS_INODE_COMPRESS; | ||
| 7329 | b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; | ||
| 7330 | } else { | ||
| 7331 | b_inode->flags &= ~(BTRFS_INODE_COMPRESS | | ||
| 7332 | BTRFS_INODE_NOCOMPRESS); | ||
| 7333 | } | ||
| 7334 | } | ||
| 7335 | |||
| 7336 | static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, | 8057 | static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, |
| 7337 | struct inode *new_dir, struct dentry *new_dentry) | 8058 | struct inode *new_dir, struct dentry *new_dentry) |
| 7338 | { | 8059 | { |
| @@ -7498,8 +8219,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 7498 | } | 8219 | } |
| 7499 | } | 8220 | } |
| 7500 | 8221 | ||
| 7501 | fixup_inode_flags(new_dir, old_inode); | ||
| 7502 | |||
| 7503 | ret = btrfs_add_link(trans, new_dir, old_inode, | 8222 | ret = btrfs_add_link(trans, new_dir, old_inode, |
| 7504 | new_dentry->d_name.name, | 8223 | new_dentry->d_name.name, |
| 7505 | new_dentry->d_name.len, 0, index); | 8224 | new_dentry->d_name.len, 0, index); |
| @@ -7583,7 +8302,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) | |||
| 7583 | 8302 | ||
| 7584 | INIT_LIST_HEAD(&works); | 8303 | INIT_LIST_HEAD(&works); |
| 7585 | INIT_LIST_HEAD(&splice); | 8304 | INIT_LIST_HEAD(&splice); |
| 7586 | again: | 8305 | |
| 7587 | spin_lock(&root->fs_info->delalloc_lock); | 8306 | spin_lock(&root->fs_info->delalloc_lock); |
| 7588 | list_splice_init(&root->fs_info->delalloc_inodes, &splice); | 8307 | list_splice_init(&root->fs_info->delalloc_inodes, &splice); |
| 7589 | while (!list_empty(&splice)) { | 8308 | while (!list_empty(&splice)) { |
| @@ -7593,8 +8312,11 @@ again: | |||
| 7593 | list_del_init(&binode->delalloc_inodes); | 8312 | list_del_init(&binode->delalloc_inodes); |
| 7594 | 8313 | ||
| 7595 | inode = igrab(&binode->vfs_inode); | 8314 | inode = igrab(&binode->vfs_inode); |
| 7596 | if (!inode) | 8315 | if (!inode) { |
| 8316 | clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, | ||
| 8317 | &binode->runtime_flags); | ||
| 7597 | continue; | 8318 | continue; |
| 8319 | } | ||
| 7598 | 8320 | ||
| 7599 | list_add_tail(&binode->delalloc_inodes, | 8321 | list_add_tail(&binode->delalloc_inodes, |
| 7600 | &root->fs_info->delalloc_inodes); | 8322 | &root->fs_info->delalloc_inodes); |
| @@ -7619,13 +8341,6 @@ again: | |||
| 7619 | btrfs_wait_and_free_delalloc_work(work); | 8341 | btrfs_wait_and_free_delalloc_work(work); |
| 7620 | } | 8342 | } |
| 7621 | 8343 | ||
| 7622 | spin_lock(&root->fs_info->delalloc_lock); | ||
| 7623 | if (!list_empty(&root->fs_info->delalloc_inodes)) { | ||
| 7624 | spin_unlock(&root->fs_info->delalloc_lock); | ||
| 7625 | goto again; | ||
| 7626 | } | ||
| 7627 | spin_unlock(&root->fs_info->delalloc_lock); | ||
| 7628 | |||
| 7629 | /* the filemap_flush will queue IO into the worker threads, but | 8344 | /* the filemap_flush will queue IO into the worker threads, but |
| 7630 | * we have to make sure the IO is actually started and that | 8345 | * we have to make sure the IO is actually started and that |
| 7631 | * ordered extents get created before we return | 8346 | * ordered extents get created before we return |
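The two hunks above drop the goto-again rescan from btrfs_start_delalloc_inodes(): with other tasks free to keep queueing delalloc inodes, re-splicing the list until it stayed empty could loop indefinitely, so a single pass over the snapshot taken under delalloc_lock is now considered enough. Clearing BTRFS_INODE_IN_DELALLOC_LIST when igrab() fails keeps runtime_flags consistent for an inode that is already on its way to being freed — a reading inferred from the diff itself, not from a changelog.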
| @@ -7801,8 +8516,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, | |||
| 7801 | } | 8516 | } |
| 7802 | } | 8517 | } |
| 7803 | 8518 | ||
| 7804 | ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, | 8519 | ret = btrfs_reserve_extent(trans, root, |
| 7805 | 0, *alloc_hint, &ins, 1); | 8520 | min(num_bytes, 256ULL * 1024 * 1024), |
| 8521 | min_size, 0, *alloc_hint, &ins, 1); | ||
| 7806 | if (ret) { | 8522 | if (ret) { |
| 7807 | if (own_trans) | 8523 | if (own_trans) |
| 7808 | btrfs_end_transaction(trans, root); | 8524 | btrfs_end_transaction(trans, root); |
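The min() added above caps each reservation at 256MB, so a large preallocation request is carved into a series of smaller extents instead of one huge allocation the allocator may be unable to satisfy. A runnable sketch of the resulting chunking (illustration only, not the kernel loop):

    #include <stdio.h>
    #include <stdint.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
    	uint64_t num_bytes = 1ULL << 30;		/* 1 GiB request */
    	const uint64_t cap = 256ULL * 1024 * 1024;	/* per-extent cap */
    	int extents = 0;

    	while (num_bytes > 0) {
    		uint64_t chunk = MIN(num_bytes, cap);	/* the new clamp */
    		num_bytes -= chunk;
    		extents++;
    	}
    	printf("served in %d extents\n", extents);	/* prints 4 */
    	return 0;
    }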
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c3f09f71bedd..c83086fdda05 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
| @@ -42,12 +42,12 @@ | |||
| 42 | #include <linux/slab.h> | 42 | #include <linux/slab.h> |
| 43 | #include <linux/blkdev.h> | 43 | #include <linux/blkdev.h> |
| 44 | #include <linux/uuid.h> | 44 | #include <linux/uuid.h> |
| 45 | #include <linux/btrfs.h> | ||
| 45 | #include "compat.h" | 46 | #include "compat.h" |
| 46 | #include "ctree.h" | 47 | #include "ctree.h" |
| 47 | #include "disk-io.h" | 48 | #include "disk-io.h" |
| 48 | #include "transaction.h" | 49 | #include "transaction.h" |
| 49 | #include "btrfs_inode.h" | 50 | #include "btrfs_inode.h" |
| 50 | #include "ioctl.h" | ||
| 51 | #include "print-tree.h" | 51 | #include "print-tree.h" |
| 52 | #include "volumes.h" | 52 | #include "volumes.h" |
| 53 | #include "locking.h" | 53 | #include "locking.h" |
| @@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) | |||
| 363 | return 0; | 363 | return 0; |
| 364 | } | 364 | } |
| 365 | 365 | ||
| 366 | static noinline int create_subvol(struct btrfs_root *root, | 366 | static noinline int create_subvol(struct inode *dir, |
| 367 | struct dentry *dentry, | 367 | struct dentry *dentry, |
| 368 | char *name, int namelen, | 368 | char *name, int namelen, |
| 369 | u64 *async_transid, | 369 | u64 *async_transid, |
| 370 | struct btrfs_qgroup_inherit **inherit) | 370 | struct btrfs_qgroup_inherit *inherit) |
| 371 | { | 371 | { |
| 372 | struct btrfs_trans_handle *trans; | 372 | struct btrfs_trans_handle *trans; |
| 373 | struct btrfs_key key; | 373 | struct btrfs_key key; |
| 374 | struct btrfs_root_item root_item; | 374 | struct btrfs_root_item root_item; |
| 375 | struct btrfs_inode_item *inode_item; | 375 | struct btrfs_inode_item *inode_item; |
| 376 | struct extent_buffer *leaf; | 376 | struct extent_buffer *leaf; |
| 377 | struct btrfs_root *root = BTRFS_I(dir)->root; | ||
| 377 | struct btrfs_root *new_root; | 378 | struct btrfs_root *new_root; |
| 378 | struct dentry *parent = dentry->d_parent; | 379 | struct btrfs_block_rsv block_rsv; |
| 379 | struct inode *dir; | ||
| 380 | struct timespec cur_time = CURRENT_TIME; | 380 | struct timespec cur_time = CURRENT_TIME; |
| 381 | int ret; | 381 | int ret; |
| 382 | int err; | 382 | int err; |
| 383 | u64 objectid; | 383 | u64 objectid; |
| 384 | u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; | 384 | u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; |
| 385 | u64 index = 0; | 385 | u64 index = 0; |
| 386 | u64 qgroup_reserved; | ||
| 386 | uuid_le new_uuid; | 387 | uuid_le new_uuid; |
| 387 | 388 | ||
| 388 | ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); | 389 | ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); |
| 389 | if (ret) | 390 | if (ret) |
| 390 | return ret; | 391 | return ret; |
| 391 | 392 | ||
| 392 | dir = parent->d_inode; | 393 | btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); |
| 393 | |||
| 394 | /* | 394 | /* |
| 395 | * 1 - inode item | 395 | * The same as the snapshot creation, please see the comment |
| 396 | * 2 - refs | 396 | * of create_snapshot(). |
| 397 | * 1 - root item | ||
| 398 | * 2 - dir items | ||
| 399 | */ | 397 | */ |
| 400 | trans = btrfs_start_transaction(root, 6); | 398 | ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, |
| 401 | if (IS_ERR(trans)) | 399 | 7, &qgroup_reserved); |
| 402 | return PTR_ERR(trans); | 400 | if (ret) |
| 401 | return ret; | ||
| 402 | |||
| 403 | trans = btrfs_start_transaction(root, 0); | ||
| 404 | if (IS_ERR(trans)) { | ||
| 405 | ret = PTR_ERR(trans); | ||
| 406 | goto out; | ||
| 407 | } | ||
| 408 | trans->block_rsv = &block_rsv; | ||
| 409 | trans->bytes_reserved = block_rsv.size; | ||
| 403 | 410 | ||
| 404 | ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, | 411 | ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit); |
| 405 | inherit ? *inherit : NULL); | ||
| 406 | if (ret) | 412 | if (ret) |
| 407 | goto fail; | 413 | goto fail; |
| 408 | 414 | ||
| @@ -516,6 +522,8 @@ static noinline int create_subvol(struct btrfs_root *root, | |||
| 516 | BUG_ON(ret); | 522 | BUG_ON(ret); |
| 517 | 523 | ||
| 518 | fail: | 524 | fail: |
| 525 | trans->block_rsv = NULL; | ||
| 526 | trans->bytes_reserved = 0; | ||
| 519 | if (async_transid) { | 527 | if (async_transid) { |
| 520 | *async_transid = trans->transid; | 528 | *async_transid = trans->transid; |
| 521 | err = btrfs_commit_transaction_async(trans, root, 1); | 529 | err = btrfs_commit_transaction_async(trans, root, 1); |
| @@ -527,13 +535,15 @@ fail: | |||
| 527 | 535 | ||
| 528 | if (!ret) | 536 | if (!ret) |
| 529 | d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); | 537 | d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); |
| 530 | 538 | out: | |
| 539 | btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); | ||
| 531 | return ret; | 540 | return ret; |
| 532 | } | 541 | } |
| 533 | 542 | ||
| 534 | static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, | 543 | static int create_snapshot(struct btrfs_root *root, struct inode *dir, |
| 535 | char *name, int namelen, u64 *async_transid, | 544 | struct dentry *dentry, char *name, int namelen, |
| 536 | bool readonly, struct btrfs_qgroup_inherit **inherit) | 545 | u64 *async_transid, bool readonly, |
| 546 | struct btrfs_qgroup_inherit *inherit) | ||
| 537 | { | 547 | { |
| 538 | struct inode *inode; | 548 | struct inode *inode; |
| 539 | struct btrfs_pending_snapshot *pending_snapshot; | 549 | struct btrfs_pending_snapshot *pending_snapshot; |
| @@ -549,23 +559,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, | |||
| 549 | 559 | ||
| 550 | btrfs_init_block_rsv(&pending_snapshot->block_rsv, | 560 | btrfs_init_block_rsv(&pending_snapshot->block_rsv, |
| 551 | BTRFS_BLOCK_RSV_TEMP); | 561 | BTRFS_BLOCK_RSV_TEMP); |
| 562 | /* | ||
| 563 | * 1 - parent dir inode | ||
| 564 | * 2 - dir entries | ||
| 565 | * 1 - root item | ||
| 566 | * 2 - root ref/backref | ||
| 567 | * 1 - root of snapshot | ||
| 568 | */ | ||
| 569 | ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, | ||
| 570 | &pending_snapshot->block_rsv, 7, | ||
| 571 | &pending_snapshot->qgroup_reserved); | ||
| 572 | if (ret) | ||
| 573 | goto out; | ||
| 574 | |||
| 552 | pending_snapshot->dentry = dentry; | 575 | pending_snapshot->dentry = dentry; |
| 553 | pending_snapshot->root = root; | 576 | pending_snapshot->root = root; |
| 554 | pending_snapshot->readonly = readonly; | 577 | pending_snapshot->readonly = readonly; |
| 555 | if (inherit) { | 578 | pending_snapshot->dir = dir; |
| 556 | pending_snapshot->inherit = *inherit; | 579 | pending_snapshot->inherit = inherit; |
| 557 | *inherit = NULL; /* take responsibility to free it */ | ||
| 558 | } | ||
| 559 | 580 | ||
| 560 | trans = btrfs_start_transaction(root->fs_info->extent_root, 6); | 581 | trans = btrfs_start_transaction(root, 0); |
| 561 | if (IS_ERR(trans)) { | 582 | if (IS_ERR(trans)) { |
| 562 | ret = PTR_ERR(trans); | 583 | ret = PTR_ERR(trans); |
| 563 | goto fail; | 584 | goto fail; |
| 564 | } | 585 | } |
| 565 | 586 | ||
| 566 | ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); | ||
| 567 | BUG_ON(ret); | ||
| 568 | |||
| 569 | spin_lock(&root->fs_info->trans_lock); | 587 | spin_lock(&root->fs_info->trans_lock); |
| 570 | list_add(&pending_snapshot->list, | 588 | list_add(&pending_snapshot->list, |
| 571 | &trans->transaction->pending_snapshots); | 589 | &trans->transaction->pending_snapshots); |
| @@ -602,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, | |||
| 602 | d_instantiate(dentry, inode); | 620 | d_instantiate(dentry, inode); |
| 603 | ret = 0; | 621 | ret = 0; |
| 604 | fail: | 622 | fail: |
| 623 | btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, | ||
| 624 | &pending_snapshot->block_rsv, | ||
| 625 | pending_snapshot->qgroup_reserved); | ||
| 626 | out: | ||
| 605 | kfree(pending_snapshot); | 627 | kfree(pending_snapshot); |
| 606 | return ret; | 628 | return ret; |
| 607 | } | 629 | } |
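The unit count passed to btrfs_subvolume_reserve_metadata() above is the sum of the enumerated tree operations: 1 (parent dir inode) + 2 (dir entries) + 1 (root item) + 2 (root ref/backref) + 1 (root of snapshot) = 7. create_subvol() reuses the same figure, and both error paths now release the reservation through btrfs_subvolume_release_metadata() whether or not the transaction ever started.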
| @@ -695,7 +717,7 @@ static noinline int btrfs_mksubvol(struct path *parent, | |||
| 695 | char *name, int namelen, | 717 | char *name, int namelen, |
| 696 | struct btrfs_root *snap_src, | 718 | struct btrfs_root *snap_src, |
| 697 | u64 *async_transid, bool readonly, | 719 | u64 *async_transid, bool readonly, |
| 698 | struct btrfs_qgroup_inherit **inherit) | 720 | struct btrfs_qgroup_inherit *inherit) |
| 699 | { | 721 | { |
| 700 | struct inode *dir = parent->dentry->d_inode; | 722 | struct inode *dir = parent->dentry->d_inode; |
| 701 | struct dentry *dentry; | 723 | struct dentry *dentry; |
| @@ -732,11 +754,11 @@ static noinline int btrfs_mksubvol(struct path *parent, | |||
| 732 | goto out_up_read; | 754 | goto out_up_read; |
| 733 | 755 | ||
| 734 | if (snap_src) { | 756 | if (snap_src) { |
| 735 | error = create_snapshot(snap_src, dentry, name, namelen, | 757 | error = create_snapshot(snap_src, dir, dentry, name, namelen, |
| 736 | async_transid, readonly, inherit); | 758 | async_transid, readonly, inherit); |
| 737 | } else { | 759 | } else { |
| 738 | error = create_subvol(BTRFS_I(dir)->root, dentry, | 760 | error = create_subvol(dir, dentry, name, namelen, |
| 739 | name, namelen, async_transid, inherit); | 761 | async_transid, inherit); |
| 740 | } | 762 | } |
| 741 | if (!error) | 763 | if (!error) |
| 742 | fsnotify_mkdir(dir, dentry); | 764 | fsnotify_mkdir(dir, dentry); |
| @@ -818,7 +840,7 @@ static int find_new_extents(struct btrfs_root *root, | |||
| 818 | 840 | ||
| 819 | while(1) { | 841 | while(1) { |
| 820 | ret = btrfs_search_forward(root, &min_key, &max_key, | 842 | ret = btrfs_search_forward(root, &min_key, &max_key, |
| 821 | path, 0, newer_than); | 843 | path, newer_than); |
| 822 | if (ret != 0) | 844 | if (ret != 0) |
| 823 | goto none; | 845 | goto none; |
| 824 | if (min_key.objectid != ino) | 846 | if (min_key.objectid != ino) |
| @@ -1206,6 +1228,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
| 1206 | if (!(inode->i_sb->s_flags & MS_ACTIVE)) | 1228 | if (!(inode->i_sb->s_flags & MS_ACTIVE)) |
| 1207 | break; | 1229 | break; |
| 1208 | 1230 | ||
| 1231 | if (btrfs_defrag_cancelled(root->fs_info)) { | ||
| 1232 | printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); | ||
| 1233 | ret = -EAGAIN; | ||
| 1234 | break; | ||
| 1235 | } | ||
| 1236 | |||
| 1209 | if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, | 1237 | if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, |
| 1210 | extent_thresh, &last_len, &skip, | 1238 | extent_thresh, &last_len, &skip, |
| 1211 | &defrag_end, range->flags & | 1239 | &defrag_end, range->flags & |
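With the new cancellation point, a long-running defrag can bail out mid-file with -EAGAIN whenever btrfs_defrag_cancelled() — defined outside this hunk — says so. A userspace caller that wants to finish the job can simply retry; the helper below is a hypothetical sketch assuming the uapi definitions from linux/btrfs.h:

    #include <errno.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>	/* BTRFS_IOC_DEFRAG_RANGE */

    /* Hypothetical helper: restart a range defrag that the kernel
     * cancelled partway through. */
    static int defrag_with_retry(int fd,
    			     struct btrfs_ioctl_defrag_range_args *args)
    {
    	int ret;

    	do {
    		ret = ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, args);
    	} while (ret < 0 && errno == EAGAIN);
    	return ret;
    }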
| @@ -1329,9 +1357,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
| 1329 | int ret = 0; | 1357 | int ret = 0; |
| 1330 | int mod = 0; | 1358 | int mod = 0; |
| 1331 | 1359 | ||
| 1332 | if (root->fs_info->sb->s_flags & MS_RDONLY) | ||
| 1333 | return -EROFS; | ||
| 1334 | |||
| 1335 | if (!capable(CAP_SYS_ADMIN)) | 1360 | if (!capable(CAP_SYS_ADMIN)) |
| 1336 | return -EPERM; | 1361 | return -EPERM; |
| 1337 | 1362 | ||
| @@ -1363,6 +1388,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
| 1363 | *devstr = '\0'; | 1388 | *devstr = '\0'; |
| 1364 | devstr = vol_args->name; | 1389 | devstr = vol_args->name; |
| 1365 | devid = simple_strtoull(devstr, &end, 10); | 1390 | devid = simple_strtoull(devstr, &end, 10); |
| 1391 | if (!devid) { | ||
| 1392 | ret = -EINVAL; | ||
| 1393 | goto out_free; | ||
| 1394 | } | ||
| 1366 | printk(KERN_INFO "btrfs: resizing devid %llu\n", | 1395 | printk(KERN_INFO "btrfs: resizing devid %llu\n", |
| 1367 | (unsigned long long)devid); | 1396 | (unsigned long long)devid); |
| 1368 | } | 1397 | } |
| @@ -1371,7 +1400,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
| 1371 | if (!device) { | 1400 | if (!device) { |
| 1372 | printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", | 1401 | printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", |
| 1373 | (unsigned long long)devid); | 1402 | (unsigned long long)devid); |
| 1374 | ret = -EINVAL; | 1403 | ret = -ENODEV; |
| 1375 | goto out_free; | 1404 | goto out_free; |
| 1376 | } | 1405 | } |
| 1377 | 1406 | ||
| @@ -1379,7 +1408,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
| 1379 | printk(KERN_INFO "btrfs: resizer unable to apply on " | 1408 | printk(KERN_INFO "btrfs: resizer unable to apply on " |
| 1380 | "readonly device %llu\n", | 1409 | "readonly device %llu\n", |
| 1381 | (unsigned long long)devid); | 1410 | (unsigned long long)devid); |
| 1382 | ret = -EINVAL; | 1411 | ret = -EPERM; |
| 1383 | goto out_free; | 1412 | goto out_free; |
| 1384 | } | 1413 | } |
| 1385 | 1414 | ||
| @@ -1401,7 +1430,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
| 1401 | } | 1430 | } |
| 1402 | 1431 | ||
| 1403 | if (device->is_tgtdev_for_dev_replace) { | 1432 | if (device->is_tgtdev_for_dev_replace) { |
| 1404 | ret = -EINVAL; | 1433 | ret = -EPERM; |
| 1405 | goto out_free; | 1434 | goto out_free; |
| 1406 | } | 1435 | } |
| 1407 | 1436 | ||
| @@ -1457,7 +1486,7 @@ out: | |||
| 1457 | static noinline int btrfs_ioctl_snap_create_transid(struct file *file, | 1486 | static noinline int btrfs_ioctl_snap_create_transid(struct file *file, |
| 1458 | char *name, unsigned long fd, int subvol, | 1487 | char *name, unsigned long fd, int subvol, |
| 1459 | u64 *transid, bool readonly, | 1488 | u64 *transid, bool readonly, |
| 1460 | struct btrfs_qgroup_inherit **inherit) | 1489 | struct btrfs_qgroup_inherit *inherit) |
| 1461 | { | 1490 | { |
| 1462 | int namelen; | 1491 | int namelen; |
| 1463 | int ret = 0; | 1492 | int ret = 0; |
| @@ -1566,7 +1595,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, | |||
| 1566 | 1595 | ||
| 1567 | ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, | 1596 | ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, |
| 1568 | vol_args->fd, subvol, ptr, | 1597 | vol_args->fd, subvol, ptr, |
| 1569 | readonly, &inherit); | 1598 | readonly, inherit); |
| 1570 | 1599 | ||
| 1571 | if (ret == 0 && ptr && | 1600 | if (ret == 0 && ptr && |
| 1572 | copy_to_user(arg + | 1601 | copy_to_user(arg + |
| @@ -1863,7 +1892,7 @@ static noinline int search_ioctl(struct inode *inode, | |||
| 1863 | path->keep_locks = 1; | 1892 | path->keep_locks = 1; |
| 1864 | 1893 | ||
| 1865 | while(1) { | 1894 | while(1) { |
| 1866 | ret = btrfs_search_forward(root, &key, &max_key, path, 0, | 1895 | ret = btrfs_search_forward(root, &key, &max_key, path, |
| 1867 | sk->min_transid); | 1896 | sk->min_transid); |
| 1868 | if (ret != 0) { | 1897 | if (ret != 0) { |
| 1869 | if (ret > 0) | 1898 | if (ret > 0) |
| @@ -2035,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, | |||
| 2035 | struct btrfs_root *dest = NULL; | 2064 | struct btrfs_root *dest = NULL; |
| 2036 | struct btrfs_ioctl_vol_args *vol_args; | 2065 | struct btrfs_ioctl_vol_args *vol_args; |
| 2037 | struct btrfs_trans_handle *trans; | 2066 | struct btrfs_trans_handle *trans; |
| 2067 | struct btrfs_block_rsv block_rsv; | ||
| 2068 | u64 qgroup_reserved; | ||
| 2038 | int namelen; | 2069 | int namelen; |
| 2039 | int ret; | 2070 | int ret; |
| 2040 | int err = 0; | 2071 | int err = 0; |
| @@ -2124,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, | |||
| 2124 | if (err) | 2155 | if (err) |
| 2125 | goto out_up_write; | 2156 | goto out_up_write; |
| 2126 | 2157 | ||
| 2158 | btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); | ||
| 2159 | /* | ||
| 2160 | * One for dir inode, two for dir entries, two for root | ||
| 2161 | * ref/backref. | ||
| 2162 | */ | ||
| 2163 | err = btrfs_subvolume_reserve_metadata(root, &block_rsv, | ||
| 2164 | 5, &qgroup_reserved); | ||
| 2165 | if (err) | ||
| 2166 | goto out_up_write; | ||
| 2167 | |||
| 2127 | trans = btrfs_start_transaction(root, 0); | 2168 | trans = btrfs_start_transaction(root, 0); |
| 2128 | if (IS_ERR(trans)) { | 2169 | if (IS_ERR(trans)) { |
| 2129 | err = PTR_ERR(trans); | 2170 | err = PTR_ERR(trans); |
| 2130 | goto out_up_write; | 2171 | goto out_release; |
| 2131 | } | 2172 | } |
| 2132 | trans->block_rsv = &root->fs_info->global_block_rsv; | 2173 | trans->block_rsv = &block_rsv; |
| 2174 | trans->bytes_reserved = block_rsv.size; | ||
| 2133 | 2175 | ||
| 2134 | ret = btrfs_unlink_subvol(trans, root, dir, | 2176 | ret = btrfs_unlink_subvol(trans, root, dir, |
| 2135 | dest->root_key.objectid, | 2177 | dest->root_key.objectid, |
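Here the reservation comment sums to the 5 units requested: 1 (dir inode) + 2 (dir entries) + 2 (root ref/backref) = 5, mirroring the accounting style used by create_snapshot() above; if starting the transaction fails, the new out_release label returns the metadata before unwinding.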
| @@ -2159,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, | |||
| 2159 | } | 2201 | } |
| 2160 | } | 2202 | } |
| 2161 | out_end_trans: | 2203 | out_end_trans: |
| 2204 | trans->block_rsv = NULL; | ||
| 2205 | trans->bytes_reserved = 0; | ||
| 2162 | ret = btrfs_end_transaction(trans, root); | 2206 | ret = btrfs_end_transaction(trans, root); |
| 2163 | if (ret && !err) | 2207 | if (ret && !err) |
| 2164 | err = ret; | 2208 | err = ret; |
| 2165 | inode->i_flags |= S_DEAD; | 2209 | inode->i_flags |= S_DEAD; |
| 2210 | out_release: | ||
| 2211 | btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); | ||
| 2166 | out_up_write: | 2212 | out_up_write: |
| 2167 | up_write(&root->fs_info->subvol_sem); | 2213 | up_write(&root->fs_info->subvol_sem); |
| 2168 | out_unlock: | 2214 | out_unlock: |
| @@ -2171,6 +2217,12 @@ out_unlock: | |||
| 2171 | shrink_dcache_sb(root->fs_info->sb); | 2217 | shrink_dcache_sb(root->fs_info->sb); |
| 2172 | btrfs_invalidate_inodes(dest); | 2218 | btrfs_invalidate_inodes(dest); |
| 2173 | d_delete(dentry); | 2219 | d_delete(dentry); |
| 2220 | |||
| 2221 | /* the last ref */ | ||
| 2222 | if (dest->cache_inode) { | ||
| 2223 | iput(dest->cache_inode); | ||
| 2224 | dest->cache_inode = NULL; | ||
| 2225 | } | ||
| 2174 | } | 2226 | } |
| 2175 | out_dput: | 2227 | out_dput: |
| 2176 | dput(dentry); | 2228 | dput(dentry); |
| @@ -2211,10 +2263,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) | |||
| 2211 | ret = -EPERM; | 2263 | ret = -EPERM; |
| 2212 | goto out; | 2264 | goto out; |
| 2213 | } | 2265 | } |
| 2214 | ret = btrfs_defrag_root(root, 0); | 2266 | ret = btrfs_defrag_root(root); |
| 2215 | if (ret) | 2267 | if (ret) |
| 2216 | goto out; | 2268 | goto out; |
| 2217 | ret = btrfs_defrag_root(root->fs_info->extent_root, 0); | 2269 | ret = btrfs_defrag_root(root->fs_info->extent_root); |
| 2218 | break; | 2270 | break; |
| 2219 | case S_IFREG: | 2271 | case S_IFREG: |
| 2220 | if (!(file->f_mode & FMODE_WRITE)) { | 2272 | if (!(file->f_mode & FMODE_WRITE)) { |
| @@ -3111,7 +3163,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, | |||
| 3111 | u64 transid; | 3163 | u64 transid; |
| 3112 | int ret; | 3164 | int ret; |
| 3113 | 3165 | ||
| 3114 | trans = btrfs_attach_transaction(root); | 3166 | trans = btrfs_attach_transaction_barrier(root); |
| 3115 | if (IS_ERR(trans)) { | 3167 | if (IS_ERR(trans)) { |
| 3116 | if (PTR_ERR(trans) != -ENOENT) | 3168 | if (PTR_ERR(trans) != -ENOENT) |
| 3117 | return PTR_ERR(trans); | 3169 | return PTR_ERR(trans); |
| @@ -3289,7 +3341,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) | |||
| 3289 | struct inode_fs_paths *ipath = NULL; | 3341 | struct inode_fs_paths *ipath = NULL; |
| 3290 | struct btrfs_path *path; | 3342 | struct btrfs_path *path; |
| 3291 | 3343 | ||
| 3292 | if (!capable(CAP_SYS_ADMIN)) | 3344 | if (!capable(CAP_DAC_READ_SEARCH)) |
| 3293 | return -EPERM; | 3345 | return -EPERM; |
| 3294 | 3346 | ||
| 3295 | path = btrfs_alloc_path(); | 3347 | path = btrfs_alloc_path(); |
| @@ -3914,6 +3966,65 @@ out: | |||
| 3914 | return ret; | 3966 | return ret; |
| 3915 | } | 3967 | } |
| 3916 | 3968 | ||
| 3969 | static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) | ||
| 3970 | { | ||
| 3971 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
| 3972 | const char *label = root->fs_info->super_copy->label; | ||
| 3973 | size_t len = strnlen(label, BTRFS_LABEL_SIZE); | ||
| 3974 | int ret; | ||
| 3975 | |||
| 3976 | if (len == BTRFS_LABEL_SIZE) { | ||
| 3977 | pr_warn("btrfs: label is too long, returning the first %zu bytes\n", | ||
| 3978 | --len); | ||
| 3979 | } | ||
| 3980 | |||
| 3981 | mutex_lock(&root->fs_info->volume_mutex); | ||
| 3982 | ret = copy_to_user(arg, label, len); | ||
| 3983 | mutex_unlock(&root->fs_info->volume_mutex); | ||
| 3984 | |||
| 3985 | return ret ? -EFAULT : 0; | ||
| 3986 | } | ||
| 3987 | |||
| 3988 | static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) | ||
| 3989 | { | ||
| 3990 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
| 3991 | struct btrfs_super_block *super_block = root->fs_info->super_copy; | ||
| 3992 | struct btrfs_trans_handle *trans; | ||
| 3993 | char label[BTRFS_LABEL_SIZE]; | ||
| 3994 | int ret; | ||
| 3995 | |||
| 3996 | if (!capable(CAP_SYS_ADMIN)) | ||
| 3997 | return -EPERM; | ||
| 3998 | |||
| 3999 | if (copy_from_user(label, arg, sizeof(label))) | ||
| 4000 | return -EFAULT; | ||
| 4001 | |||
| 4002 | if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { | ||
| 4003 | pr_err("btrfs: unable to set label with more than %d bytes\n", | ||
| 4004 | BTRFS_LABEL_SIZE - 1); | ||
| 4005 | return -EINVAL; | ||
| 4006 | } | ||
| 4007 | |||
| 4008 | ret = mnt_want_write_file(file); | ||
| 4009 | if (ret) | ||
| 4010 | return ret; | ||
| 4011 | |||
| 4012 | mutex_lock(&root->fs_info->volume_mutex); | ||
| 4013 | trans = btrfs_start_transaction(root, 0); | ||
| 4014 | if (IS_ERR(trans)) { | ||
| 4015 | ret = PTR_ERR(trans); | ||
| 4016 | goto out_unlock; | ||
| 4017 | } | ||
| 4018 | |||
| 4019 | strcpy(super_block->label, label); | ||
| 4020 | ret = btrfs_end_transaction(trans, root); | ||
| 4021 | |||
| 4022 | out_unlock: | ||
| 4023 | mutex_unlock(&root->fs_info->volume_mutex); | ||
| 4024 | mnt_drop_write_file(file); | ||
| 4025 | return ret; | ||
| 4026 | } | ||
| 4027 | |||
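The two new ioctls give userspace a race-free way to read and write the label stored in the superblock (the set path goes through a transaction, the get path through volume_mutex). A minimal userspace sketch — the mount point is a placeholder, and the macro definitions are assumed to come from include/uapi/linux/btrfs.h:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>	/* BTRFS_IOC_{GET,SET}_FSLABEL, BTRFS_LABEL_SIZE */

    int main(void)
    {
    	char label[BTRFS_LABEL_SIZE] = { 0 };
    	int fd = open("/mnt/btrfs", O_RDONLY);	/* hypothetical mount point */

    	if (fd < 0)
    		return 1;
    	if (ioctl(fd, BTRFS_IOC_GET_FSLABEL, label) == 0)
    		printf("label: %s\n", label);

    	/* must stay shorter than BTRFS_LABEL_SIZE or the kernel rejects it */
    	strncpy(label, "backups", BTRFS_LABEL_SIZE - 1);
    	if (ioctl(fd, BTRFS_IOC_SET_FSLABEL, label) != 0)
    		perror("set label");
    	close(fd);
    	return 0;
    }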
| 3917 | long btrfs_ioctl(struct file *file, unsigned int | 4028 | long btrfs_ioctl(struct file *file, unsigned int |
| 3918 | cmd, unsigned long arg) | 4029 | cmd, unsigned long arg) |
| 3919 | { | 4030 | { |
| @@ -4014,6 +4125,10 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
| 4014 | return btrfs_ioctl_qgroup_limit(file, argp); | 4125 | return btrfs_ioctl_qgroup_limit(file, argp); |
| 4015 | case BTRFS_IOC_DEV_REPLACE: | 4126 | case BTRFS_IOC_DEV_REPLACE: |
| 4016 | return btrfs_ioctl_dev_replace(root, argp); | 4127 | return btrfs_ioctl_dev_replace(root, argp); |
| 4128 | case BTRFS_IOC_GET_FSLABEL: | ||
| 4129 | return btrfs_ioctl_get_fslabel(file, argp); | ||
| 4130 | case BTRFS_IOC_SET_FSLABEL: | ||
| 4131 | return btrfs_ioctl_set_fslabel(file, argp); | ||
| 4017 | } | 4132 | } |
| 4018 | 4133 | ||
| 4019 | return -ENOTTY; | 4134 | return -ENOTTY; |
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h deleted file mode 100644 index dabca9cc8c2e..000000000000 --- a/fs/btrfs/ioctl.h +++ /dev/null | |||
| @@ -1,502 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public | ||
| 6 | * License v2 as published by the Free Software Foundation. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, | ||
| 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 11 | * General Public License for more details. | ||
| 12 | * | ||
| 13 | * You should have received a copy of the GNU General Public | ||
| 14 | * License along with this program; if not, write to the | ||
| 15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 16 | * Boston, MA 021110-1307, USA. | ||
| 17 | */ | ||
| 18 | |||
| 19 | #ifndef __IOCTL_ | ||
| 20 | #define __IOCTL_ | ||
| 21 | #include <linux/ioctl.h> | ||
| 22 | |||
| 23 | #define BTRFS_IOCTL_MAGIC 0x94 | ||
| 24 | #define BTRFS_VOL_NAME_MAX 255 | ||
| 25 | |||
| 26 | /* this should be 4k */ | ||
| 27 | #define BTRFS_PATH_NAME_MAX 4087 | ||
| 28 | struct btrfs_ioctl_vol_args { | ||
| 29 | __s64 fd; | ||
| 30 | char name[BTRFS_PATH_NAME_MAX + 1]; | ||
| 31 | }; | ||
| 32 | |||
| 33 | #define BTRFS_DEVICE_PATH_NAME_MAX 1024 | ||
| 34 | |||
| 35 | #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) | ||
| 36 | #define BTRFS_SUBVOL_RDONLY (1ULL << 1) | ||
| 37 | #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) | ||
| 38 | #define BTRFS_FSID_SIZE 16 | ||
| 39 | #define BTRFS_UUID_SIZE 16 | ||
| 40 | |||
| 41 | #define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0) | ||
| 42 | |||
| 43 | struct btrfs_qgroup_limit { | ||
| 44 | __u64 flags; | ||
| 45 | __u64 max_rfer; | ||
| 46 | __u64 max_excl; | ||
| 47 | __u64 rsv_rfer; | ||
| 48 | __u64 rsv_excl; | ||
| 49 | }; | ||
| 50 | |||
| 51 | struct btrfs_qgroup_inherit { | ||
| 52 | __u64 flags; | ||
| 53 | __u64 num_qgroups; | ||
| 54 | __u64 num_ref_copies; | ||
| 55 | __u64 num_excl_copies; | ||
| 56 | struct btrfs_qgroup_limit lim; | ||
| 57 | __u64 qgroups[0]; | ||
| 58 | }; | ||
| 59 | |||
| 60 | struct btrfs_ioctl_qgroup_limit_args { | ||
| 61 | __u64 qgroupid; | ||
| 62 | struct btrfs_qgroup_limit lim; | ||
| 63 | }; | ||
| 64 | |||
| 65 | #define BTRFS_SUBVOL_NAME_MAX 4039 | ||
| 66 | struct btrfs_ioctl_vol_args_v2 { | ||
| 67 | __s64 fd; | ||
| 68 | __u64 transid; | ||
| 69 | __u64 flags; | ||
| 70 | union { | ||
| 71 | struct { | ||
| 72 | __u64 size; | ||
| 73 | struct btrfs_qgroup_inherit __user *qgroup_inherit; | ||
| 74 | }; | ||
| 75 | __u64 unused[4]; | ||
| 76 | }; | ||
| 77 | char name[BTRFS_SUBVOL_NAME_MAX + 1]; | ||
| 78 | }; | ||
| 79 | |||
| 80 | /* | ||
| 81 | * structure to report errors and progress to userspace, either as a | ||
| 82 | * result of a finished scrub, a canceled scrub or a progress inquiry | ||
| 83 | */ | ||
| 84 | struct btrfs_scrub_progress { | ||
| 85 | __u64 data_extents_scrubbed; /* # of data extents scrubbed */ | ||
| 86 | __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */ | ||
| 87 | __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */ | ||
| 88 | __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */ | ||
| 89 | __u64 read_errors; /* # of read errors encountered (EIO) */ | ||
| 90 | __u64 csum_errors; /* # of failed csum checks */ | ||
| 91 | __u64 verify_errors; /* # of occurrences where the metadata | ||
| 92 | * of a tree block did not match the | ||
| 93 | * expected values, like generation or | ||
| 94 | * logical */ | ||
| 95 | __u64 no_csum; /* # of 4k data blocks for which no csum | ||
| 96 | * is present, probably the result of | ||
| 97 | * data written with nodatasum */ | ||
| 98 | __u64 csum_discards; /* # of csum for which no data was found | ||
| 99 | * in the extent tree. */ | ||
| 100 | __u64 super_errors; /* # of bad super blocks encountered */ | ||
| 101 | __u64 malloc_errors; /* # of internal kmalloc errors. These | ||
| 102 | * will likely cause an incomplete | ||
| 103 | * scrub */ | ||
| 104 | __u64 uncorrectable_errors; /* # of errors where either no intact | ||
| 105 | * copy was found or the writeback | ||
| 106 | * failed */ | ||
| 107 | __u64 corrected_errors; /* # of errors corrected */ | ||
| 108 | __u64 last_physical; /* last physical address scrubbed. In | ||
| 109 | * case a scrub was aborted, this can | ||
| 110 | * be used to restart the scrub */ | ||
| 111 | __u64 unverified_errors; /* # of occurrences where a read for a | ||
| 112 | * full (64k) bio failed, but the re- | ||
| 113 | * check succeeded for each 4k piece. | ||
| 114 | * Intermittent error. */ | ||
| 115 | }; | ||
| 116 | |||
| 117 | #define BTRFS_SCRUB_READONLY 1 | ||
| 118 | struct btrfs_ioctl_scrub_args { | ||
| 119 | __u64 devid; /* in */ | ||
| 120 | __u64 start; /* in */ | ||
| 121 | __u64 end; /* in */ | ||
| 122 | __u64 flags; /* in */ | ||
| 123 | struct btrfs_scrub_progress progress; /* out */ | ||
| 124 | /* pad to 1k */ | ||
| 125 | __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; | ||
| 126 | }; | ||
| 127 | |||
| 128 | #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 | ||
| 129 | #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 | ||
| 130 | struct btrfs_ioctl_dev_replace_start_params { | ||
| 131 | __u64 srcdevid; /* in, if 0, use srcdev_name instead */ | ||
| 132 | __u64 cont_reading_from_srcdev_mode; /* in, see #define | ||
| 133 | * above */ | ||
| 134 | __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ | ||
| 135 | __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ | ||
| 136 | }; | ||
| 137 | |||
| 138 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 | ||
| 139 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1 | ||
| 140 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 | ||
| 141 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 | ||
| 142 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 | ||
| 143 | struct btrfs_ioctl_dev_replace_status_params { | ||
| 144 | __u64 replace_state; /* out, see #define above */ | ||
| 145 | __u64 progress_1000; /* out, 0 <= x <= 1000 */ | ||
| 146 | __u64 time_started; /* out, seconds since 1-Jan-1970 */ | ||
| 147 | __u64 time_stopped; /* out, seconds since 1-Jan-1970 */ | ||
| 148 | __u64 num_write_errors; /* out */ | ||
| 149 | __u64 num_uncorrectable_read_errors; /* out */ | ||
| 150 | }; | ||
| 151 | |||
| 152 | #define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0 | ||
| 153 | #define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1 | ||
| 154 | #define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2 | ||
| 155 | #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 | ||
| 156 | #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 | ||
| 157 | #define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 | ||
| 158 | struct btrfs_ioctl_dev_replace_args { | ||
| 159 | __u64 cmd; /* in */ | ||
| 160 | __u64 result; /* out */ | ||
| 161 | |||
| 162 | union { | ||
| 163 | struct btrfs_ioctl_dev_replace_start_params start; | ||
| 164 | struct btrfs_ioctl_dev_replace_status_params status; | ||
| 165 | }; /* in/out */ | ||
| 166 | |||
| 167 | __u64 spare[64]; | ||
| 168 | }; | ||
| 169 | |||
| 170 | struct btrfs_ioctl_dev_info_args { | ||
| 171 | __u64 devid; /* in/out */ | ||
| 172 | __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ | ||
| 173 | __u64 bytes_used; /* out */ | ||
| 174 | __u64 total_bytes; /* out */ | ||
| 175 | __u64 unused[379]; /* pad to 4k */ | ||
| 176 | __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ | ||
| 177 | }; | ||
| 178 | |||
| 179 | struct btrfs_ioctl_fs_info_args { | ||
| 180 | __u64 max_id; /* out */ | ||
| 181 | __u64 num_devices; /* out */ | ||
| 182 | __u8 fsid[BTRFS_FSID_SIZE]; /* out */ | ||
| 183 | __u64 reserved[124]; /* pad to 1k */ | ||
| 184 | }; | ||
| 185 | |||
| 186 | /* balance control ioctl modes */ | ||
| 187 | #define BTRFS_BALANCE_CTL_PAUSE 1 | ||
| 188 | #define BTRFS_BALANCE_CTL_CANCEL 2 | ||
| 189 | |||
| 190 | /* | ||
| 191 | * this is packed, because it should be exactly the same as its disk | ||
| 192 | * byte order counterpart (struct btrfs_disk_balance_args) | ||
| 193 | */ | ||
| 194 | struct btrfs_balance_args { | ||
| 195 | __u64 profiles; | ||
| 196 | __u64 usage; | ||
| 197 | __u64 devid; | ||
| 198 | __u64 pstart; | ||
| 199 | __u64 pend; | ||
| 200 | __u64 vstart; | ||
| 201 | __u64 vend; | ||
| 202 | |||
| 203 | __u64 target; | ||
| 204 | |||
| 205 | __u64 flags; | ||
| 206 | |||
| 207 | __u64 unused[8]; | ||
| 208 | } __attribute__ ((__packed__)); | ||
| 209 | |||
| 210 | /* report balance progress to userspace */ | ||
| 211 | struct btrfs_balance_progress { | ||
| 212 | __u64 expected; /* estimated # of chunks that will be | ||
| 213 | * relocated to fulfill the request */ | ||
| 214 | __u64 considered; /* # of chunks we have considered so far */ | ||
| 215 | __u64 completed; /* # of chunks relocated so far */ | ||
| 216 | }; | ||
| 217 | |||
| 218 | #define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) | ||
| 219 | #define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) | ||
| 220 | #define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) | ||
| 221 | |||
| 222 | struct btrfs_ioctl_balance_args { | ||
| 223 | __u64 flags; /* in/out */ | ||
| 224 | __u64 state; /* out */ | ||
| 225 | |||
| 226 | struct btrfs_balance_args data; /* in/out */ | ||
| 227 | struct btrfs_balance_args meta; /* in/out */ | ||
| 228 | struct btrfs_balance_args sys; /* in/out */ | ||
| 229 | |||
| 230 | struct btrfs_balance_progress stat; /* out */ | ||
| 231 | |||
| 232 | __u64 unused[72]; /* pad to 1k */ | ||
| 233 | }; | ||
| 234 | |||
| 235 | #define BTRFS_INO_LOOKUP_PATH_MAX 4080 | ||
| 236 | struct btrfs_ioctl_ino_lookup_args { | ||
| 237 | __u64 treeid; | ||
| 238 | __u64 objectid; | ||
| 239 | char name[BTRFS_INO_LOOKUP_PATH_MAX]; | ||
| 240 | }; | ||
| 241 | |||
| 242 | struct btrfs_ioctl_search_key { | ||
| 243 | /* which root are we searching. 0 is the tree of tree roots */ | ||
| 244 | __u64 tree_id; | ||
| 245 | |||
| 246 | /* keys returned will be >= min and <= max */ | ||
| 247 | __u64 min_objectid; | ||
| 248 | __u64 max_objectid; | ||
| 249 | |||
| 250 | /* keys returned will be >= min and <= max */ | ||
| 251 | __u64 min_offset; | ||
| 252 | __u64 max_offset; | ||
| 253 | |||
| 254 | /* max and min transids to search for */ | ||
| 255 | __u64 min_transid; | ||
| 256 | __u64 max_transid; | ||
| 257 | |||
| 258 | /* keys returned will be >= min and <= max */ | ||
| 259 | __u32 min_type; | ||
| 260 | __u32 max_type; | ||
| 261 | |||
| 262 | /* | ||
| 263 | * how many items did userland ask for, and how many are we | ||
| 264 | * returning | ||
| 265 | */ | ||
| 266 | __u32 nr_items; | ||
| 267 | |||
| 268 | /* align to 64 bits */ | ||
| 269 | __u32 unused; | ||
| 270 | |||
| 271 | /* some extra for later */ | ||
| 272 | __u64 unused1; | ||
| 273 | __u64 unused2; | ||
| 274 | __u64 unused3; | ||
| 275 | __u64 unused4; | ||
| 276 | }; | ||
| 277 | |||
| 278 | struct btrfs_ioctl_search_header { | ||
| 279 | __u64 transid; | ||
| 280 | __u64 objectid; | ||
| 281 | __u64 offset; | ||
| 282 | __u32 type; | ||
| 283 | __u32 len; | ||
| 284 | }; | ||
| 285 | |||
| 286 | #define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) | ||
| 287 | /* | ||
| 288 | * the buf is an array of search headers where | ||
| 289 | * each header is followed by the actual item | ||
| 290 | * the type field is expanded to 32 bits for alignment | ||
| 291 | */ | ||
| 292 | struct btrfs_ioctl_search_args { | ||
| 293 | struct btrfs_ioctl_search_key key; | ||
| 294 | char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; | ||
| 295 | }; | ||
| 296 | |||
| 297 | struct btrfs_ioctl_clone_range_args { | ||
| 298 | __s64 src_fd; | ||
| 299 | __u64 src_offset, src_length; | ||
| 300 | __u64 dest_offset; | ||
| 301 | }; | ||
| 302 | |||
| 303 | /* flags for the defrag range ioctl */ | ||
| 304 | #define BTRFS_DEFRAG_RANGE_COMPRESS 1 | ||
| 305 | #define BTRFS_DEFRAG_RANGE_START_IO 2 | ||
| 306 | |||
| 307 | struct btrfs_ioctl_space_info { | ||
| 308 | __u64 flags; | ||
| 309 | __u64 total_bytes; | ||
| 310 | __u64 used_bytes; | ||
| 311 | }; | ||
| 312 | |||
| 313 | struct btrfs_ioctl_space_args { | ||
| 314 | __u64 space_slots; | ||
| 315 | __u64 total_spaces; | ||
| 316 | struct btrfs_ioctl_space_info spaces[0]; | ||
| 317 | }; | ||
| 318 | |||
| 319 | struct btrfs_data_container { | ||
| 320 | __u32 bytes_left; /* out -- bytes not needed to deliver output */ | ||
| 321 | __u32 bytes_missing; /* out -- additional bytes needed for result */ | ||
| 322 | __u32 elem_cnt; /* out */ | ||
| 323 | __u32 elem_missed; /* out */ | ||
| 324 | __u64 val[0]; /* out */ | ||
| 325 | }; | ||
| 326 | |||
| 327 | struct btrfs_ioctl_ino_path_args { | ||
| 328 | __u64 inum; /* in */ | ||
| 329 | __u64 size; /* in */ | ||
| 330 | __u64 reserved[4]; | ||
| 331 | /* struct btrfs_data_container *fspath; out */ | ||
| 332 | __u64 fspath; /* out */ | ||
| 333 | }; | ||
| 334 | |||
| 335 | struct btrfs_ioctl_logical_ino_args { | ||
| 336 | __u64 logical; /* in */ | ||
| 337 | __u64 size; /* in */ | ||
| 338 | __u64 reserved[4]; | ||
| 339 | /* struct btrfs_data_container *inodes; out */ | ||
| 340 | __u64 inodes; | ||
| 341 | }; | ||
| 342 | |||
| 343 | enum btrfs_dev_stat_values { | ||
| 344 | /* disk I/O failure stats */ | ||
| 345 | BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ | ||
| 346 | BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ | ||
| 347 | BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ | ||
| 348 | |||
| 349 | /* stats for indirect indications for I/O failures */ | ||
| 350 | BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or | ||
| 351 | * contents is illegal: this is an | ||
| 352 | * indication that the block was damaged | ||
| 353 | * during read or write, or written to | ||
| 354 | * wrong location or read from wrong | ||
| 355 | * location */ | ||
| 356 | BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not | ||
| 357 | * been written */ | ||
| 358 | |||
| 359 | BTRFS_DEV_STAT_VALUES_MAX | ||
| 360 | }; | ||
| 361 | |||
| 362 | /* Reset statistics after reading; needs SYS_ADMIN capability */ | ||
| 363 | #define BTRFS_DEV_STATS_RESET (1ULL << 0) | ||
| 364 | |||
| 365 | struct btrfs_ioctl_get_dev_stats { | ||
| 366 | __u64 devid; /* in */ | ||
| 367 | __u64 nr_items; /* in/out */ | ||
| 368 | __u64 flags; /* in/out */ | ||
| 369 | |||
| 370 | /* out values: */ | ||
| 371 | __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; | ||
| 372 | |||
| 373 | __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ | ||
| 374 | }; | ||
| 375 | |||
| 376 | #define BTRFS_QUOTA_CTL_ENABLE 1 | ||
| 377 | #define BTRFS_QUOTA_CTL_DISABLE 2 | ||
| 378 | #define BTRFS_QUOTA_CTL_RESCAN 3 | ||
| 379 | struct btrfs_ioctl_quota_ctl_args { | ||
| 380 | __u64 cmd; | ||
| 381 | __u64 status; | ||
| 382 | }; | ||
| 383 | |||
| 384 | struct btrfs_ioctl_qgroup_assign_args { | ||
| 385 | __u64 assign; | ||
| 386 | __u64 src; | ||
| 387 | __u64 dst; | ||
| 388 | }; | ||
| 389 | |||
| 390 | struct btrfs_ioctl_qgroup_create_args { | ||
| 391 | __u64 create; | ||
| 392 | __u64 qgroupid; | ||
| 393 | }; | ||
| 394 | struct btrfs_ioctl_timespec { | ||
| 395 | __u64 sec; | ||
| 396 | __u32 nsec; | ||
| 397 | }; | ||
| 398 | |||
| 399 | struct btrfs_ioctl_received_subvol_args { | ||
| 400 | char uuid[BTRFS_UUID_SIZE]; /* in */ | ||
| 401 | __u64 stransid; /* in */ | ||
| 402 | __u64 rtransid; /* out */ | ||
| 403 | struct btrfs_ioctl_timespec stime; /* in */ | ||
| 404 | struct btrfs_ioctl_timespec rtime; /* out */ | ||
| 405 | __u64 flags; /* in */ | ||
| 406 | __u64 reserved[16]; /* in */ | ||
| 407 | }; | ||
| 408 | |||
| 409 | struct btrfs_ioctl_send_args { | ||
| 410 | __s64 send_fd; /* in */ | ||
| 411 | __u64 clone_sources_count; /* in */ | ||
| 412 | __u64 __user *clone_sources; /* in */ | ||
| 413 | __u64 parent_root; /* in */ | ||
| 414 | __u64 flags; /* in */ | ||
| 415 | __u64 reserved[4]; /* in */ | ||
| 416 | }; | ||
| 417 | |||
| 418 | #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ | ||
| 419 | struct btrfs_ioctl_vol_args) | ||
| 420 | #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ | ||
| 421 | struct btrfs_ioctl_vol_args) | ||
| 422 | #define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ | ||
| 423 | struct btrfs_ioctl_vol_args) | ||
| 424 | #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ | ||
| 425 | struct btrfs_ioctl_vol_args) | ||
| 426 | /* trans start and trans end are dangerous, and only for | ||
| 427 | * use by applications that know how to avoid the | ||
| 428 | * resulting deadlocks | ||
| 429 | */ | ||
| 430 | #define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) | ||
| 431 | #define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) | ||
| 432 | #define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) | ||
| 433 | |||
| 434 | #define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) | ||
| 435 | #define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ | ||
| 436 | struct btrfs_ioctl_vol_args) | ||
| 437 | #define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ | ||
| 438 | struct btrfs_ioctl_vol_args) | ||
| 439 | #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ | ||
| 440 | struct btrfs_ioctl_vol_args) | ||
| 441 | |||
| 442 | #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ | ||
| 443 | struct btrfs_ioctl_clone_range_args) | ||
| 444 | |||
| 445 | #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ | ||
| 446 | struct btrfs_ioctl_vol_args) | ||
| 447 | #define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ | ||
| 448 | struct btrfs_ioctl_vol_args) | ||
| 449 | #define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ | ||
| 450 | struct btrfs_ioctl_defrag_range_args) | ||
| 451 | #define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ | ||
| 452 | struct btrfs_ioctl_search_args) | ||
| 453 | #define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ | ||
| 454 | struct btrfs_ioctl_ino_lookup_args) | ||
| 455 | #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) | ||
| 456 | #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ | ||
| 457 | struct btrfs_ioctl_space_args) | ||
| 458 | #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) | ||
| 459 | #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) | ||
| 460 | #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ | ||
| 461 | struct btrfs_ioctl_vol_args_v2) | ||
| 462 | #define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \ | ||
| 463 | struct btrfs_ioctl_vol_args_v2) | ||
| 464 | #define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) | ||
| 465 | #define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) | ||
| 466 | #define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ | ||
| 467 | struct btrfs_ioctl_scrub_args) | ||
| 468 | #define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28) | ||
| 469 | #define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \ | ||
| 470 | struct btrfs_ioctl_scrub_args) | ||
| 471 | #define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \ | ||
| 472 | struct btrfs_ioctl_dev_info_args) | ||
| 473 | #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ | ||
| 474 | struct btrfs_ioctl_fs_info_args) | ||
| 475 | #define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ | ||
| 476 | struct btrfs_ioctl_balance_args) | ||
| 477 | #define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) | ||
| 478 | #define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ | ||
| 479 | struct btrfs_ioctl_balance_args) | ||
| 480 | #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ | ||
| 481 | struct btrfs_ioctl_ino_path_args) | ||
| 482 | #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ | ||
| 483 | struct btrfs_ioctl_ino_path_args) | ||
| 484 | #define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \ | ||
| 485 | struct btrfs_ioctl_received_subvol_args) | ||
| 486 | #define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args) | ||
| 487 | #define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \ | ||
| 488 | struct btrfs_ioctl_vol_args) | ||
| 489 | #define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \ | ||
| 490 | struct btrfs_ioctl_quota_ctl_args) | ||
| 491 | #define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \ | ||
| 492 | struct btrfs_ioctl_qgroup_assign_args) | ||
| 493 | #define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \ | ||
| 494 | struct btrfs_ioctl_qgroup_create_args) | ||
| 495 | #define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \ | ||
| 496 | struct btrfs_ioctl_qgroup_limit_args) | ||
| 497 | #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ | ||
| 498 | struct btrfs_ioctl_get_dev_stats) | ||
| 499 | #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ | ||
| 500 | struct btrfs_ioctl_dev_replace_args) | ||
| 501 | |||
| 502 | #endif | ||
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 2a1762c66041..e95df435d897 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c | |||
| @@ -113,11 +113,10 @@ again: | |||
| 113 | read_unlock(&eb->lock); | 113 | read_unlock(&eb->lock); |
| 114 | return; | 114 | return; |
| 115 | } | 115 | } |
| 116 | read_unlock(&eb->lock); | ||
| 117 | wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); | ||
| 118 | read_lock(&eb->lock); | ||
| 119 | if (atomic_read(&eb->blocking_writers)) { | 116 | if (atomic_read(&eb->blocking_writers)) { |
| 120 | read_unlock(&eb->lock); | 117 | read_unlock(&eb->lock); |
| 118 | wait_event(eb->write_lock_wq, | ||
| 119 | atomic_read(&eb->blocking_writers) == 0); | ||
| 121 | goto again; | 120 | goto again; |
| 122 | } | 121 | } |
| 123 | atomic_inc(&eb->read_locks); | 122 | atomic_inc(&eb->read_locks); |
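The locking.c hunk above simplifies the read-lock slow path: the old code dropped the lock, slept, re-took the lock, and tested blocking_writers once more before looping. Now the re-check happens while the lock from the first test is still held; if writers are pending, the lock is dropped exactly once, the thread sleeps on write_lock_wq, and control restarts at 'again', which re-validates every condition under a fresh read_lock.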
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e5ed56729607..dc08d77b717e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c | |||
| @@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, | |||
| 196 | entry->file_offset = file_offset; | 196 | entry->file_offset = file_offset; |
| 197 | entry->start = start; | 197 | entry->start = start; |
| 198 | entry->len = len; | 198 | entry->len = len; |
| 199 | if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && | ||
| 200 | !(type == BTRFS_ORDERED_NOCOW)) | ||
| 201 | entry->csum_bytes_left = disk_len; | ||
| 199 | entry->disk_len = disk_len; | 202 | entry->disk_len = disk_len; |
| 200 | entry->bytes_left = len; | 203 | entry->bytes_left = len; |
| 201 | entry->inode = igrab(inode); | 204 | entry->inode = igrab(inode); |
| @@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, | |||
| 213 | INIT_LIST_HEAD(&entry->root_extent_list); | 216 | INIT_LIST_HEAD(&entry->root_extent_list); |
| 214 | INIT_LIST_HEAD(&entry->work_list); | 217 | INIT_LIST_HEAD(&entry->work_list); |
| 215 | init_completion(&entry->completion); | 218 | init_completion(&entry->completion); |
| 219 | INIT_LIST_HEAD(&entry->log_list); | ||
| 216 | 220 | ||
| 217 | trace_btrfs_ordered_extent_add(inode, entry); | 221 | trace_btrfs_ordered_extent_add(inode, entry); |
| 218 | 222 | ||
| @@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode, | |||
| 270 | tree = &BTRFS_I(inode)->ordered_tree; | 274 | tree = &BTRFS_I(inode)->ordered_tree; |
| 271 | spin_lock_irq(&tree->lock); | 275 | spin_lock_irq(&tree->lock); |
| 272 | list_add_tail(&sum->list, &entry->list); | 276 | list_add_tail(&sum->list, &entry->list); |
| 277 | WARN_ON(entry->csum_bytes_left < sum->len); | ||
| 278 | entry->csum_bytes_left -= sum->len; | ||
| 279 | if (entry->csum_bytes_left == 0) | ||
| 280 | wake_up(&entry->wait); | ||
| 273 | spin_unlock_irq(&tree->lock); | 281 | spin_unlock_irq(&tree->lock); |
| 274 | } | 282 | } |
| 275 | 283 | ||
| @@ -405,6 +413,66 @@ out: | |||
| 405 | return ret == 0; | 413 | return ret == 0; |
| 406 | } | 414 | } |
| 407 | 415 | ||
| 416 | /* Must be called either under a log transaction or with the log_mutex held */ | ||
| 417 | void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) | ||
| 418 | { | ||
| 419 | struct btrfs_ordered_inode_tree *tree; | ||
| 420 | struct btrfs_ordered_extent *ordered; | ||
| 421 | struct rb_node *n; | ||
| 422 | int index = log->log_transid % 2; | ||
| 423 | |||
| 424 | tree = &BTRFS_I(inode)->ordered_tree; | ||
| 425 | spin_lock_irq(&tree->lock); | ||
| 426 | for (n = rb_first(&tree->tree); n; n = rb_next(n)) { | ||
| 427 | ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); | ||
| 428 | spin_lock(&log->log_extents_lock[index]); | ||
| 429 | if (list_empty(&ordered->log_list)) { | ||
| 430 | list_add_tail(&ordered->log_list, &log->logged_list[index]); | ||
| 431 | atomic_inc(&ordered->refs); | ||
| 432 | } | ||
| 433 | spin_unlock(&log->log_extents_lock[index]); | ||
| 434 | } | ||
| 435 | spin_unlock_irq(&tree->lock); | ||
| 436 | } | ||
| 437 | |||
| 438 | void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) | ||
| 439 | { | ||
| 440 | struct btrfs_ordered_extent *ordered; | ||
| 441 | int index = transid % 2; | ||
| 442 | |||
| 443 | spin_lock_irq(&log->log_extents_lock[index]); | ||
| 444 | while (!list_empty(&log->logged_list[index])) { | ||
| 445 | ordered = list_first_entry(&log->logged_list[index], | ||
| 446 | struct btrfs_ordered_extent, | ||
| 447 | log_list); | ||
| 448 | list_del_init(&ordered->log_list); | ||
| 449 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
| 450 | wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, | ||
| 451 | &ordered->flags)); | ||
| 452 | btrfs_put_ordered_extent(ordered); | ||
| 453 | spin_lock_irq(&log->log_extents_lock[index]); | ||
| 454 | } | ||
| 455 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
| 456 | } | ||
| 457 | |||
| 458 | void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid) | ||
| 459 | { | ||
| 460 | struct btrfs_ordered_extent *ordered; | ||
| 461 | int index = transid % 2; | ||
| 462 | |||
| 463 | spin_lock_irq(&log->log_extents_lock[index]); | ||
| 464 | while (!list_empty(&log->logged_list[index])) { | ||
| 465 | ordered = list_first_entry(&log->logged_list[index], | ||
| 466 | struct btrfs_ordered_extent, | ||
| 467 | log_list); | ||
| 468 | list_del_init(&ordered->log_list); | ||
| 469 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
| 470 | btrfs_put_ordered_extent(ordered); | ||
| 471 | spin_lock_irq(&log->log_extents_lock[index]); | ||
| 472 | } | ||
| 473 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
| 474 | } | ||
| 475 | |||
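The three helpers above double-buffer the per-log lists on transaction parity: index = transid % 2 alternates between the two logged_list[]/log_extents_lock[] slots, so the extents of log transaction N can be waited on (or discarded on abort via btrfs_free_logged_extents()) while transaction N+1 already collects into the other slot. Note the lock dance in the wait loop: the spinlock is dropped around the sleep and re-taken before the list is examined again.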
| 408 | /* | 476 | /* |
| 409 | * used to drop a reference on an ordered extent. This will free | 477 | * used to drop a reference on an ordered extent. This will free |
| 410 | * the extent if the last reference is dropped | 478 | * the extent if the last reference is dropped |
| @@ -544,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) | |||
| 544 | * extra check to make sure the ordered operation list really is empty | 612 | * extra check to make sure the ordered operation list really is empty |
| 545 | * before we return | 613 | * before we return |
| 546 | */ | 614 | */ |
| 547 | int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) | 615 | int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, |
| 616 | struct btrfs_root *root, int wait) | ||
| 548 | { | 617 | { |
| 549 | struct btrfs_inode *btrfs_inode; | 618 | struct btrfs_inode *btrfs_inode; |
| 550 | struct inode *inode; | 619 | struct inode *inode; |
| 620 | struct btrfs_transaction *cur_trans = trans->transaction; | ||
| 551 | struct list_head splice; | 621 | struct list_head splice; |
| 552 | struct list_head works; | 622 | struct list_head works; |
| 553 | struct btrfs_delalloc_work *work, *next; | 623 | struct btrfs_delalloc_work *work, *next; |
| @@ -558,14 +628,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) | |||
| 558 | 628 | ||
| 559 | mutex_lock(&root->fs_info->ordered_operations_mutex); | 629 | mutex_lock(&root->fs_info->ordered_operations_mutex); |
| 560 | spin_lock(&root->fs_info->ordered_extent_lock); | 630 | spin_lock(&root->fs_info->ordered_extent_lock); |
| 561 | again: | 631 | list_splice_init(&cur_trans->ordered_operations, &splice); |
| 562 | list_splice_init(&root->fs_info->ordered_operations, &splice); | ||
| 563 | |||
| 564 | while (!list_empty(&splice)) { | 632 | while (!list_empty(&splice)) { |
| 565 | |||
| 566 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, | 633 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, |
| 567 | ordered_operations); | 634 | ordered_operations); |
| 568 | |||
| 569 | inode = &btrfs_inode->vfs_inode; | 635 | inode = &btrfs_inode->vfs_inode; |
| 570 | 636 | ||
| 571 | list_del_init(&btrfs_inode->ordered_operations); | 637 | list_del_init(&btrfs_inode->ordered_operations); |
| @@ -574,24 +640,22 @@ again: | |||
| 574 | * the inode may be getting freed (in sys_unlink path). | 640 | * the inode may be getting freed (in sys_unlink path). |
| 575 | */ | 641 | */ |
| 576 | inode = igrab(inode); | 642 | inode = igrab(inode); |
| 577 | |||
| 578 | if (!wait && inode) { | ||
| 579 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | ||
| 580 | &root->fs_info->ordered_operations); | ||
| 581 | } | ||
| 582 | |||
| 583 | if (!inode) | 643 | if (!inode) |
| 584 | continue; | 644 | continue; |
| 645 | |||
| 646 | if (!wait) | ||
| 647 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | ||
| 648 | &cur_trans->ordered_operations); | ||
| 585 | spin_unlock(&root->fs_info->ordered_extent_lock); | 649 | spin_unlock(&root->fs_info->ordered_extent_lock); |
| 586 | 650 | ||
| 587 | work = btrfs_alloc_delalloc_work(inode, wait, 1); | 651 | work = btrfs_alloc_delalloc_work(inode, wait, 1); |
| 588 | if (!work) { | 652 | if (!work) { |
| 653 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
| 589 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) | 654 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) |
| 590 | list_add_tail(&btrfs_inode->ordered_operations, | 655 | list_add_tail(&btrfs_inode->ordered_operations, |
| 591 | &splice); | 656 | &splice); |
| 592 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
| 593 | list_splice_tail(&splice, | 657 | list_splice_tail(&splice, |
| 594 | &root->fs_info->ordered_operations); | 658 | &cur_trans->ordered_operations); |
| 595 | spin_unlock(&root->fs_info->ordered_extent_lock); | 659 | spin_unlock(&root->fs_info->ordered_extent_lock); |
| 596 | ret = -ENOMEM; | 660 | ret = -ENOMEM; |
| 597 | goto out; | 661 | goto out; |
| @@ -603,9 +667,6 @@ again: | |||
| 603 | cond_resched(); | 667 | cond_resched(); |
| 604 | spin_lock(&root->fs_info->ordered_extent_lock); | 668 | spin_lock(&root->fs_info->ordered_extent_lock); |
| 605 | } | 669 | } |
| 606 | if (wait && !list_empty(&root->fs_info->ordered_operations)) | ||
| 607 | goto again; | ||
| 608 | |||
| 609 | spin_unlock(&root->fs_info->ordered_extent_lock); | 670 | spin_unlock(&root->fs_info->ordered_extent_lock); |
| 610 | out: | 671 | out: |
| 611 | list_for_each_entry_safe(work, next, &works, list) { | 672 | list_for_each_entry_safe(work, next, &works, list) { |
| @@ -974,6 +1035,7 @@ out: | |||
| 974 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | 1035 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, |
| 975 | struct btrfs_root *root, struct inode *inode) | 1036 | struct btrfs_root *root, struct inode *inode) |
| 976 | { | 1037 | { |
| 1038 | struct btrfs_transaction *cur_trans = trans->transaction; | ||
| 977 | u64 last_mod; | 1039 | u64 last_mod; |
| 978 | 1040 | ||
| 979 | last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); | 1041 | last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); |
| @@ -988,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | |||
| 988 | spin_lock(&root->fs_info->ordered_extent_lock); | 1050 | spin_lock(&root->fs_info->ordered_extent_lock); |
| 989 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) { | 1051 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) { |
| 990 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | 1052 | list_add_tail(&BTRFS_I(inode)->ordered_operations, |
| 991 | &root->fs_info->ordered_operations); | 1053 | &cur_trans->ordered_operations); |
| 992 | } | 1054 | } |
| 993 | spin_unlock(&root->fs_info->ordered_extent_lock); | 1055 | spin_unlock(&root->fs_info->ordered_extent_lock); |
| 994 | } | 1056 | } |
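The net effect of the two ordered-data.c hunks above is that the ordered-operations list moves from the global fs_info to the transaction that dirtied the inodes, so each commit only flushes its own work instead of rescanning a filesystem-wide list. A minimal sketch of the tracking pattern, with hypothetical stand-in names (struct txn and txn_track are not btrfs symbols):

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct txn {                            /* stand-in for btrfs_transaction */
            spinlock_t lock;
            struct list_head ordered_operations;
    };

    /* Track an inode at most once per transaction; an empty list_head
     * doubles as the "not yet tracked" marker, as in the hunks above. */
    static void txn_track(struct txn *t, struct list_head *entry)
    {
            spin_lock(&t->lock);
            if (list_empty(entry))
                    list_add_tail(entry, &t->ordered_operations);
            spin_unlock(&t->lock);
    }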
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f29d4bf5fbe7..8eadfe406cdd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h | |||
| @@ -79,6 +79,8 @@ struct btrfs_ordered_sum { | |||
| 79 | #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent | 79 | #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent |
| 80 | * has done its due diligence in updating | 80 | * has done its due diligence in updating |
| 81 | * the isize. */ | 81 | * the isize. */ |
| 82 | #define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this | ||
| 83 | ordered extent */ | ||
| 82 | 84 | ||
| 83 | struct btrfs_ordered_extent { | 85 | struct btrfs_ordered_extent { |
| 84 | /* logical offset in the file */ | 86 | /* logical offset in the file */ |
| @@ -96,6 +98,9 @@ struct btrfs_ordered_extent { | |||
| 96 | /* number of bytes that still need writing */ | 98 | /* number of bytes that still need writing */ |
| 97 | u64 bytes_left; | 99 | u64 bytes_left; |
| 98 | 100 | ||
| 101 | /* number of bytes that still need csumming */ | ||
| 102 | u64 csum_bytes_left; | ||
| 103 | |||
| 99 | /* | 104 | /* |
| 100 | * the end of the ordered extent which is behind it but | 105 | * the end of the ordered extent which is behind it but |
| 101 | * didn't update disk_i_size. Please see the comment of | 106 | * didn't update disk_i_size. Please see the comment of |
| @@ -118,6 +123,9 @@ struct btrfs_ordered_extent { | |||
| 118 | /* list of checksums for insertion when the extent io is done */ | 123 | /* list of checksums for insertion when the extent io is done */ |
| 119 | struct list_head list; | 124 | struct list_head list; |
| 120 | 125 | ||
| 126 | /* list entry used if the tree log needs to wait on this extent */ | ||
| 127 | struct list_head log_list; | ||
| 128 | |||
| 121 | /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ | 129 | /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ |
| 122 | wait_queue_head_t wait; | 130 | wait_queue_head_t wait; |
| 123 | 131 | ||
| @@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, | |||
| 189 | int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, | 197 | int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, |
| 190 | struct btrfs_ordered_extent *ordered); | 198 | struct btrfs_ordered_extent *ordered); |
| 191 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); | 199 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); |
| 192 | int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); | 200 | int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, |
| 201 | struct btrfs_root *root, int wait); | ||
| 193 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | 202 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, |
| 194 | struct btrfs_root *root, | 203 | struct btrfs_root *root, |
| 195 | struct inode *inode); | 204 | struct inode *inode); |
| 196 | void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); | 205 | void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); |
| 206 | void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); | ||
| 207 | void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); | ||
| 208 | void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); | ||
| 197 | int __init ordered_data_init(void); | 209 | int __init ordered_data_init(void); |
| 198 | void ordered_data_exit(void); | 210 | void ordered_data_exit(void); |
| 199 | #endif | 211 | #endif |
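The new csum_bytes_left counter and log_list entry back the three logged-extents helpers declared above. A plausible fsync-side sequence, sketched from these declarations alone (copy_csums_to_log is a hypothetical placeholder; the real tree-log changes live elsewhere in this patch):

    /* sketch: locking and most error handling omitted */
    btrfs_get_logged_extents(log, inode);        /* collect extents on log_list */
    ret = copy_csums_to_log(log, inode);         /* hypothetical helper */
    if (ret)
            btrfs_free_logged_extents(log, transid);  /* error: drop the list */
    else
            btrfs_wait_logged_extents(log, transid);  /* csum_bytes_left -> 0 */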
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 50d95fd190a5..920957ecb27e 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c | |||
| @@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | |||
| 294 | btrfs_dev_extent_chunk_offset(l, dev_extent), | 294 | btrfs_dev_extent_chunk_offset(l, dev_extent), |
| 295 | (unsigned long long) | 295 | (unsigned long long) |
| 296 | btrfs_dev_extent_length(l, dev_extent)); | 296 | btrfs_dev_extent_length(l, dev_extent)); |
| 297 | break; | ||
| 297 | case BTRFS_DEV_STATS_KEY: | 298 | case BTRFS_DEV_STATS_KEY: |
| 298 | printk(KERN_INFO "\t\tdevice stats\n"); | 299 | printk(KERN_INFO "\t\tdevice stats\n"); |
| 299 | break; | 300 | break; |
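The one-line print-tree.c change above closes a switch fall-through: without the break, every BTRFS_DEV_EXTENT_KEY item also printed the "device stats" line of the case that follows. In miniature (print_dev_extent stands in for the multi-line printk above):

    switch (type) {
    case BTRFS_DEV_EXTENT_KEY:
            print_dev_extent(l, dev_extent);
            break;                          /* the added line */
    case BTRFS_DEV_STATS_KEY:
            printk(KERN_INFO "\t\tdevice stats\n");  /* ran for both keys before */
            break;
    }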
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a5c856234323..aee4b1cc3d98 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c | |||
| @@ -23,13 +23,13 @@ | |||
| 23 | #include <linux/rbtree.h> | 23 | #include <linux/rbtree.h> |
| 24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
| 25 | #include <linux/workqueue.h> | 25 | #include <linux/workqueue.h> |
| 26 | #include <linux/btrfs.h> | ||
| 26 | 27 | ||
| 27 | #include "ctree.h" | 28 | #include "ctree.h" |
| 28 | #include "transaction.h" | 29 | #include "transaction.h" |
| 29 | #include "disk-io.h" | 30 | #include "disk-io.h" |
| 30 | #include "locking.h" | 31 | #include "locking.h" |
| 31 | #include "ulist.h" | 32 | #include "ulist.h" |
| 32 | #include "ioctl.h" | ||
| 33 | #include "backref.h" | 33 | #include "backref.h" |
| 34 | 34 | ||
| 35 | /* TODO XXX FIXME | 35 | /* TODO XXX FIXME |
| @@ -620,7 +620,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, | |||
| 620 | key.offset = qgroupid; | 620 | key.offset = qgroupid; |
| 621 | 621 | ||
| 622 | path = btrfs_alloc_path(); | 622 | path = btrfs_alloc_path(); |
| 623 | BUG_ON(!path); | 623 | if (!path) |
| 624 | return -ENOMEM; | ||
| 625 | |||
| 624 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | 626 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); |
| 625 | if (ret > 0) | 627 | if (ret > 0) |
| 626 | ret = -ENOENT; | 628 | ret = -ENOENT; |
| @@ -661,7 +663,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, | |||
| 661 | key.offset = qgroup->qgroupid; | 663 | key.offset = qgroup->qgroupid; |
| 662 | 664 | ||
| 663 | path = btrfs_alloc_path(); | 665 | path = btrfs_alloc_path(); |
| 664 | BUG_ON(!path); | 666 | if (!path) |
| 667 | return -ENOMEM; | ||
| 668 | |||
| 665 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | 669 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); |
| 666 | if (ret > 0) | 670 | if (ret > 0) |
| 667 | ret = -ENOENT; | 671 | ret = -ENOENT; |
| @@ -702,7 +706,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans, | |||
| 702 | key.offset = 0; | 706 | key.offset = 0; |
| 703 | 707 | ||
| 704 | path = btrfs_alloc_path(); | 708 | path = btrfs_alloc_path(); |
| 705 | BUG_ON(!path); | 709 | if (!path) |
| 710 | return -ENOMEM; | ||
| 711 | |||
| 706 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | 712 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); |
| 707 | if (ret > 0) | 713 | if (ret > 0) |
| 708 | ret = -ENOENT; | 714 | ret = -ENOENT; |
| @@ -732,33 +738,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, | |||
| 732 | { | 738 | { |
| 733 | struct btrfs_path *path; | 739 | struct btrfs_path *path; |
| 734 | struct btrfs_key key; | 740 | struct btrfs_key key; |
| 741 | struct extent_buffer *leaf = NULL; | ||
| 735 | int ret; | 742 | int ret; |
| 736 | 743 | int nr = 0; | |
| 737 | if (!root) | ||
| 738 | return -EINVAL; | ||
| 739 | 744 | ||
| 740 | path = btrfs_alloc_path(); | 745 | path = btrfs_alloc_path(); |
| 741 | if (!path) | 746 | if (!path) |
| 742 | return -ENOMEM; | 747 | return -ENOMEM; |
| 743 | 748 | ||
| 744 | while (1) { | 749 | path->leave_spinning = 1; |
| 745 | key.objectid = 0; | ||
| 746 | key.offset = 0; | ||
| 747 | key.type = 0; | ||
| 748 | 750 | ||
| 749 | path->leave_spinning = 1; | 751 | key.objectid = 0; |
| 752 | key.offset = 0; | ||
| 753 | key.type = 0; | ||
| 754 | |||
| 755 | while (1) { | ||
| 750 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | 756 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); |
| 751 | if (ret > 0) { | 757 | if (ret < 0) |
| 752 | if (path->slots[0] == 0) | 758 | goto out; |
| 753 | break; | 759 | leaf = path->nodes[0]; |
| 754 | path->slots[0]--; | 760 | nr = btrfs_header_nritems(leaf); |
| 755 | } else if (ret < 0) { | 761 | if (!nr) |
| 756 | break; | 762 | break; |
| 757 | } | 763 | /* |
| 758 | 764 | * delete the leaves one by one | |
| 759 | ret = btrfs_del_item(trans, root, path); | 765 | * since the whole tree is going |
| 766 | * to be deleted. | ||
| 767 | */ | ||
| 768 | path->slots[0] = 0; | ||
| 769 | ret = btrfs_del_items(trans, root, path, 0, nr); | ||
| 760 | if (ret) | 770 | if (ret) |
| 761 | goto out; | 771 | goto out; |
| 772 | |||
| 762 | btrfs_release_path(path); | 773 | btrfs_release_path(path); |
| 763 | } | 774 | } |
| 764 | ret = 0; | 775 | ret = 0; |
| @@ -847,6 +858,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, | |||
| 847 | int ret = 0; | 858 | int ret = 0; |
| 848 | 859 | ||
| 849 | spin_lock(&fs_info->qgroup_lock); | 860 | spin_lock(&fs_info->qgroup_lock); |
| 861 | if (!fs_info->quota_root) { | ||
| 862 | spin_unlock(&fs_info->qgroup_lock); | ||
| 863 | return 0; | ||
| 864 | } | ||
| 850 | fs_info->quota_enabled = 0; | 865 | fs_info->quota_enabled = 0; |
| 851 | fs_info->pending_quota_state = 0; | 866 | fs_info->pending_quota_state = 0; |
| 852 | quota_root = fs_info->quota_root; | 867 | quota_root = fs_info->quota_root; |
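The btrfs_clean_quota_tree() rewrite above stops doing one search per deleted item and instead empties the left-most leaf with a single btrfs_del_items() call, so teardown costs one search per leaf rather than one per item. The loop reduced to its essentials:

    /* simplified shape of the leaf-at-a-time delete loop */
    while (1) {
            ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
            if (ret < 0)
                    goto out;                       /* tree/IO error */
            nr = btrfs_header_nritems(path->nodes[0]);
            if (!nr)
                    break;                          /* tree is empty now */
            path->slots[0] = 0;                     /* start of the leaf */
            ret = btrfs_del_items(trans, root, path, 0, nr);  /* whole leaf */
            if (ret)
                    goto out;
            btrfs_release_path(path);               /* re-search from the top */
    }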
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c new file mode 100644 index 000000000000..07222053c7d8 --- /dev/null +++ b/fs/btrfs/raid56.c | |||
| @@ -0,0 +1,2099 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Fusion-io All rights reserved. | ||
| 3 | * Copyright (C) 2012 Intel Corp. All rights reserved. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU General Public | ||
| 7 | * License v2 as published by the Free Software Foundation. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 12 | * General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public | ||
| 15 | * License along with this program; if not, write to the | ||
| 16 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 17 | * Boston, MA 02111-1307, USA. | ||
| 18 | */ | ||
| 19 | #include <linux/sched.h> | ||
| 20 | #include <linux/wait.h> | ||
| 21 | #include <linux/bio.h> | ||
| 22 | #include <linux/slab.h> | ||
| 23 | #include <linux/buffer_head.h> | ||
| 24 | #include <linux/blkdev.h> | ||
| 25 | #include <linux/random.h> | ||
| 26 | #include <linux/iocontext.h> | ||
| 27 | #include <linux/capability.h> | ||
| 28 | #include <linux/ratelimit.h> | ||
| 29 | #include <linux/kthread.h> | ||
| 30 | #include <linux/raid/pq.h> | ||
| 31 | #include <linux/hash.h> | ||
| 32 | #include <linux/list_sort.h> | ||
| 33 | #include <linux/raid/xor.h> | ||
| 34 | #include <asm/div64.h> | ||
| 35 | #include "compat.h" | ||
| 36 | #include "ctree.h" | ||
| 37 | #include "extent_map.h" | ||
| 38 | #include "disk-io.h" | ||
| 39 | #include "transaction.h" | ||
| 40 | #include "print-tree.h" | ||
| 41 | #include "volumes.h" | ||
| 42 | #include "raid56.h" | ||
| 43 | #include "async-thread.h" | ||
| 44 | #include "check-integrity.h" | ||
| 45 | #include "rcu-string.h" | ||
| 46 | |||
| 47 | /* set when additional merges to this rbio are not allowed */ | ||
| 48 | #define RBIO_RMW_LOCKED_BIT 1 | ||
| 49 | |||
| 50 | /* | ||
| 51 | * set when this rbio is sitting in the hash, but it is just a cache | ||
| 52 | * of past RMW | ||
| 53 | */ | ||
| 54 | #define RBIO_CACHE_BIT 2 | ||
| 55 | |||
| 56 | /* | ||
| 57 | * set when it is safe to trust the stripe_pages for caching | ||
| 58 | */ | ||
| 59 | #define RBIO_CACHE_READY_BIT 3 | ||
| 60 | |||
| 61 | |||
| 62 | #define RBIO_CACHE_SIZE 1024 | ||
| 63 | |||
| 64 | struct btrfs_raid_bio { | ||
| 65 | struct btrfs_fs_info *fs_info; | ||
| 66 | struct btrfs_bio *bbio; | ||
| 67 | |||
| 68 | /* | ||
| 69 | * logical block numbers for the start of each stripe | ||
| 70 | * The last one or two are p/q. These are sorted, | ||
| 71 | * so raid_map[0] is the start of our full stripe | ||
| 72 | */ | ||
| 73 | u64 *raid_map; | ||
| 74 | |||
| 75 | /* while we're doing rmw on a stripe | ||
| 76 | * we put it into a hash table so we can | ||
| 77 | * lock the stripe and merge more rbios | ||
| 78 | * into it. | ||
| 79 | */ | ||
| 80 | struct list_head hash_list; | ||
| 81 | |||
| 82 | /* | ||
| 83 | * LRU list for the stripe cache | ||
| 84 | */ | ||
| 85 | struct list_head stripe_cache; | ||
| 86 | |||
| 87 | /* | ||
| 88 | * for scheduling work in the helper threads | ||
| 89 | */ | ||
| 90 | struct btrfs_work work; | ||
| 91 | |||
| 92 | /* | ||
| 93 | * bio list and bio_list_lock are used | ||
| 94 | * to add more bios into the stripe | ||
| 95 | * in hopes of avoiding the full rmw | ||
| 96 | */ | ||
| 97 | struct bio_list bio_list; | ||
| 98 | spinlock_t bio_list_lock; | ||
| 99 | |||
| 100 | /* also protected by the bio_list_lock, the | ||
| 101 | * plug list is used by the plugging code | ||
| 102 | * to collect partial bios while plugged. The | ||
| 103 | * stripe locking code also uses it to hand off | ||
| 104 | * the stripe lock to the next pending IO | ||
| 105 | */ | ||
| 106 | struct list_head plug_list; | ||
| 107 | |||
| 108 | /* | ||
| 109 | * flags that tell us if it is safe to | ||
| 110 | * merge with this bio | ||
| 111 | */ | ||
| 112 | unsigned long flags; | ||
| 113 | |||
| 114 | /* size of each individual stripe on disk */ | ||
| 115 | int stripe_len; | ||
| 116 | |||
| 117 | /* number of data stripes (no p/q) */ | ||
| 118 | int nr_data; | ||
| 119 | |||
| 120 | /* | ||
| 121 | * set if we're doing a parity rebuild | ||
| 122 | * for a read from higher up, which is handled | ||
| 123 | * differently from a parity rebuild as part of | ||
| 124 | * rmw | ||
| 125 | */ | ||
| 126 | int read_rebuild; | ||
| 127 | |||
| 128 | /* first bad stripe */ | ||
| 129 | int faila; | ||
| 130 | |||
| 131 | /* second bad stripe (for raid6 use) */ | ||
| 132 | int failb; | ||
| 133 | |||
| 134 | /* | ||
| 135 | * number of pages needed to represent the full | ||
| 136 | * stripe | ||
| 137 | */ | ||
| 138 | int nr_pages; | ||
| 139 | |||
| 140 | /* | ||
| 141 | * size of all the bios in the bio_list. This | ||
| 142 | * helps us decide if the rbio maps to a full | ||
| 143 | * stripe or not | ||
| 144 | */ | ||
| 145 | int bio_list_bytes; | ||
| 146 | |||
| 147 | atomic_t refs; | ||
| 148 | |||
| 149 | /* | ||
| 150 | * these are two arrays of pointers. We allocate the | ||
| 151 | * rbio big enough to hold them both and set up their | ||
| 152 | * locations when the rbio is allocated | ||
| 153 | */ | ||
| 154 | |||
| 155 | /* pointers to pages that we allocated for | ||
| 156 | * reading/writing stripes directly from the disk (including P/Q) | ||
| 157 | */ | ||
| 158 | struct page **stripe_pages; | ||
| 159 | |||
| 160 | /* | ||
| 161 | * pointers to the pages in the bio_list. Stored | ||
| 162 | * here for faster lookup | ||
| 163 | */ | ||
| 164 | struct page **bio_pages; | ||
| 165 | }; | ||
| 166 | |||
| 167 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); | ||
| 168 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio); | ||
| 169 | static void rmw_work(struct btrfs_work *work); | ||
| 170 | static void read_rebuild_work(struct btrfs_work *work); | ||
| 171 | static void async_rmw_stripe(struct btrfs_raid_bio *rbio); | ||
| 172 | static void async_read_rebuild(struct btrfs_raid_bio *rbio); | ||
| 173 | static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); | ||
| 174 | static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); | ||
| 175 | static void __free_raid_bio(struct btrfs_raid_bio *rbio); | ||
| 176 | static void index_rbio_pages(struct btrfs_raid_bio *rbio); | ||
| 177 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); | ||
| 178 | |||
| 179 | /* | ||
| 180 | * the stripe hash table is used for locking, and to collect | ||
| 181 | * bios in hopes of making a full stripe | ||
| 182 | */ | ||
| 183 | int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) | ||
| 184 | { | ||
| 185 | struct btrfs_stripe_hash_table *table; | ||
| 186 | struct btrfs_stripe_hash_table *x; | ||
| 187 | struct btrfs_stripe_hash *cur; | ||
| 188 | struct btrfs_stripe_hash *h; | ||
| 189 | int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; | ||
| 190 | int i; | ||
| 191 | int table_size; | ||
| 192 | |||
| 193 | if (info->stripe_hash_table) | ||
| 194 | return 0; | ||
| 195 | |||
| 196 | /* | ||
| 197 | * The table is large, starting with order 4 and can go as high as | ||
| 198 | * order 7 in case lock debugging is turned on. | ||
| 199 | * | ||
| 200 | * Try harder to allocate and fall back to vmalloc to lower the chance | ||
| 201 | * of a failing mount. | ||
| 202 | */ | ||
| 203 | table_size = sizeof(*table) + sizeof(*h) * num_entries; | ||
| 204 | table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); | ||
| 205 | if (!table) { | ||
| 206 | table = vzalloc(table_size); | ||
| 207 | if (!table) | ||
| 208 | return -ENOMEM; | ||
| 209 | } | ||
| 210 | |||
| 211 | spin_lock_init(&table->cache_lock); | ||
| 212 | INIT_LIST_HEAD(&table->stripe_cache); | ||
| 213 | |||
| 214 | h = table->table; | ||
| 215 | |||
| 216 | for (i = 0; i < num_entries; i++) { | ||
| 217 | cur = h + i; | ||
| 218 | INIT_LIST_HEAD(&cur->hash_list); | ||
| 219 | spin_lock_init(&cur->lock); | ||
| 220 | init_waitqueue_head(&cur->wait); | ||
| 221 | } | ||
| 222 | |||
| 223 | x = cmpxchg(&info->stripe_hash_table, NULL, table); | ||
| 224 | if (x) { /* lost the publish race: free our table, not the installed one */ | ||
| 225 | if (is_vmalloc_addr(table)) | ||
| 226 | vfree(table); | ||
| 227 | else | ||
| 228 | kfree(table); | ||
| 229 | } | ||
| 230 | return 0; | ||
| 231 | } | ||
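btrfs_alloc_stripe_hash_table() combines two reusable patterns: a quiet kzalloc (__GFP_NOWARN) with a vzalloc fallback for the large order-4..7 table, and a lock-free publish via cmpxchg so racing mounts install exactly one table. The allocate/free halves in generic form (a sketch against the mm API of this kernel series, where __GFP_REPEAT still exists):

    #include <linux/mm.h>
    #include <linux/slab.h>
    #include <linux/vmalloc.h>

    static void *alloc_big_table(size_t size)
    {
            /* try physically contiguous memory first, without warning */
            void *t = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);

            if (!t)
                    t = vzalloc(size);      /* fall back to vmalloc space */
            return t;
    }

    static void free_big_table(void *t)
    {
            if (is_vmalloc_addr(t))         /* must match the allocator used */
                    vfree(t);
            else
                    kfree(t);
    }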
| 232 | |||
| 233 | /* | ||
| 234 | * caching an rbio means to copy anything from the | ||
| 235 | * bio_pages array into the stripe_pages array. We | ||
| 236 | * use the page uptodate bit in the stripe cache array | ||
| 237 | * to indicate if it has valid data | ||
| 238 | * | ||
| 239 | * once the caching is done, we set the cache ready | ||
| 240 | * bit. | ||
| 241 | */ | ||
| 242 | static void cache_rbio_pages(struct btrfs_raid_bio *rbio) | ||
| 243 | { | ||
| 244 | int i; | ||
| 245 | char *s; | ||
| 246 | char *d; | ||
| 247 | int ret; | ||
| 248 | |||
| 249 | ret = alloc_rbio_pages(rbio); | ||
| 250 | if (ret) | ||
| 251 | return; | ||
| 252 | |||
| 253 | for (i = 0; i < rbio->nr_pages; i++) { | ||
| 254 | if (!rbio->bio_pages[i]) | ||
| 255 | continue; | ||
| 256 | |||
| 257 | s = kmap(rbio->bio_pages[i]); | ||
| 258 | d = kmap(rbio->stripe_pages[i]); | ||
| 259 | |||
| 260 | memcpy(d, s, PAGE_CACHE_SIZE); | ||
| 261 | |||
| 262 | kunmap(rbio->bio_pages[i]); | ||
| 263 | kunmap(rbio->stripe_pages[i]); | ||
| 264 | SetPageUptodate(rbio->stripe_pages[i]); | ||
| 265 | } | ||
| 266 | set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
| 267 | } | ||
| 268 | |||
| 269 | /* | ||
| 270 | * we hash on the first logical address of the stripe | ||
| 271 | */ | ||
| 272 | static int rbio_bucket(struct btrfs_raid_bio *rbio) | ||
| 273 | { | ||
| 274 | u64 num = rbio->raid_map[0]; | ||
| 275 | |||
| 276 | /* | ||
| 277 | * we shift down quite a bit. We're using byte | ||
| 278 | * addressing, and most of the lower bits are zeros. | ||
| 279 | * This tends to upset hash_64, and it consistently | ||
| 280 | * returns just one or two different values. | ||
| 281 | * | ||
| 282 | * shifting off the lower bits fixes things. | ||
| 283 | */ | ||
| 284 | return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); | ||
| 285 | } | ||
| 286 | |||
| 287 | /* | ||
| 288 | * stealing an rbio means taking all the uptodate pages from the stripe | ||
| 289 | * array in the source rbio and putting them into the destination rbio | ||
| 290 | */ | ||
| 291 | static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) | ||
| 292 | { | ||
| 293 | int i; | ||
| 294 | struct page *s; | ||
| 295 | struct page *d; | ||
| 296 | |||
| 297 | if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) | ||
| 298 | return; | ||
| 299 | |||
| 300 | for (i = 0; i < dest->nr_pages; i++) { | ||
| 301 | s = src->stripe_pages[i]; | ||
| 302 | if (!s || !PageUptodate(s)) { | ||
| 303 | continue; | ||
| 304 | } | ||
| 305 | |||
| 306 | d = dest->stripe_pages[i]; | ||
| 307 | if (d) | ||
| 308 | __free_page(d); | ||
| 309 | |||
| 310 | dest->stripe_pages[i] = s; | ||
| 311 | src->stripe_pages[i] = NULL; | ||
| 312 | } | ||
| 313 | } | ||
| 314 | |||
| 315 | /* | ||
| 316 | * merging means we take the bio_list from the victim and | ||
| 317 | * splice it into the destination. The victim should | ||
| 318 | * be discarded afterwards. | ||
| 319 | * | ||
| 320 | * must be called with dest->rbio_list_lock held | ||
| 321 | */ | ||
| 322 | static void merge_rbio(struct btrfs_raid_bio *dest, | ||
| 323 | struct btrfs_raid_bio *victim) | ||
| 324 | { | ||
| 325 | bio_list_merge(&dest->bio_list, &victim->bio_list); | ||
| 326 | dest->bio_list_bytes += victim->bio_list_bytes; | ||
| 327 | bio_list_init(&victim->bio_list); | ||
| 328 | } | ||
| 329 | |||
| 330 | /* | ||
| 331 | * used to prune items that are in the cache. The caller | ||
| 332 | * must hold the hash table lock. | ||
| 333 | */ | ||
| 334 | static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) | ||
| 335 | { | ||
| 336 | int bucket = rbio_bucket(rbio); | ||
| 337 | struct btrfs_stripe_hash_table *table; | ||
| 338 | struct btrfs_stripe_hash *h; | ||
| 339 | int freeit = 0; | ||
| 340 | |||
| 341 | /* | ||
| 342 | * check the bit again under the hash table lock. | ||
| 343 | */ | ||
| 344 | if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) | ||
| 345 | return; | ||
| 346 | |||
| 347 | table = rbio->fs_info->stripe_hash_table; | ||
| 348 | h = table->table + bucket; | ||
| 349 | |||
| 350 | /* hold the lock for the bucket because we may be | ||
| 351 | * removing it from the hash table | ||
| 352 | */ | ||
| 353 | spin_lock(&h->lock); | ||
| 354 | |||
| 355 | /* | ||
| 356 | * hold the lock for the bio list because we need | ||
| 357 | * to make sure the bio list is empty | ||
| 358 | */ | ||
| 359 | spin_lock(&rbio->bio_list_lock); | ||
| 360 | |||
| 361 | if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { | ||
| 362 | list_del_init(&rbio->stripe_cache); | ||
| 363 | table->cache_size -= 1; | ||
| 364 | freeit = 1; | ||
| 365 | |||
| 366 | /* if the bio list isn't empty, this rbio is | ||
| 367 | * still involved in an IO. We take it out | ||
| 368 | * of the cache list, and drop the ref that | ||
| 369 | * was held for the list. | ||
| 370 | * | ||
| 371 | * If the bio_list was empty, we also remove | ||
| 372 | * the rbio from the hash_table, and drop | ||
| 373 | * the corresponding ref | ||
| 374 | */ | ||
| 375 | if (bio_list_empty(&rbio->bio_list)) { | ||
| 376 | if (!list_empty(&rbio->hash_list)) { | ||
| 377 | list_del_init(&rbio->hash_list); | ||
| 378 | atomic_dec(&rbio->refs); | ||
| 379 | BUG_ON(!list_empty(&rbio->plug_list)); | ||
| 380 | } | ||
| 381 | } | ||
| 382 | } | ||
| 383 | |||
| 384 | spin_unlock(&rbio->bio_list_lock); | ||
| 385 | spin_unlock(&h->lock); | ||
| 386 | |||
| 387 | if (freeit) | ||
| 388 | __free_raid_bio(rbio); | ||
| 389 | } | ||
| 390 | |||
| 391 | /* | ||
| 392 | * prune a given rbio from the cache | ||
| 393 | */ | ||
| 394 | static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) | ||
| 395 | { | ||
| 396 | struct btrfs_stripe_hash_table *table; | ||
| 397 | unsigned long flags; | ||
| 398 | |||
| 399 | if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) | ||
| 400 | return; | ||
| 401 | |||
| 402 | table = rbio->fs_info->stripe_hash_table; | ||
| 403 | |||
| 404 | spin_lock_irqsave(&table->cache_lock, flags); | ||
| 405 | __remove_rbio_from_cache(rbio); | ||
| 406 | spin_unlock_irqrestore(&table->cache_lock, flags); | ||
| 407 | } | ||
| 408 | |||
| 409 | /* | ||
| 410 | * remove everything in the cache | ||
| 411 | */ | ||
| 412 | void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) | ||
| 413 | { | ||
| 414 | struct btrfs_stripe_hash_table *table; | ||
| 415 | unsigned long flags; | ||
| 416 | struct btrfs_raid_bio *rbio; | ||
| 417 | |||
| 418 | table = info->stripe_hash_table; | ||
| 419 | |||
| 420 | spin_lock_irqsave(&table->cache_lock, flags); | ||
| 421 | while (!list_empty(&table->stripe_cache)) { | ||
| 422 | rbio = list_entry(table->stripe_cache.next, | ||
| 423 | struct btrfs_raid_bio, | ||
| 424 | stripe_cache); | ||
| 425 | __remove_rbio_from_cache(rbio); | ||
| 426 | } | ||
| 427 | spin_unlock_irqrestore(&table->cache_lock, flags); | ||
| 428 | } | ||
| 429 | |||
| 430 | /* | ||
| 431 | * remove all cached entries and free the hash table | ||
| 432 | * used by unmount | ||
| 433 | */ | ||
| 434 | void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) | ||
| 435 | { | ||
| 436 | if (!info->stripe_hash_table) | ||
| 437 | return; | ||
| 438 | btrfs_clear_rbio_cache(info); | ||
| 439 | if (is_vmalloc_addr(info->stripe_hash_table)) | ||
| 440 | vfree(info->stripe_hash_table); | ||
| 441 | else | ||
| 442 | kfree(info->stripe_hash_table); | ||
| 443 | info->stripe_hash_table = NULL; | ||
| 444 | } | ||
| 445 | |||
| 446 | /* | ||
| 447 | * insert an rbio into the stripe cache. It | ||
| 448 | * must have already been prepared by calling | ||
| 449 | * cache_rbio_pages | ||
| 450 | * | ||
| 451 | * If this rbio was already cached, it gets | ||
| 452 | * moved to the front of the lru. | ||
| 453 | * | ||
| 454 | * If the size of the rbio cache is too big, we | ||
| 455 | * prune an item. | ||
| 456 | */ | ||
| 457 | static void cache_rbio(struct btrfs_raid_bio *rbio) | ||
| 458 | { | ||
| 459 | struct btrfs_stripe_hash_table *table; | ||
| 460 | unsigned long flags; | ||
| 461 | |||
| 462 | if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) | ||
| 463 | return; | ||
| 464 | |||
| 465 | table = rbio->fs_info->stripe_hash_table; | ||
| 466 | |||
| 467 | spin_lock_irqsave(&table->cache_lock, flags); | ||
| 468 | spin_lock(&rbio->bio_list_lock); | ||
| 469 | |||
| 470 | /* bump our ref if we were not in the list before */ | ||
| 471 | if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) | ||
| 472 | atomic_inc(&rbio->refs); | ||
| 473 | |||
| 474 | if (!list_empty(&rbio->stripe_cache)) { | ||
| 475 | list_move(&rbio->stripe_cache, &table->stripe_cache); | ||
| 476 | } else { | ||
| 477 | list_add(&rbio->stripe_cache, &table->stripe_cache); | ||
| 478 | table->cache_size += 1; | ||
| 479 | } | ||
| 480 | |||
| 481 | spin_unlock(&rbio->bio_list_lock); | ||
| 482 | |||
| 483 | if (table->cache_size > RBIO_CACHE_SIZE) { | ||
| 484 | struct btrfs_raid_bio *found; | ||
| 485 | |||
| 486 | found = list_entry(table->stripe_cache.prev, | ||
| 487 | struct btrfs_raid_bio, | ||
| 488 | stripe_cache); | ||
| 489 | |||
| 490 | if (found != rbio) | ||
| 491 | __remove_rbio_from_cache(found); | ||
| 492 | } | ||
| 493 | |||
| 494 | spin_unlock_irqrestore(&table->cache_lock, flags); | ||
| 495 | return; | ||
| 496 | } | ||
| 497 | |||
| 498 | /* | ||
| 499 | * helper function to run the xor_blocks api. It is only | ||
| 500 | * able to do MAX_XOR_BLOCKS at a time, so we need to | ||
| 501 | * loop through. | ||
| 502 | */ | ||
| 503 | static void run_xor(void **pages, int src_cnt, ssize_t len) | ||
| 504 | { | ||
| 505 | int src_off = 0; | ||
| 506 | int xor_src_cnt = 0; | ||
| 507 | void *dest = pages[src_cnt]; | ||
| 508 | |||
| 509 | while (src_cnt > 0) { | ||
| 510 | xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); | ||
| 511 | xor_blocks(xor_src_cnt, len, dest, pages + src_off); | ||
| 512 | |||
| 513 | src_cnt -= xor_src_cnt; | ||
| 514 | src_off += xor_src_cnt; | ||
| 515 | } | ||
| 516 | } | ||
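run_xor() expects the destination buffer to sit just past the last source in the pages array, which is how the raid5 branch of finish_rmw() below drives it: seed the parity page with the first data page, then XOR in the remaining nr_data - 1 pages:

    /* computing P for one page column, as finish_rmw() does below */
    memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
    run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);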
| 517 | |||
| 518 | /* | ||
| 519 | * returns true if the bio list inside this rbio | ||
| 520 | * covers an entire stripe (no rmw required). | ||
| 521 | * Must be called with the bio list lock held, or | ||
| 522 | * at a time when you know it is impossible to add | ||
| 523 | * new bios into the list | ||
| 524 | */ | ||
| 525 | static int __rbio_is_full(struct btrfs_raid_bio *rbio) | ||
| 526 | { | ||
| 527 | unsigned long size = rbio->bio_list_bytes; | ||
| 528 | int ret = 1; | ||
| 529 | |||
| 530 | if (size != rbio->nr_data * rbio->stripe_len) | ||
| 531 | ret = 0; | ||
| 532 | |||
| 533 | BUG_ON(size > rbio->nr_data * rbio->stripe_len); | ||
| 534 | return ret; | ||
| 535 | } | ||
| 536 | |||
| 537 | static int rbio_is_full(struct btrfs_raid_bio *rbio) | ||
| 538 | { | ||
| 539 | unsigned long flags; | ||
| 540 | int ret; | ||
| 541 | |||
| 542 | spin_lock_irqsave(&rbio->bio_list_lock, flags); | ||
| 543 | ret = __rbio_is_full(rbio); | ||
| 544 | spin_unlock_irqrestore(&rbio->bio_list_lock, flags); | ||
| 545 | return ret; | ||
| 546 | } | ||
| 547 | |||
| 548 | /* | ||
| 549 | * returns 1 if it is safe to merge two rbios together. | ||
| 550 | * The merging is safe if the two rbios correspond to | ||
| 551 | * the same stripe and if they are both going in the same | ||
| 552 | * direction (read vs write), and if neither one is | ||
| 553 | * locked for final IO | ||
| 554 | * | ||
| 555 | * The caller is responsible for locking such that | ||
| 556 | * rmw_locked is safe to test | ||
| 557 | */ | ||
| 558 | static int rbio_can_merge(struct btrfs_raid_bio *last, | ||
| 559 | struct btrfs_raid_bio *cur) | ||
| 560 | { | ||
| 561 | if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || | ||
| 562 | test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) | ||
| 563 | return 0; | ||
| 564 | |||
| 565 | /* | ||
| 566 | * we can't merge with cached rbios, since the | ||
| 567 | * idea is that when we merge the destination | ||
| 568 | * rbio is going to run our IO for us. We can | ||
| 569 | * steal from cached rbio's though, other functions | ||
| 570 | * handle that. | ||
| 571 | */ | ||
| 572 | if (test_bit(RBIO_CACHE_BIT, &last->flags) || | ||
| 573 | test_bit(RBIO_CACHE_BIT, &cur->flags)) | ||
| 574 | return 0; | ||
| 575 | |||
| 576 | if (last->raid_map[0] != | ||
| 577 | cur->raid_map[0]) | ||
| 578 | return 0; | ||
| 579 | |||
| 580 | /* reads can't merge with writes */ | ||
| 581 | if (last->read_rebuild != | ||
| 582 | cur->read_rebuild) { | ||
| 583 | return 0; | ||
| 584 | } | ||
| 585 | |||
| 586 | return 1; | ||
| 587 | } | ||
| 588 | |||
| 589 | /* | ||
| 590 | * helper to index into the pstripe | ||
| 591 | */ | ||
| 592 | static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) | ||
| 593 | { | ||
| 594 | index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; | ||
| 595 | return rbio->stripe_pages[index]; | ||
| 596 | } | ||
| 597 | |||
| 598 | /* | ||
| 599 | * helper to index into the qstripe, returns null | ||
| 600 | * if there is no qstripe | ||
| 601 | */ | ||
| 602 | static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) | ||
| 603 | { | ||
| 604 | if (rbio->nr_data + 1 == rbio->bbio->num_stripes) | ||
| 605 | return NULL; | ||
| 606 | |||
| 607 | index += ((rbio->nr_data + 1) * rbio->stripe_len) >> | ||
| 608 | PAGE_CACHE_SHIFT; | ||
| 609 | return rbio->stripe_pages[index]; | ||
| 610 | } | ||
| 611 | |||
| 612 | /* | ||
| 613 | * The first stripe in the table for a logical address | ||
| 614 | * has the lock. rbios are added in one of three ways: | ||
| 615 | * | ||
| 616 | * 1) Nobody has the stripe locked yet. The rbio is given | ||
| 617 | * the lock and 0 is returned. The caller must start the IO | ||
| 618 | * themselves. | ||
| 619 | * | ||
| 620 | * 2) Someone has the stripe locked, but we're able to merge | ||
| 621 | * with the lock owner. The rbio is freed and the IO will | ||
| 622 | * start automatically along with the existing rbio. 1 is returned. | ||
| 623 | * | ||
| 624 | * 3) Someone has the stripe locked, but we're not able to merge. | ||
| 625 | * The rbio is added to the lock owner's plug list, or merged into | ||
| 626 | * an rbio already on the plug list. When the lock owner unlocks, | ||
| 627 | * the next rbio on the list is run and the IO is started automatically. | ||
| 628 | * 1 is returned | ||
| 629 | * | ||
| 630 | * If we return 0, the caller still owns the rbio and must continue with | ||
| 631 | * IO submission. If we return 1, the caller must assume the rbio has | ||
| 632 | * already been freed. | ||
| 633 | */ | ||
| 634 | static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) | ||
| 635 | { | ||
| 636 | int bucket = rbio_bucket(rbio); | ||
| 637 | struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; | ||
| 638 | struct btrfs_raid_bio *cur; | ||
| 639 | struct btrfs_raid_bio *pending; | ||
| 640 | unsigned long flags; | ||
| 641 | DEFINE_WAIT(wait); | ||
| 642 | struct btrfs_raid_bio *freeit = NULL; | ||
| 643 | struct btrfs_raid_bio *cache_drop = NULL; | ||
| 644 | int ret = 0; | ||
| 645 | int walk = 0; | ||
| 646 | |||
| 647 | spin_lock_irqsave(&h->lock, flags); | ||
| 648 | list_for_each_entry(cur, &h->hash_list, hash_list) { | ||
| 649 | walk++; | ||
| 650 | if (cur->raid_map[0] == rbio->raid_map[0]) { | ||
| 651 | spin_lock(&cur->bio_list_lock); | ||
| 652 | |||
| 653 | /* can we steal this cached rbio's pages? */ | ||
| 654 | if (bio_list_empty(&cur->bio_list) && | ||
| 655 | list_empty(&cur->plug_list) && | ||
| 656 | test_bit(RBIO_CACHE_BIT, &cur->flags) && | ||
| 657 | !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { | ||
| 658 | list_del_init(&cur->hash_list); | ||
| 659 | atomic_dec(&cur->refs); | ||
| 660 | |||
| 661 | steal_rbio(cur, rbio); | ||
| 662 | cache_drop = cur; | ||
| 663 | spin_unlock(&cur->bio_list_lock); | ||
| 664 | |||
| 665 | goto lockit; | ||
| 666 | } | ||
| 667 | |||
| 668 | /* can we merge into the lock owner? */ | ||
| 669 | if (rbio_can_merge(cur, rbio)) { | ||
| 670 | merge_rbio(cur, rbio); | ||
| 671 | spin_unlock(&cur->bio_list_lock); | ||
| 672 | freeit = rbio; | ||
| 673 | ret = 1; | ||
| 674 | goto out; | ||
| 675 | } | ||
| 676 | |||
| 677 | |||
| 678 | /* | ||
| 679 | * we couldn't merge with the running | ||
| 680 | * rbio, see if we can merge with the | ||
| 681 | * pending ones. We don't have to | ||
| 682 | * check for rmw_locked because there | ||
| 683 | * is no way they are inside finish_rmw | ||
| 684 | * right now | ||
| 685 | */ | ||
| 686 | list_for_each_entry(pending, &cur->plug_list, | ||
| 687 | plug_list) { | ||
| 688 | if (rbio_can_merge(pending, rbio)) { | ||
| 689 | merge_rbio(pending, rbio); | ||
| 690 | spin_unlock(&cur->bio_list_lock); | ||
| 691 | freeit = rbio; | ||
| 692 | ret = 1; | ||
| 693 | goto out; | ||
| 694 | } | ||
| 695 | } | ||
| 696 | |||
| 697 | /* no merging, put us on the tail of the plug list, | ||
| 698 | * our rbio will be started when the currently | ||
| 699 | * running rbio unlocks | ||
| 700 | */ | ||
| 701 | list_add_tail(&rbio->plug_list, &cur->plug_list); | ||
| 702 | spin_unlock(&cur->bio_list_lock); | ||
| 703 | ret = 1; | ||
| 704 | goto out; | ||
| 705 | } | ||
| 706 | } | ||
| 707 | lockit: | ||
| 708 | atomic_inc(&rbio->refs); | ||
| 709 | list_add(&rbio->hash_list, &h->hash_list); | ||
| 710 | out: | ||
| 711 | spin_unlock_irqrestore(&h->lock, flags); | ||
| 712 | if (cache_drop) | ||
| 713 | remove_rbio_from_cache(cache_drop); | ||
| 714 | if (freeit) | ||
| 715 | __free_raid_bio(freeit); | ||
| 716 | return ret; | ||
| 717 | } | ||
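Per the three cases documented above, a submitter treats any non-zero return as a complete hand-off of the rbio. A sketch of the expected calling pattern (the real submit paths appear later in this file):

    if (lock_stripe_add(rbio))
            return 0;       /* merged or queued; the lock owner runs the IO */

    /* returned 0: we hold the stripe lock and must start the IO ourselves */
    async_rmw_stripe(rbio);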
| 718 | |||
| 719 | /* | ||
| 720 | * called as rmw or parity rebuild is completed. If the plug list has more | ||
| 721 | * rbios waiting for this stripe, the next one on the list will be started | ||
| 722 | */ | ||
| 723 | static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) | ||
| 724 | { | ||
| 725 | int bucket; | ||
| 726 | struct btrfs_stripe_hash *h; | ||
| 727 | unsigned long flags; | ||
| 728 | int keep_cache = 0; | ||
| 729 | |||
| 730 | bucket = rbio_bucket(rbio); | ||
| 731 | h = rbio->fs_info->stripe_hash_table->table + bucket; | ||
| 732 | |||
| 733 | if (list_empty(&rbio->plug_list)) | ||
| 734 | cache_rbio(rbio); | ||
| 735 | |||
| 736 | spin_lock_irqsave(&h->lock, flags); | ||
| 737 | spin_lock(&rbio->bio_list_lock); | ||
| 738 | |||
| 739 | if (!list_empty(&rbio->hash_list)) { | ||
| 740 | /* | ||
| 741 | * if we're still cached and there is no other IO | ||
| 742 | * to perform, just leave this rbio here for others | ||
| 743 | * to steal from later | ||
| 744 | */ | ||
| 745 | if (list_empty(&rbio->plug_list) && | ||
| 746 | test_bit(RBIO_CACHE_BIT, &rbio->flags)) { | ||
| 747 | keep_cache = 1; | ||
| 748 | clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
| 749 | BUG_ON(!bio_list_empty(&rbio->bio_list)); | ||
| 750 | goto done; | ||
| 751 | } | ||
| 752 | |||
| 753 | list_del_init(&rbio->hash_list); | ||
| 754 | atomic_dec(&rbio->refs); | ||
| 755 | |||
| 756 | /* | ||
| 757 | * we use the plug list to hold all the rbios | ||
| 758 | * waiting for the chance to lock this stripe. | ||
| 759 | * hand the lock over to one of them. | ||
| 760 | */ | ||
| 761 | if (!list_empty(&rbio->plug_list)) { | ||
| 762 | struct btrfs_raid_bio *next; | ||
| 763 | struct list_head *head = rbio->plug_list.next; | ||
| 764 | |||
| 765 | next = list_entry(head, struct btrfs_raid_bio, | ||
| 766 | plug_list); | ||
| 767 | |||
| 768 | list_del_init(&rbio->plug_list); | ||
| 769 | |||
| 770 | list_add(&next->hash_list, &h->hash_list); | ||
| 771 | atomic_inc(&next->refs); | ||
| 772 | spin_unlock(&rbio->bio_list_lock); | ||
| 773 | spin_unlock_irqrestore(&h->lock, flags); | ||
| 774 | |||
| 775 | if (next->read_rebuild) | ||
| 776 | async_read_rebuild(next); | ||
| 777 | else { | ||
| 778 | steal_rbio(rbio, next); | ||
| 779 | async_rmw_stripe(next); | ||
| 780 | } | ||
| 781 | |||
| 782 | goto done_nolock; | ||
| 783 | } else if (waitqueue_active(&h->wait)) { | ||
| 784 | spin_unlock(&rbio->bio_list_lock); | ||
| 785 | spin_unlock_irqrestore(&h->lock, flags); | ||
| 786 | wake_up(&h->wait); | ||
| 787 | goto done_nolock; | ||
| 788 | } | ||
| 789 | } | ||
| 790 | done: | ||
| 791 | spin_unlock(&rbio->bio_list_lock); | ||
| 792 | spin_unlock_irqrestore(&h->lock, flags); | ||
| 793 | |||
| 794 | done_nolock: | ||
| 795 | if (!keep_cache) | ||
| 796 | remove_rbio_from_cache(rbio); | ||
| 797 | } | ||
| 798 | |||
| 799 | static void __free_raid_bio(struct btrfs_raid_bio *rbio) | ||
| 800 | { | ||
| 801 | int i; | ||
| 802 | |||
| 803 | WARN_ON(atomic_read(&rbio->refs) < 0); | ||
| 804 | if (!atomic_dec_and_test(&rbio->refs)) | ||
| 805 | return; | ||
| 806 | |||
| 807 | WARN_ON(!list_empty(&rbio->stripe_cache)); | ||
| 808 | WARN_ON(!list_empty(&rbio->hash_list)); | ||
| 809 | WARN_ON(!bio_list_empty(&rbio->bio_list)); | ||
| 810 | |||
| 811 | for (i = 0; i < rbio->nr_pages; i++) { | ||
| 812 | if (rbio->stripe_pages[i]) { | ||
| 813 | __free_page(rbio->stripe_pages[i]); | ||
| 814 | rbio->stripe_pages[i] = NULL; | ||
| 815 | } | ||
| 816 | } | ||
| 817 | kfree(rbio->raid_map); | ||
| 818 | kfree(rbio->bbio); | ||
| 819 | kfree(rbio); | ||
| 820 | } | ||
| 821 | |||
| 822 | static void free_raid_bio(struct btrfs_raid_bio *rbio) | ||
| 823 | { | ||
| 824 | unlock_stripe(rbio); | ||
| 825 | __free_raid_bio(rbio); | ||
| 826 | } | ||
| 827 | |||
| 828 | /* | ||
| 829 | * this frees the rbio and runs through all the bios in the | ||
| 830 | * bio_list and calls end_io on them | ||
| 831 | */ | ||
| 832 | static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) | ||
| 833 | { | ||
| 834 | struct bio *cur = bio_list_get(&rbio->bio_list); | ||
| 835 | struct bio *next; | ||
| 836 | free_raid_bio(rbio); | ||
| 837 | |||
| 838 | while (cur) { | ||
| 839 | next = cur->bi_next; | ||
| 840 | cur->bi_next = NULL; | ||
| 841 | if (uptodate) | ||
| 842 | set_bit(BIO_UPTODATE, &cur->bi_flags); | ||
| 843 | bio_endio(cur, err); | ||
| 844 | cur = next; | ||
| 845 | } | ||
| 846 | } | ||
| 847 | |||
| 848 | /* | ||
| 849 | * end io function used by finish_rmw. When we finally | ||
| 850 | * get here, we've written a full stripe | ||
| 851 | */ | ||
| 852 | static void raid_write_end_io(struct bio *bio, int err) | ||
| 853 | { | ||
| 854 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
| 855 | |||
| 856 | if (err) | ||
| 857 | fail_bio_stripe(rbio, bio); | ||
| 858 | |||
| 859 | bio_put(bio); | ||
| 860 | |||
| 861 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
| 862 | return; | ||
| 863 | |||
| 864 | err = 0; | ||
| 865 | |||
| 866 | /* OK, we have written all the stripes we need to. */ | ||
| 867 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
| 868 | err = -EIO; | ||
| 869 | |||
| 870 | rbio_orig_end_io(rbio, err, 0); | ||
| 871 | return; | ||
| 872 | } | ||
| 873 | |||
| 874 | /* | ||
| 875 | * the read/modify/write code wants to use the original bio for | ||
| 876 | * any pages it included, and then use the rbio for everything | ||
| 877 | * else. This function decides if a given index (stripe number) | ||
| 878 | * and page number in that stripe fall inside the original bio | ||
| 879 | * or the rbio. | ||
| 880 | * | ||
| 881 | * if you set bio_list_only, you'll get a NULL back for any ranges | ||
| 882 | * that are outside the bio_list | ||
| 883 | * | ||
| 884 | * This doesn't take any refs on anything, you get a bare page pointer | ||
| 885 | * and the caller must bump refs as required. | ||
| 886 | * | ||
| 887 | * You must call index_rbio_pages once before you can trust | ||
| 888 | * the answers from this function. | ||
| 889 | */ | ||
| 890 | static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, | ||
| 891 | int index, int pagenr, int bio_list_only) | ||
| 892 | { | ||
| 893 | int chunk_page; | ||
| 894 | struct page *p = NULL; | ||
| 895 | |||
| 896 | chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; | ||
| 897 | |||
| 898 | spin_lock_irq(&rbio->bio_list_lock); | ||
| 899 | p = rbio->bio_pages[chunk_page]; | ||
| 900 | spin_unlock_irq(&rbio->bio_list_lock); | ||
| 901 | |||
| 902 | if (p || bio_list_only) | ||
| 903 | return p; | ||
| 904 | |||
| 905 | return rbio->stripe_pages[chunk_page]; | ||
| 906 | } | ||
| 907 | |||
| 908 | /* | ||
| 909 | * number of pages we need for the entire stripe across all the | ||
| 910 | * drives | ||
| 911 | */ | ||
| 912 | static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) | ||
| 913 | { | ||
| 914 | unsigned long nr = stripe_len * nr_stripes; | ||
| 915 | return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 916 | } | ||
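A quick worked example of the page math: with the default 64 KiB stripe_len across three stripes (two data plus P) and 4 KiB pages,

    /* nr = 65536 * 3 = 196608 bytes
     * (196608 + 4095) >> 12 = 48 pages for the whole rbio */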
| 917 | |||
| 918 | /* | ||
| 919 | * allocation and initial setup for the btrfs_raid_bio. Note | ||
| 920 | * that this does not allocate any pages for rbio->stripe_pages. | ||
| 921 | */ | ||
| 922 | static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, | ||
| 923 | struct btrfs_bio *bbio, u64 *raid_map, | ||
| 924 | u64 stripe_len) | ||
| 925 | { | ||
| 926 | struct btrfs_raid_bio *rbio; | ||
| 927 | int nr_data = 0; | ||
| 928 | int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); | ||
| 929 | void *p; | ||
| 930 | |||
| 931 | rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, | ||
| 932 | GFP_NOFS); | ||
| 933 | if (!rbio) { | ||
| 934 | kfree(raid_map); | ||
| 935 | kfree(bbio); | ||
| 936 | return ERR_PTR(-ENOMEM); | ||
| 937 | } | ||
| 938 | |||
| 939 | bio_list_init(&rbio->bio_list); | ||
| 940 | INIT_LIST_HEAD(&rbio->plug_list); | ||
| 941 | spin_lock_init(&rbio->bio_list_lock); | ||
| 942 | INIT_LIST_HEAD(&rbio->stripe_cache); | ||
| 943 | INIT_LIST_HEAD(&rbio->hash_list); | ||
| 944 | rbio->bbio = bbio; | ||
| 945 | rbio->raid_map = raid_map; | ||
| 946 | rbio->fs_info = root->fs_info; | ||
| 947 | rbio->stripe_len = stripe_len; | ||
| 948 | rbio->nr_pages = num_pages; | ||
| 949 | rbio->faila = -1; | ||
| 950 | rbio->failb = -1; | ||
| 951 | atomic_set(&rbio->refs, 1); | ||
| 952 | |||
| 953 | /* | ||
| 954 | * the stripe_pages and bio_pages array point to the extra | ||
| 955 | * memory we allocated past the end of the rbio | ||
| 956 | */ | ||
| 957 | p = rbio + 1; | ||
| 958 | rbio->stripe_pages = p; | ||
| 959 | rbio->bio_pages = p + sizeof(struct page *) * num_pages; | ||
| 960 | |||
| 961 | if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) | ||
| 962 | nr_data = bbio->num_stripes - 2; | ||
| 963 | else | ||
| 964 | nr_data = bbio->num_stripes - 1; | ||
| 965 | |||
| 966 | rbio->nr_data = nr_data; | ||
| 967 | return rbio; | ||
| 968 | } | ||
| 969 | |||
| 970 | /* allocate pages for all the stripes in the bio, including parity */ | ||
| 971 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) | ||
| 972 | { | ||
| 973 | int i; | ||
| 974 | struct page *page; | ||
| 975 | |||
| 976 | for (i = 0; i < rbio->nr_pages; i++) { | ||
| 977 | if (rbio->stripe_pages[i]) | ||
| 978 | continue; | ||
| 979 | page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
| 980 | if (!page) | ||
| 981 | return -ENOMEM; | ||
| 982 | rbio->stripe_pages[i] = page; | ||
| 983 | ClearPageUptodate(page); | ||
| 984 | } | ||
| 985 | return 0; | ||
| 986 | } | ||
| 987 | |||
| 988 | /* allocate pages for just the p/q stripes */ | ||
| 989 | static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) | ||
| 990 | { | ||
| 991 | int i; | ||
| 992 | struct page *page; | ||
| 993 | |||
| 994 | i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; | ||
| 995 | |||
| 996 | for (; i < rbio->nr_pages; i++) { | ||
| 997 | if (rbio->stripe_pages[i]) | ||
| 998 | continue; | ||
| 999 | page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
| 1000 | if (!page) | ||
| 1001 | return -ENOMEM; | ||
| 1002 | rbio->stripe_pages[i] = page; | ||
| 1003 | } | ||
| 1004 | return 0; | ||
| 1005 | } | ||
| 1006 | |||
| 1007 | /* | ||
| 1008 | * add a single page from a specific stripe into our list of bios for IO | ||
| 1009 | * this will try to merge into existing bios if possible, and returns | ||
| 1010 | * zero if all went well. | ||
| 1011 | */ | ||
| 1012 | int rbio_add_io_page(struct btrfs_raid_bio *rbio, | ||
| 1013 | struct bio_list *bio_list, | ||
| 1014 | struct page *page, | ||
| 1015 | int stripe_nr, | ||
| 1016 | unsigned long page_index, | ||
| 1017 | unsigned long bio_max_len) | ||
| 1018 | { | ||
| 1019 | struct bio *last = bio_list->tail; | ||
| 1020 | u64 last_end = 0; | ||
| 1021 | int ret; | ||
| 1022 | struct bio *bio; | ||
| 1023 | struct btrfs_bio_stripe *stripe; | ||
| 1024 | u64 disk_start; | ||
| 1025 | |||
| 1026 | stripe = &rbio->bbio->stripes[stripe_nr]; | ||
| 1027 | disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); | ||
| 1028 | |||
| 1029 | /* if the device is missing, just fail this stripe */ | ||
| 1030 | if (!stripe->dev->bdev) | ||
| 1031 | return fail_rbio_index(rbio, stripe_nr); | ||
| 1032 | |||
| 1033 | /* see if we can add this page onto our existing bio */ | ||
| 1034 | if (last) { | ||
| 1035 | last_end = (u64)last->bi_sector << 9; | ||
| 1036 | last_end += last->bi_size; | ||
| 1037 | |||
| 1038 | /* | ||
| 1039 | * we can't merge these if they are from different | ||
| 1040 | * devices or if they are not contiguous | ||
| 1041 | */ | ||
| 1042 | if (last_end == disk_start && stripe->dev->bdev && | ||
| 1043 | test_bit(BIO_UPTODATE, &last->bi_flags) && | ||
| 1044 | last->bi_bdev == stripe->dev->bdev) { | ||
| 1045 | ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); | ||
| 1046 | if (ret == PAGE_CACHE_SIZE) | ||
| 1047 | return 0; | ||
| 1048 | } | ||
| 1049 | } | ||
| 1050 | |||
| 1051 | /* put a new bio on the list */ | ||
| 1052 | bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1); | ||
| 1053 | if (!bio) | ||
| 1054 | return -ENOMEM; | ||
| 1055 | |||
| 1056 | bio->bi_size = 0; | ||
| 1057 | bio->bi_bdev = stripe->dev->bdev; | ||
| 1058 | bio->bi_sector = disk_start >> 9; | ||
| 1059 | set_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 1060 | |||
| 1061 | bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); | ||
| 1062 | bio_list_add(bio_list, bio); | ||
| 1063 | return 0; | ||
| 1064 | } | ||
| 1065 | |||
| 1066 | /* | ||
| 1067 | * while we're doing the read/modify/write cycle, we could | ||
| 1068 | * have errors in reading pages off the disk. This checks | ||
| 1069 | * for errors and if we're not able to read the page it'll | ||
| 1070 | * trigger parity reconstruction. The rmw will be finished | ||
| 1071 | * after we've reconstructed the failed stripes | ||
| 1072 | */ | ||
| 1073 | static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) | ||
| 1074 | { | ||
| 1075 | if (rbio->faila >= 0 || rbio->failb >= 0) { | ||
| 1076 | BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); | ||
| 1077 | __raid56_parity_recover(rbio); | ||
| 1078 | } else { | ||
| 1079 | finish_rmw(rbio); | ||
| 1080 | } | ||
| 1081 | } | ||
| 1082 | |||
| 1083 | /* | ||
| 1084 | * these are just the pages from the rbio array, not from anything | ||
| 1085 | * the FS sent down to us | ||
| 1086 | */ | ||
| 1087 | static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) | ||
| 1088 | { | ||
| 1089 | int index; | ||
| 1090 | index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); | ||
| 1091 | index += page; | ||
| 1092 | return rbio->stripe_pages[index]; | ||
| 1093 | } | ||
| 1094 | |||
| 1095 | /* | ||
| 1096 | * helper function to walk our bio list and populate the bio_pages array with | ||
| 1097 | * the result. This seems expensive, but it is faster than constantly | ||
| 1098 | * searching through the bio list as we set up the IO in finish_rmw or stripe | ||
| 1099 | * reconstruction. | ||
| 1100 | * | ||
| 1101 | * This must be called before you trust the answers from page_in_rbio | ||
| 1102 | */ | ||
| 1103 | static void index_rbio_pages(struct btrfs_raid_bio *rbio) | ||
| 1104 | { | ||
| 1105 | struct bio *bio; | ||
| 1106 | u64 start; | ||
| 1107 | unsigned long stripe_offset; | ||
| 1108 | unsigned long page_index; | ||
| 1109 | struct page *p; | ||
| 1110 | int i; | ||
| 1111 | |||
| 1112 | spin_lock_irq(&rbio->bio_list_lock); | ||
| 1113 | bio_list_for_each(bio, &rbio->bio_list) { | ||
| 1114 | start = (u64)bio->bi_sector << 9; | ||
| 1115 | stripe_offset = start - rbio->raid_map[0]; | ||
| 1116 | page_index = stripe_offset >> PAGE_CACHE_SHIFT; | ||
| 1117 | |||
| 1118 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
| 1119 | p = bio->bi_io_vec[i].bv_page; | ||
| 1120 | rbio->bio_pages[page_index + i] = p; | ||
| 1121 | } | ||
| 1122 | } | ||
| 1123 | spin_unlock_irq(&rbio->bio_list_lock); | ||
| 1124 | } | ||
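The indexing works because every bio spans page-aligned offsets from the start of the full stripe. For one bio (numbers are illustrative):

    /* raid_map[0] = 0x40000000 (1 GiB), bio starts 12 KiB in:
     *   start         = 0x40003000  (bi_sector << 9)
     *   stripe_offset = 0x3000
     *   page_index    = 0x3000 >> PAGE_CACHE_SHIFT = 3 */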
| 1125 | |||
| 1126 | /* | ||
| 1127 | * this is called from one of two situations. We either | ||
| 1128 | * have a full stripe from the higher layers, or we've read all | ||
| 1129 | * the missing bits off disk. | ||
| 1130 | * | ||
| 1131 | * This will calculate the parity and then send down any | ||
| 1132 | * changed blocks. | ||
| 1133 | */ | ||
| 1134 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | ||
| 1135 | { | ||
| 1136 | struct btrfs_bio *bbio = rbio->bbio; | ||
| 1137 | void *pointers[bbio->num_stripes]; | ||
| 1138 | int stripe_len = rbio->stripe_len; | ||
| 1139 | int nr_data = rbio->nr_data; | ||
| 1140 | int stripe; | ||
| 1141 | int pagenr; | ||
| 1142 | int p_stripe = -1; | ||
| 1143 | int q_stripe = -1; | ||
| 1144 | struct bio_list bio_list; | ||
| 1145 | struct bio *bio; | ||
| 1146 | int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; | ||
| 1147 | int ret; | ||
| 1148 | |||
| 1149 | bio_list_init(&bio_list); | ||
| 1150 | |||
| 1151 | if (bbio->num_stripes - rbio->nr_data == 1) { | ||
| 1152 | p_stripe = bbio->num_stripes - 1; | ||
| 1153 | } else if (bbio->num_stripes - rbio->nr_data == 2) { | ||
| 1154 | p_stripe = bbio->num_stripes - 2; | ||
| 1155 | q_stripe = bbio->num_stripes - 1; | ||
| 1156 | } else { | ||
| 1157 | BUG(); | ||
| 1158 | } | ||
| 1159 | |||
| 1160 | /* at this point we either have a full stripe, | ||
| 1161 | * or we've read the full stripe from the drive. | ||
| 1162 | * recalculate the parity and write the new results. | ||
| 1163 | * | ||
| 1164 | * We're not allowed to add any new bios to the | ||
| 1165 | * bio list here, anyone else that wants to | ||
| 1166 | * change this stripe needs to do their own rmw. | ||
| 1167 | */ | ||
| 1168 | spin_lock_irq(&rbio->bio_list_lock); | ||
| 1169 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
| 1170 | spin_unlock_irq(&rbio->bio_list_lock); | ||
| 1171 | |||
| 1172 | atomic_set(&rbio->bbio->error, 0); | ||
| 1173 | |||
| 1174 | /* | ||
| 1175 | * now that we've set rmw_locked, run through the | ||
| 1176 | * bio list one last time and map the page pointers | ||
| 1177 | * | ||
| 1178 | * We don't cache full rbios because we're assuming | ||
| 1179 | * the higher layers are unlikely to use this area of | ||
| 1180 | * the disk again soon. If they do use it again, | ||
| 1181 | * hopefully they will send another full bio. | ||
| 1182 | */ | ||
| 1183 | index_rbio_pages(rbio); | ||
| 1184 | if (!rbio_is_full(rbio)) | ||
| 1185 | cache_rbio_pages(rbio); | ||
| 1186 | else | ||
| 1187 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
| 1188 | |||
| 1189 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | ||
| 1190 | struct page *p; | ||
| 1191 | /* first collect one page from each data stripe */ | ||
| 1192 | for (stripe = 0; stripe < nr_data; stripe++) { | ||
| 1193 | p = page_in_rbio(rbio, stripe, pagenr, 0); | ||
| 1194 | pointers[stripe] = kmap(p); | ||
| 1195 | } | ||
| 1196 | |||
| 1197 | /* then add the parity stripe */ | ||
| 1198 | p = rbio_pstripe_page(rbio, pagenr); | ||
| 1199 | SetPageUptodate(p); | ||
| 1200 | pointers[stripe++] = kmap(p); | ||
| 1201 | |||
| 1202 | if (q_stripe != -1) { | ||
| 1203 | |||
| 1204 | /* | ||
| 1205 | * raid6, add the qstripe and call the | ||
| 1206 | * library function to fill in our p/q | ||
| 1207 | */ | ||
| 1208 | p = rbio_qstripe_page(rbio, pagenr); | ||
| 1209 | SetPageUptodate(p); | ||
| 1210 | pointers[stripe++] = kmap(p); | ||
| 1211 | |||
| 1212 | raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, | ||
| 1213 | pointers); | ||
| 1214 | } else { | ||
| 1215 | /* raid5 */ | ||
| 1216 | memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); | ||
| 1217 | run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); | ||
| 1218 | } | ||
| 1219 | |||
| 1220 | |||
| 1221 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) | ||
| 1222 | kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); | ||
| 1223 | } | ||
| 1224 | |||
| 1225 | /* | ||
| 1226 | * time to start writing. Make bios for everything from the | ||
| 1227 | * higher layers (the bio_list in our rbio) and our p/q. Ignore | ||
| 1228 | * everything else. | ||
| 1229 | */ | ||
| 1230 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | ||
| 1231 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | ||
| 1232 | struct page *page; | ||
| 1233 | if (stripe < rbio->nr_data) { | ||
| 1234 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
| 1235 | if (!page) | ||
| 1236 | continue; | ||
| 1237 | } else { | ||
| 1238 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
| 1239 | } | ||
| 1240 | |||
| 1241 | ret = rbio_add_io_page(rbio, &bio_list, | ||
| 1242 | page, stripe, pagenr, rbio->stripe_len); | ||
| 1243 | if (ret) | ||
| 1244 | goto cleanup; | ||
| 1245 | } | ||
| 1246 | } | ||
| 1247 | |||
| 1248 | atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); | ||
| 1249 | BUG_ON(atomic_read(&bbio->stripes_pending) == 0); | ||
| 1250 | |||
| 1251 | while (1) { | ||
| 1252 | bio = bio_list_pop(&bio_list); | ||
| 1253 | if (!bio) | ||
| 1254 | break; | ||
| 1255 | |||
| 1256 | bio->bi_private = rbio; | ||
| 1257 | bio->bi_end_io = raid_write_end_io; | ||
| 1258 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
| 1259 | submit_bio(WRITE, bio); | ||
| 1260 | } | ||
| 1261 | return; | ||
| 1262 | |||
| 1263 | cleanup: | ||
| 1264 | rbio_orig_end_io(rbio, -EIO, 0); | ||
| 1265 | } | ||
| 1266 | |||
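An aside on the parity math above: for RAID5 the P stripe is just the byte-wise XOR of the data stripes, which is exactly what the memcpy plus run_xor pair computes, while the RAID6 branch hands all the pages to raid6_call.gen_syndrome() to produce P and Q together. A minimal user-space sketch of the XOR version and the recovery property it buys (plain C; buffer names and the 4K size are illustrative, not kernel API):

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#define STRIPE_PAGE 4096	/* stand-in for PAGE_SIZE; 4K assumed */

static void xor_into(unsigned char *dst, const unsigned char *src, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	unsigned char *data[3];
	unsigned char parity[STRIPE_PAGE], rebuilt[STRIPE_PAGE];
	int i;

	for (i = 0; i < 3; i++) {
		data[i] = malloc(STRIPE_PAGE);
		memset(data[i], 0x11 * (i + 1), STRIPE_PAGE);
	}

	/* same shape as the memcpy + run_xor pair in finish_rmw() */
	memcpy(parity, data[0], STRIPE_PAGE);
	for (i = 1; i < 3; i++)
		xor_into(parity, data[i], STRIPE_PAGE);

	/* the recovery property: P XOR the survivors rebuilds the loss */
	memcpy(rebuilt, parity, STRIPE_PAGE);
	xor_into(rebuilt, data[0], STRIPE_PAGE);
	xor_into(rebuilt, data[2], STRIPE_PAGE);
	assert(memcmp(rebuilt, data[1], STRIPE_PAGE) == 0);

	for (i = 0; i < 3; i++)
		free(data[i]);
	return 0;
}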
| 1267 | /* | ||
| 1268 | * helper to find the stripe number for a given bio. Used to figure out which | ||
| 1269 | * stripe has failed. This expects the bio to correspond to a physical disk, | ||
| 1270 | * so it looks up based on physical sector numbers. | ||
| 1271 | */ | ||
| 1272 | static int find_bio_stripe(struct btrfs_raid_bio *rbio, | ||
| 1273 | struct bio *bio) | ||
| 1274 | { | ||
| 1275 | u64 physical = bio->bi_sector; | ||
| 1276 | u64 stripe_start; | ||
| 1277 | int i; | ||
| 1278 | struct btrfs_bio_stripe *stripe; | ||
| 1279 | |||
| 1280 | physical <<= 9; | ||
| 1281 | |||
| 1282 | for (i = 0; i < rbio->bbio->num_stripes; i++) { | ||
| 1283 | stripe = &rbio->bbio->stripes[i]; | ||
| 1284 | stripe_start = stripe->physical; | ||
| 1285 | if (physical >= stripe_start && | ||
| 1286 | physical < stripe_start + rbio->stripe_len) { | ||
| 1287 | return i; | ||
| 1288 | } | ||
| 1289 | } | ||
| 1290 | return -1; | ||
| 1291 | } | ||
| 1292 | |||
| 1293 | /* | ||
| 1294 | * helper to find the stripe number for a given | ||
| 1295 | * bio (before mapping). Used to figure out which stripe has | ||
| 1296 | * failed. This looks up based on logical block numbers. | ||
| 1297 | */ | ||
| 1298 | static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, | ||
| 1299 | struct bio *bio) | ||
| 1300 | { | ||
| 1301 | u64 logical = bio->bi_sector; | ||
| 1302 | u64 stripe_start; | ||
| 1303 | int i; | ||
| 1304 | |||
| 1305 | logical <<= 9; | ||
| 1306 | |||
| 1307 | for (i = 0; i < rbio->nr_data; i++) { | ||
| 1308 | stripe_start = rbio->raid_map[i]; | ||
| 1309 | if (logical >= stripe_start && | ||
| 1310 | logical < stripe_start + rbio->stripe_len) { | ||
| 1311 | return i; | ||
| 1312 | } | ||
| 1313 | } | ||
| 1314 | return -1; | ||
| 1315 | } | ||
| 1316 | |||
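Both lookups above do the same two steps: shift bi_sector left by 9 to turn 512-byte sectors into a byte address, then test half-open containment in [stripe_start, stripe_start + stripe_len). A tiny standalone check of the arithmetic (all values hypothetical):

#include <assert.h>
#include <stdint.h>

static int addr_in_stripe(uint64_t addr, uint64_t start, uint64_t len)
{
	return addr >= start && addr < start + len;
}

int main(void)
{
	uint64_t sector = 2048;		/* bi_sector counts 512-byte units */
	uint64_t addr = sector << 9;	/* 2048 * 512 = 1 MiB */

	assert(addr == 1048576);
	assert(addr_in_stripe(addr, 1048576, 65536));	/* first byte of stripe */
	assert(!addr_in_stripe(addr, 1048576 + 65536, 65536));
	return 0;
}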
| 1317 | /* | ||
| 1318 | * returns -EIO if we had too many failures | ||
| 1319 | */ | ||
| 1320 | static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) | ||
| 1321 | { | ||
| 1322 | unsigned long flags; | ||
| 1323 | int ret = 0; | ||
| 1324 | |||
| 1325 | spin_lock_irqsave(&rbio->bio_list_lock, flags); | ||
| 1326 | |||
| 1327 | /* we already know this stripe is bad, move on */ | ||
| 1328 | if (rbio->faila == failed || rbio->failb == failed) | ||
| 1329 | goto out; | ||
| 1330 | |||
| 1331 | if (rbio->faila == -1) { | ||
| 1332 | /* first failure on this rbio */ | ||
| 1333 | rbio->faila = failed; | ||
| 1334 | atomic_inc(&rbio->bbio->error); | ||
| 1335 | } else if (rbio->failb == -1) { | ||
| 1336 | /* second failure on this rbio */ | ||
| 1337 | rbio->failb = failed; | ||
| 1338 | atomic_inc(&rbio->bbio->error); | ||
| 1339 | } else { | ||
| 1340 | ret = -EIO; | ||
| 1341 | } | ||
| 1342 | out: | ||
| 1343 | spin_unlock_irqrestore(&rbio->bio_list_lock, flags); | ||
| 1344 | |||
| 1345 | return ret; | ||
| 1346 | } | ||
| 1347 | |||
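fail_rbio_index() caps the damage at two distinct stripes per rbio, which is all P (one loss) and P+Q (two losses) can reconstruct; a third distinct failure is reported as -EIO. The same bookkeeping outside the kernel, minus the locking (names illustrative):

#include <assert.h>
#include <errno.h>

struct fail_state {
	int faila;	/* -1 means unused */
	int failb;
};

static int record_failure(struct fail_state *s, int stripe)
{
	if (s->faila == stripe || s->failb == stripe)
		return 0;		/* already known bad */
	if (s->faila == -1)
		s->faila = stripe;
	else if (s->failb == -1)
		s->failb = stripe;
	else
		return -EIO;		/* third distinct failure: unrecoverable */
	return 0;
}

int main(void)
{
	struct fail_state s = { -1, -1 };

	assert(record_failure(&s, 2) == 0);
	assert(record_failure(&s, 2) == 0);	/* duplicate is harmless */
	assert(record_failure(&s, 4) == 0);
	assert(record_failure(&s, 5) == -EIO);
	return 0;
}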
| 1348 | /* | ||
| 1349 | * helper to fail a stripe based on a physical disk | ||
| 1350 | * bio. | ||
| 1351 | */ | ||
| 1352 | static int fail_bio_stripe(struct btrfs_raid_bio *rbio, | ||
| 1353 | struct bio *bio) | ||
| 1354 | { | ||
| 1355 | int failed = find_bio_stripe(rbio, bio); | ||
| 1356 | |||
| 1357 | if (failed < 0) | ||
| 1358 | return -EIO; | ||
| 1359 | |||
| 1360 | return fail_rbio_index(rbio, failed); | ||
| 1361 | } | ||
| 1362 | |||
| 1363 | /* | ||
| 1364 | * this sets each page in the bio uptodate. It should only be used on private | ||
| 1365 | * rbio pages, nothing that comes in from the higher layers | ||
| 1366 | */ | ||
| 1367 | static void set_bio_pages_uptodate(struct bio *bio) | ||
| 1368 | { | ||
| 1369 | int i; | ||
| 1370 | struct page *p; | ||
| 1371 | |||
| 1372 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
| 1373 | p = bio->bi_io_vec[i].bv_page; | ||
| 1374 | SetPageUptodate(p); | ||
| 1375 | } | ||
| 1376 | } | ||
| 1377 | |||
| 1378 | /* | ||
| 1379 | * end io for the read phase of the rmw cycle. All the bios here are physical | ||
| 1380 | * stripe bios we've read from the disk so we can recalculate the parity of the | ||
| 1381 | * stripe. | ||
| 1382 | * | ||
| 1383 | * This will usually kick off finish_rmw once all the bios are read in, but it | ||
| 1384 | * may trigger parity reconstruction if we had any errors along the way | ||
| 1385 | */ | ||
| 1386 | static void raid_rmw_end_io(struct bio *bio, int err) | ||
| 1387 | { | ||
| 1388 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
| 1389 | |||
| 1390 | if (err) | ||
| 1391 | fail_bio_stripe(rbio, bio); | ||
| 1392 | else | ||
| 1393 | set_bio_pages_uptodate(bio); | ||
| 1394 | |||
| 1395 | bio_put(bio); | ||
| 1396 | |||
| 1397 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
| 1398 | return; | ||
| 1399 | |||
| 1400 | err = 0; | ||
| 1401 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
| 1402 | goto cleanup; | ||
| 1403 | |||
| 1404 | /* | ||
| 1405 | * this will normally call finish_rmw to start our write | ||
| 1406 | * but if there are any failed stripes we'll reconstruct | ||
| 1407 | * from parity first | ||
| 1408 | */ | ||
| 1409 | validate_rbio_for_rmw(rbio); | ||
| 1410 | return; | ||
| 1411 | |||
| 1412 | cleanup: | ||
| 1413 | |||
| 1414 | rbio_orig_end_io(rbio, -EIO, 0); | ||
| 1415 | } | ||
| 1416 | |||
| 1417 | static void async_rmw_stripe(struct btrfs_raid_bio *rbio) | ||
| 1418 | { | ||
| 1419 | rbio->work.flags = 0; | ||
| 1420 | rbio->work.func = rmw_work; | ||
| 1421 | |||
| 1422 | btrfs_queue_worker(&rbio->fs_info->rmw_workers, | ||
| 1423 | &rbio->work); | ||
| 1424 | } | ||
| 1425 | |||
| 1426 | static void async_read_rebuild(struct btrfs_raid_bio *rbio) | ||
| 1427 | { | ||
| 1428 | rbio->work.flags = 0; | ||
| 1429 | rbio->work.func = read_rebuild_work; | ||
| 1430 | |||
| 1431 | btrfs_queue_worker(&rbio->fs_info->rmw_workers, | ||
| 1432 | &rbio->work); | ||
| 1433 | } | ||
| 1434 | |||
| 1435 | /* | ||
| 1436 | * the stripe must be locked by the caller. It will | ||
| 1437 | * unlock after all the writes are done | ||
| 1438 | */ | ||
| 1439 | static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) | ||
| 1440 | { | ||
| 1441 | int bios_to_read = 0; | ||
| 1442 | struct btrfs_bio *bbio = rbio->bbio; | ||
| 1443 | struct bio_list bio_list; | ||
| 1444 | int ret; | ||
| 1445 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 1446 | int pagenr; | ||
| 1447 | int stripe; | ||
| 1448 | struct bio *bio; | ||
| 1449 | |||
| 1450 | bio_list_init(&bio_list); | ||
| 1451 | |||
| 1452 | ret = alloc_rbio_pages(rbio); | ||
| 1453 | if (ret) | ||
| 1454 | goto cleanup; | ||
| 1455 | |||
| 1456 | index_rbio_pages(rbio); | ||
| 1457 | |||
| 1458 | atomic_set(&rbio->bbio->error, 0); | ||
| 1459 | /* | ||
| 1460 | * build a list of bios to read all the missing parts of this | ||
| 1461 | * stripe | ||
| 1462 | */ | ||
| 1463 | for (stripe = 0; stripe < rbio->nr_data; stripe++) { | ||
| 1464 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
| 1465 | struct page *page; | ||
| 1466 | /* | ||
| 1467 | * we want to find all the pages missing from | ||
| 1468 | * the rbio and read them from the disk. If | ||
| 1469 | * page_in_rbio finds a page in the bio list | ||
| 1470 | * we don't need to read it off the stripe. | ||
| 1471 | */ | ||
| 1472 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
| 1473 | if (page) | ||
| 1474 | continue; | ||
| 1475 | |||
| 1476 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
| 1477 | /* | ||
| 1478 | * the bio cache may have handed us an uptodate | ||
| 1479 | * page. If so, be happy and use it | ||
| 1480 | */ | ||
| 1481 | if (PageUptodate(page)) | ||
| 1482 | continue; | ||
| 1483 | |||
| 1484 | ret = rbio_add_io_page(rbio, &bio_list, page, | ||
| 1485 | stripe, pagenr, rbio->stripe_len); | ||
| 1486 | if (ret) | ||
| 1487 | goto cleanup; | ||
| 1488 | } | ||
| 1489 | } | ||
| 1490 | |||
| 1491 | bios_to_read = bio_list_size(&bio_list); | ||
| 1492 | if (!bios_to_read) { | ||
| 1493 | /* | ||
| 1494 | * this can happen if others have merged with | ||
| 1495 | * us; it means there is nothing left to read. | ||
| 1496 | * But if there are missing devices it may not be | ||
| 1497 | * safe to do the full stripe write yet. | ||
| 1498 | */ | ||
| 1499 | goto finish; | ||
| 1500 | } | ||
| 1501 | |||
| 1502 | /* | ||
| 1503 | * the bbio may be freed once we submit the last bio. Make sure | ||
| 1504 | * not to touch it after that | ||
| 1505 | */ | ||
| 1506 | atomic_set(&bbio->stripes_pending, bios_to_read); | ||
| 1507 | while (1) { | ||
| 1508 | bio = bio_list_pop(&bio_list); | ||
| 1509 | if (!bio) | ||
| 1510 | break; | ||
| 1511 | |||
| 1512 | bio->bi_private = rbio; | ||
| 1513 | bio->bi_end_io = raid_rmw_end_io; | ||
| 1514 | |||
| 1515 | btrfs_bio_wq_end_io(rbio->fs_info, bio, | ||
| 1516 | BTRFS_WQ_ENDIO_RAID56); | ||
| 1517 | |||
| 1518 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
| 1519 | submit_bio(READ, bio); | ||
| 1520 | } | ||
| 1521 | /* the actual write will happen once the reads are done */ | ||
| 1522 | return 0; | ||
| 1523 | |||
| 1524 | cleanup: | ||
| 1525 | rbio_orig_end_io(rbio, -EIO, 0); | ||
| 1526 | return -EIO; | ||
| 1527 | |||
| 1528 | finish: | ||
| 1529 | validate_rbio_for_rmw(rbio); | ||
| 1530 | return 0; | ||
| 1531 | } | ||
| 1532 | |||
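One idiom in raid56_rmw_stripe() worth spelling out: nr_pages is (stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT, the usual round-up division by a power of two. A quick check, assuming 4K pages:

#include <assert.h>
#include <stdint.h>

#define PG_SHIFT 12
#define PG_SIZE  (1UL << PG_SHIFT)	/* 4096 */

int main(void)
{
	uint64_t stripe_len = 65536;	/* exact multiple: 16 pages */

	assert(((stripe_len + PG_SIZE - 1) >> PG_SHIFT) == 16);

	stripe_len = 65537;		/* one byte over rounds up to 17 */
	assert(((stripe_len + PG_SIZE - 1) >> PG_SHIFT) == 17);
	return 0;
}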
| 1533 | /* | ||
| 1534 | * if the upper layers pass in a full stripe, we thank them by only allocating | ||
| 1535 | * enough pages to hold the parity, and sending it all down quickly. | ||
| 1536 | */ | ||
| 1537 | static int full_stripe_write(struct btrfs_raid_bio *rbio) | ||
| 1538 | { | ||
| 1539 | int ret; | ||
| 1540 | |||
| 1541 | ret = alloc_rbio_parity_pages(rbio); | ||
| 1542 | if (ret) | ||
| 1543 | return ret; | ||
| 1544 | |||
| 1545 | ret = lock_stripe_add(rbio); | ||
| 1546 | if (ret == 0) | ||
| 1547 | finish_rmw(rbio); | ||
| 1548 | return 0; | ||
| 1549 | } | ||
| 1550 | |||
| 1551 | /* | ||
| 1552 | * partial stripe writes get handed over to async helpers. | ||
| 1553 | * We're really hoping to merge a few more writes into this | ||
| 1554 | * rbio before calculating new parity | ||
| 1555 | */ | ||
| 1556 | static int partial_stripe_write(struct btrfs_raid_bio *rbio) | ||
| 1557 | { | ||
| 1558 | int ret; | ||
| 1559 | |||
| 1560 | ret = lock_stripe_add(rbio); | ||
| 1561 | if (ret == 0) | ||
| 1562 | async_rmw_stripe(rbio); | ||
| 1563 | return 0; | ||
| 1564 | } | ||
| 1565 | |||
| 1566 | /* | ||
| 1567 | * sometimes while we were reading from the drive to | ||
| 1568 | * recalculate parity, enough new bios come in to create | ||
| 1569 | * a full stripe. So we do a check here to see if we can | ||
| 1570 | * go directly to finish_rmw | ||
| 1571 | */ | ||
| 1572 | static int __raid56_parity_write(struct btrfs_raid_bio *rbio) | ||
| 1573 | { | ||
| 1574 | /* head off into rmw land if we don't have a full stripe */ | ||
| 1575 | if (!rbio_is_full(rbio)) | ||
| 1576 | return partial_stripe_write(rbio); | ||
| 1577 | return full_stripe_write(rbio); | ||
| 1578 | } | ||
| 1579 | |||
| 1580 | /* | ||
| 1581 | * We use plugging call backs to collect full stripes. | ||
| 1582 | * Any time we get a partial stripe write while plugged | ||
| 1583 | * we collect it into a list. When the unplug comes down, | ||
| 1584 | * we sort the list by logical block number and merge | ||
| 1585 | * everything we can into the same rbios | ||
| 1586 | */ | ||
| 1587 | struct btrfs_plug_cb { | ||
| 1588 | struct blk_plug_cb cb; | ||
| 1589 | struct btrfs_fs_info *info; | ||
| 1590 | struct list_head rbio_list; | ||
| 1591 | struct btrfs_work work; | ||
| 1592 | }; | ||
| 1593 | |||
| 1594 | /* | ||
| 1595 | * rbios on the plug list are sorted for easier merging. | ||
| 1596 | */ | ||
| 1597 | static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
| 1598 | { | ||
| 1599 | struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, | ||
| 1600 | plug_list); | ||
| 1601 | struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, | ||
| 1602 | plug_list); | ||
| 1603 | u64 a_sector = ra->bio_list.head->bi_sector; | ||
| 1604 | u64 b_sector = rb->bio_list.head->bi_sector; | ||
| 1605 | |||
| 1606 | if (a_sector < b_sector) | ||
| 1607 | return -1; | ||
| 1608 | if (a_sector > b_sector) | ||
| 1609 | return 1; | ||
| 1610 | return 0; | ||
| 1611 | } | ||
| 1612 | |||
| 1613 | static void run_plug(struct btrfs_plug_cb *plug) | ||
| 1614 | { | ||
| 1615 | struct btrfs_raid_bio *cur; | ||
| 1616 | struct btrfs_raid_bio *last = NULL; | ||
| 1617 | |||
| 1618 | /* | ||
| 1619 | * sort our plug list then try to merge | ||
| 1620 | * everything we can in hopes of creating full | ||
| 1621 | * stripes. | ||
| 1622 | */ | ||
| 1623 | list_sort(NULL, &plug->rbio_list, plug_cmp); | ||
| 1624 | while (!list_empty(&plug->rbio_list)) { | ||
| 1625 | cur = list_entry(plug->rbio_list.next, | ||
| 1626 | struct btrfs_raid_bio, plug_list); | ||
| 1627 | list_del_init(&cur->plug_list); | ||
| 1628 | |||
| 1629 | if (rbio_is_full(cur)) { | ||
| 1630 | /* we have a full stripe, send it down */ | ||
| 1631 | full_stripe_write(cur); | ||
| 1632 | continue; | ||
| 1633 | } | ||
| 1634 | if (last) { | ||
| 1635 | if (rbio_can_merge(last, cur)) { | ||
| 1636 | merge_rbio(last, cur); | ||
| 1637 | __free_raid_bio(cur); | ||
| 1638 | continue; | ||
| 1639 | |||
| 1640 | } | ||
| 1641 | __raid56_parity_write(last); | ||
| 1642 | } | ||
| 1643 | last = cur; | ||
| 1644 | } | ||
| 1645 | if (last) { | ||
| 1646 | __raid56_parity_write(last); | ||
| 1647 | } | ||
| 1648 | kfree(plug); | ||
| 1649 | } | ||
| 1650 | |||
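run_plug() leans on the sort to make merging a single linear pass: once rbios are ordered by starting sector, only neighbours can possibly merge, so one trailing "last" cursor is enough. The same sort-then-scan pattern on a plain array, with qsort standing in for list_sort and a toy adjacency test standing in for rbio_can_merge():

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

struct req { uint64_t start, len; };

static int req_cmp(const void *a, const void *b)
{
	const struct req *ra = a, *rb = b;

	if (ra->start < rb->start)
		return -1;
	return ra->start > rb->start;
}

int main(void)
{
	struct req reqs[] = { {200, 50}, {0, 100}, {100, 100} };
	int i, merged = 0;

	qsort(reqs, 3, sizeof(reqs[0]), req_cmp);
	/* one pass: in sorted order only neighbours can be adjacent */
	for (i = 1; i < 3; i++)
		if (reqs[i - 1].start + reqs[i - 1].len == reqs[i].start)
			merged++;
	assert(merged == 2);	/* 0..100, 100..200, 200..250 chain up */
	return 0;
}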
| 1651 | /* | ||
| 1652 | * if the unplug comes from schedule, we have to push the | ||
| 1653 | * work off to a helper thread | ||
| 1654 | */ | ||
| 1655 | static void unplug_work(struct btrfs_work *work) | ||
| 1656 | { | ||
| 1657 | struct btrfs_plug_cb *plug; | ||
| 1658 | plug = container_of(work, struct btrfs_plug_cb, work); | ||
| 1659 | run_plug(plug); | ||
| 1660 | } | ||
| 1661 | |||
| 1662 | static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) | ||
| 1663 | { | ||
| 1664 | struct btrfs_plug_cb *plug; | ||
| 1665 | plug = container_of(cb, struct btrfs_plug_cb, cb); | ||
| 1666 | |||
| 1667 | if (from_schedule) { | ||
| 1668 | plug->work.flags = 0; | ||
| 1669 | plug->work.func = unplug_work; | ||
| 1670 | btrfs_queue_worker(&plug->info->rmw_workers, | ||
| 1671 | &plug->work); | ||
| 1672 | return; | ||
| 1673 | } | ||
| 1674 | run_plug(plug); | ||
| 1675 | } | ||
| 1676 | |||
| 1677 | /* | ||
| 1678 | * our main entry point for writes from the rest of the FS. | ||
| 1679 | */ | ||
| 1680 | int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | ||
| 1681 | struct btrfs_bio *bbio, u64 *raid_map, | ||
| 1682 | u64 stripe_len) | ||
| 1683 | { | ||
| 1684 | struct btrfs_raid_bio *rbio; | ||
| 1685 | struct btrfs_plug_cb *plug = NULL; | ||
| 1686 | struct blk_plug_cb *cb; | ||
| 1687 | |||
| 1688 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | ||
| 1689 | if (IS_ERR(rbio)) { | ||
| 1690 | kfree(raid_map); | ||
| 1691 | kfree(bbio); | ||
| 1692 | return PTR_ERR(rbio); | ||
| 1693 | } | ||
| 1694 | bio_list_add(&rbio->bio_list, bio); | ||
| 1695 | rbio->bio_list_bytes = bio->bi_size; | ||
| 1696 | |||
| 1697 | /* | ||
| 1698 | * don't plug on full rbios, just get them out the door | ||
| 1699 | * as quickly as we can | ||
| 1700 | */ | ||
| 1701 | if (rbio_is_full(rbio)) | ||
| 1702 | return full_stripe_write(rbio); | ||
| 1703 | |||
| 1704 | cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, | ||
| 1705 | sizeof(*plug)); | ||
| 1706 | if (cb) { | ||
| 1707 | plug = container_of(cb, struct btrfs_plug_cb, cb); | ||
| 1708 | if (!plug->info) { | ||
| 1709 | plug->info = root->fs_info; | ||
| 1710 | INIT_LIST_HEAD(&plug->rbio_list); | ||
| 1711 | } | ||
| 1712 | list_add_tail(&rbio->plug_list, &plug->rbio_list); | ||
| 1713 | } else { | ||
| 1714 | return __raid56_parity_write(rbio); | ||
| 1715 | } | ||
| 1716 | return 0; | ||
| 1717 | } | ||
| 1718 | |||
| 1719 | /* | ||
| 1720 | * all parity reconstruction happens here. We've read in everything | ||
| 1721 | * we can find from the drives and this does the heavy lifting of | ||
| 1722 | * sorting the good from the bad. | ||
| 1723 | */ | ||
| 1724 | static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | ||
| 1725 | { | ||
| 1726 | int pagenr, stripe; | ||
| 1727 | void **pointers; | ||
| 1728 | int faila = -1, failb = -1; | ||
| 1729 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 1730 | struct page *page; | ||
| 1731 | int err; | ||
| 1732 | int i; | ||
| 1733 | |||
| 1734 | pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), | ||
| 1735 | GFP_NOFS); | ||
| 1736 | if (!pointers) { | ||
| 1737 | err = -ENOMEM; | ||
| 1738 | goto cleanup_io; | ||
| 1739 | } | ||
| 1740 | |||
| 1741 | faila = rbio->faila; | ||
| 1742 | failb = rbio->failb; | ||
| 1743 | |||
| 1744 | if (rbio->read_rebuild) { | ||
| 1745 | spin_lock_irq(&rbio->bio_list_lock); | ||
| 1746 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
| 1747 | spin_unlock_irq(&rbio->bio_list_lock); | ||
| 1748 | } | ||
| 1749 | |||
| 1750 | index_rbio_pages(rbio); | ||
| 1751 | |||
| 1752 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
| 1753 | /* setup our array of pointers with pages | ||
| 1754 | * from each stripe | ||
| 1755 | */ | ||
| 1756 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | ||
| 1757 | /* | ||
| 1758 | * if we're rebuilding a read, we have to use | ||
| 1759 | * pages from the bio list | ||
| 1760 | */ | ||
| 1761 | if (rbio->read_rebuild && | ||
| 1762 | (stripe == faila || stripe == failb)) { | ||
| 1763 | page = page_in_rbio(rbio, stripe, pagenr, 0); | ||
| 1764 | } else { | ||
| 1765 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
| 1766 | } | ||
| 1767 | pointers[stripe] = kmap(page); | ||
| 1768 | } | ||
| 1769 | |||
| 1770 | /* all raid6 handling here */ | ||
| 1771 | if (rbio->raid_map[rbio->bbio->num_stripes - 1] == | ||
| 1772 | RAID6_Q_STRIPE) { | ||
| 1773 | |||
| 1774 | /* | ||
| 1775 | * single failure, rebuild from parity raid5 | ||
| 1776 | * style | ||
| 1777 | */ | ||
| 1778 | if (failb < 0) { | ||
| 1779 | if (faila == rbio->nr_data) { | ||
| 1780 | /* | ||
| 1781 | * Just the P stripe has failed, without | ||
| 1782 | * a bad data or Q stripe. | ||
| 1783 | * TODO, we should redo the xor here. | ||
| 1784 | */ | ||
| 1785 | err = -EIO; | ||
| 1786 | goto cleanup; | ||
| 1787 | } | ||
| 1788 | /* | ||
| 1789 | * a single failure in raid6 is rebuilt | ||
| 1790 | * in the pstripe code below | ||
| 1791 | */ | ||
| 1792 | goto pstripe; | ||
| 1793 | } | ||
| 1794 | |||
| 1795 | /* make sure our ps and qs are in order */ | ||
| 1796 | if (faila > failb) { | ||
| 1797 | int tmp = failb; | ||
| 1798 | failb = faila; | ||
| 1799 | faila = tmp; | ||
| 1800 | } | ||
| 1801 | |||
| 1802 | /* if the Q stripe failed, do a P stripe reconstruction | ||
| 1803 | * from the xors. | ||
| 1804 | * If both the Q stripe and the P stripe failed, we're | ||
| 1805 | * here due to a crc mismatch and we can't give them the | ||
| 1806 | * data they want | ||
| 1807 | */ | ||
| 1808 | if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { | ||
| 1809 | if (rbio->raid_map[faila] == RAID5_P_STRIPE) { | ||
| 1810 | err = -EIO; | ||
| 1811 | goto cleanup; | ||
| 1812 | } | ||
| 1813 | /* | ||
| 1814 | * otherwise we have one bad data stripe and | ||
| 1815 | * a good P stripe. raid5! | ||
| 1816 | */ | ||
| 1817 | goto pstripe; | ||
| 1818 | } | ||
| 1819 | |||
| 1820 | if (rbio->raid_map[failb] == RAID5_P_STRIPE) { | ||
| 1821 | raid6_datap_recov(rbio->bbio->num_stripes, | ||
| 1822 | PAGE_SIZE, faila, pointers); | ||
| 1823 | } else { | ||
| 1824 | raid6_2data_recov(rbio->bbio->num_stripes, | ||
| 1825 | PAGE_SIZE, faila, failb, | ||
| 1826 | pointers); | ||
| 1827 | } | ||
| 1828 | } else { | ||
| 1829 | void *p; | ||
| 1830 | |||
| 1831 | /* rebuild from P stripe here (raid5 or raid6) */ | ||
| 1832 | BUG_ON(failb != -1); | ||
| 1833 | pstripe: | ||
| 1834 | /* Copy parity block into failed block to start with */ | ||
| 1835 | memcpy(pointers[faila], | ||
| 1836 | pointers[rbio->nr_data], | ||
| 1837 | PAGE_CACHE_SIZE); | ||
| 1838 | |||
| 1839 | /* rearrange the pointer array */ | ||
| 1840 | p = pointers[faila]; | ||
| 1841 | for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) | ||
| 1842 | pointers[stripe] = pointers[stripe + 1]; | ||
| 1843 | pointers[rbio->nr_data - 1] = p; | ||
| 1844 | |||
| 1845 | /* xor in the rest */ | ||
| 1846 | run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); | ||
| 1847 | } | ||
| 1848 | /* if we're doing this rebuild as part of an rmw, go through | ||
| 1849 | * and set all of our private rbio pages in the | ||
| 1850 | * failed stripes as uptodate. This way finish_rmw will | ||
| 1851 | * know they can be trusted. If this was a read reconstruction, | ||
| 1852 | * other endio functions will fiddle the uptodate bits | ||
| 1853 | */ | ||
| 1854 | if (!rbio->read_rebuild) { | ||
| 1855 | for (i = 0; i < nr_pages; i++) { | ||
| 1856 | if (faila != -1) { | ||
| 1857 | page = rbio_stripe_page(rbio, faila, i); | ||
| 1858 | SetPageUptodate(page); | ||
| 1859 | } | ||
| 1860 | if (failb != -1) { | ||
| 1861 | page = rbio_stripe_page(rbio, failb, i); | ||
| 1862 | SetPageUptodate(page); | ||
| 1863 | } | ||
| 1864 | } | ||
| 1865 | } | ||
| 1866 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | ||
| 1867 | /* | ||
| 1868 | * if we're rebuilding a read, we have to use | ||
| 1869 | * pages from the bio list | ||
| 1870 | */ | ||
| 1871 | if (rbio->read_rebuild && | ||
| 1872 | (stripe == faila || stripe == failb)) { | ||
| 1873 | page = page_in_rbio(rbio, stripe, pagenr, 0); | ||
| 1874 | } else { | ||
| 1875 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
| 1876 | } | ||
| 1877 | kunmap(page); | ||
| 1878 | } | ||
| 1879 | } | ||
| 1880 | |||
| 1881 | err = 0; | ||
| 1882 | cleanup: | ||
| 1883 | kfree(pointers); | ||
| 1884 | |||
| 1885 | cleanup_io: | ||
| 1886 | |||
| 1887 | if (rbio->read_rebuild) { | ||
| 1888 | if (err == 0) | ||
| 1889 | cache_rbio_pages(rbio); | ||
| 1890 | else | ||
| 1891 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
| 1892 | |||
| 1893 | rbio_orig_end_io(rbio, err, err == 0); | ||
| 1894 | } else if (err == 0) { | ||
| 1895 | rbio->faila = -1; | ||
| 1896 | rbio->failb = -1; | ||
| 1897 | finish_rmw(rbio); | ||
| 1898 | } else { | ||
| 1899 | rbio_orig_end_io(rbio, err, 0); | ||
| 1900 | } | ||
| 1901 | } | ||
| 1902 | |||
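The subtlest part of the function above is the pstripe path: parity is copied into the failed stripe's page, the pointer array is rotated so that page lands in the slot run_xor() treats as the destination, and XOR-ing the surviving data stripes into it turns P back into the missing data. A user-space rendering of those three steps with small buffers (run_xor_sketch assumes the destination-is-last convention the callers here appear to use; sizes and names are illustrative):

#include <assert.h>
#include <string.h>

#define SZ 16
#define NR_DATA 3	/* three data stripes + one parity, RAID5-style */

/* dest = ptrs[src_cnt]; XOR the first src_cnt buffers into it */
static void run_xor_sketch(unsigned char **ptrs, int src_cnt, size_t len)
{
	int s;
	size_t i;

	for (s = 0; s < src_cnt; s++)
		for (i = 0; i < len; i++)
			ptrs[src_cnt][i] ^= ptrs[s][i];
}

int main(void)
{
	unsigned char d0[SZ], d1[SZ], d2[SZ], p[SZ], want[SZ];
	unsigned char *ptrs[NR_DATA + 1] = { d0, d1, d2, p };
	unsigned char *hold;
	int faila = 1, stripe;
	size_t i;

	memset(d0, 0xa5, SZ);
	memset(d1, 0x3c, SZ);
	memset(d2, 0x71, SZ);
	memcpy(want, d1, SZ);
	for (i = 0; i < SZ; i++)
		p[i] = d0[i] ^ d1[i] ^ d2[i];
	memset(d1, 0, SZ);	/* "lose" stripe 1 */

	/* the three steps from the pstripe path above */
	memcpy(ptrs[faila], ptrs[NR_DATA], SZ);		/* copy P in */
	hold = ptrs[faila];				/* rotate pointers */
	for (stripe = faila; stripe < NR_DATA - 1; stripe++)
		ptrs[stripe] = ptrs[stripe + 1];
	ptrs[NR_DATA - 1] = hold;
	run_xor_sketch(ptrs, NR_DATA - 1, SZ);		/* P ^ survivors */

	assert(memcmp(d1, want, SZ) == 0);		/* d1 recovered */
	return 0;
}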
| 1903 | /* | ||
| 1904 | * This is called only for stripes we've read from disk to | ||
| 1905 | * reconstruct the parity. | ||
| 1906 | */ | ||
| 1907 | static void raid_recover_end_io(struct bio *bio, int err) | ||
| 1908 | { | ||
| 1909 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
| 1910 | |||
| 1911 | /* | ||
| 1912 | * we only read stripe pages off the disk, set them | ||
| 1913 | * up to date if there were no errors | ||
| 1914 | */ | ||
| 1915 | if (err) | ||
| 1916 | fail_bio_stripe(rbio, bio); | ||
| 1917 | else | ||
| 1918 | set_bio_pages_uptodate(bio); | ||
| 1919 | bio_put(bio); | ||
| 1920 | |||
| 1921 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
| 1922 | return; | ||
| 1923 | |||
| 1924 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
| 1925 | rbio_orig_end_io(rbio, -EIO, 0); | ||
| 1926 | else | ||
| 1927 | __raid_recover_end_io(rbio); | ||
| 1928 | } | ||
| 1929 | |||
| 1930 | /* | ||
| 1931 | * reads everything we need off the disk to reconstruct | ||
| 1932 | * the parity. endio handlers trigger final reconstruction | ||
| 1933 | * when the IO is done. | ||
| 1934 | * | ||
| 1935 | * This is used both for reads from the higher layers and for | ||
| 1936 | * parity construction required to finish an rmw cycle. | ||
| 1937 | */ | ||
| 1938 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | ||
| 1939 | { | ||
| 1940 | int bios_to_read = 0; | ||
| 1941 | struct btrfs_bio *bbio = rbio->bbio; | ||
| 1942 | struct bio_list bio_list; | ||
| 1943 | int ret; | ||
| 1944 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 1945 | int pagenr; | ||
| 1946 | int stripe; | ||
| 1947 | struct bio *bio; | ||
| 1948 | |||
| 1949 | bio_list_init(&bio_list); | ||
| 1950 | |||
| 1951 | ret = alloc_rbio_pages(rbio); | ||
| 1952 | if (ret) | ||
| 1953 | goto cleanup; | ||
| 1954 | |||
| 1955 | atomic_set(&rbio->bbio->error, 0); | ||
| 1956 | |||
| 1957 | /* | ||
| 1958 | * read everything that hasn't failed. Thanks to the | ||
| 1959 | * stripe cache, it is possible that some or all of these | ||
| 1960 | * pages are going to be uptodate. | ||
| 1961 | */ | ||
| 1962 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | ||
| 1963 | if (rbio->faila == stripe || | ||
| 1964 | rbio->failb == stripe) | ||
| 1965 | continue; | ||
| 1966 | |||
| 1967 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
| 1968 | struct page *p; | ||
| 1969 | |||
| 1970 | /* | ||
| 1971 | * the rmw code may have already read this | ||
| 1972 | * page in | ||
| 1973 | */ | ||
| 1974 | p = rbio_stripe_page(rbio, stripe, pagenr); | ||
| 1975 | if (PageUptodate(p)) | ||
| 1976 | continue; | ||
| 1977 | |||
| 1978 | ret = rbio_add_io_page(rbio, &bio_list, | ||
| 1979 | rbio_stripe_page(rbio, stripe, pagenr), | ||
| 1980 | stripe, pagenr, rbio->stripe_len); | ||
| 1981 | if (ret < 0) | ||
| 1982 | goto cleanup; | ||
| 1983 | } | ||
| 1984 | } | ||
| 1985 | |||
| 1986 | bios_to_read = bio_list_size(&bio_list); | ||
| 1987 | if (!bios_to_read) { | ||
| 1988 | /* | ||
| 1989 | * we might have no bios to read just because the pages | ||
| 1990 | * were up to date, or we might have no bios to read because | ||
| 1991 | * the devices were gone. | ||
| 1992 | */ | ||
| 1993 | if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { | ||
| 1994 | __raid_recover_end_io(rbio); | ||
| 1995 | goto out; | ||
| 1996 | } else { | ||
| 1997 | goto cleanup; | ||
| 1998 | } | ||
| 1999 | } | ||
| 2000 | |||
| 2001 | /* | ||
| 2002 | * the bbio may be freed once we submit the last bio. Make sure | ||
| 2003 | * not to touch it after that | ||
| 2004 | */ | ||
| 2005 | atomic_set(&bbio->stripes_pending, bios_to_read); | ||
| 2006 | while (1) { | ||
| 2007 | bio = bio_list_pop(&bio_list); | ||
| 2008 | if (!bio) | ||
| 2009 | break; | ||
| 2010 | |||
| 2011 | bio->bi_private = rbio; | ||
| 2012 | bio->bi_end_io = raid_recover_end_io; | ||
| 2013 | |||
| 2014 | btrfs_bio_wq_end_io(rbio->fs_info, bio, | ||
| 2015 | BTRFS_WQ_ENDIO_RAID56); | ||
| 2016 | |||
| 2017 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
| 2018 | submit_bio(READ, bio); | ||
| 2019 | } | ||
| 2020 | out: | ||
| 2021 | return 0; | ||
| 2022 | |||
| 2023 | cleanup: | ||
| 2024 | if (rbio->read_rebuild) | ||
| 2025 | rbio_orig_end_io(rbio, -EIO, 0); | ||
| 2026 | return -EIO; | ||
| 2027 | } | ||
| 2028 | |||
| 2029 | /* | ||
| 2030 | * the main entry point for reads from the higher layers. This | ||
| 2031 | * is really only called when the normal read path had a failure, | ||
| 2032 | * so we assume the bio they send down corresponds to a failed part | ||
| 2033 | * of the drive. | ||
| 2034 | */ | ||
| 2035 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | ||
| 2036 | struct btrfs_bio *bbio, u64 *raid_map, | ||
| 2037 | u64 stripe_len, int mirror_num) | ||
| 2038 | { | ||
| 2039 | struct btrfs_raid_bio *rbio; | ||
| 2040 | int ret; | ||
| 2041 | |||
| 2042 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | ||
| 2043 | if (IS_ERR(rbio)) { | ||
| 2044 | return PTR_ERR(rbio); | ||
| 2045 | } | ||
| 2046 | |||
| 2047 | rbio->read_rebuild = 1; | ||
| 2048 | bio_list_add(&rbio->bio_list, bio); | ||
| 2049 | rbio->bio_list_bytes = bio->bi_size; | ||
| 2050 | |||
| 2051 | rbio->faila = find_logical_bio_stripe(rbio, bio); | ||
| 2052 | if (rbio->faila == -1) { | ||
| 2053 | BUG(); | ||
| 2054 | kfree(rbio); | ||
| 2055 | return -EIO; | ||
| 2056 | } | ||
| 2057 | |||
| 2058 | /* | ||
| 2059 | * reconstruct from the q stripe if they are | ||
| 2060 | * asking for mirror 3 | ||
| 2061 | */ | ||
| 2062 | if (mirror_num == 3) | ||
| 2063 | rbio->failb = bbio->num_stripes - 2; | ||
| 2064 | |||
| 2065 | ret = lock_stripe_add(rbio); | ||
| 2066 | |||
| 2067 | /* | ||
| 2068 | * __raid56_parity_recover will end the bio with | ||
| 2069 | * any errors it hits. We don't want to return | ||
| 2070 | * its error value up the stack because our caller | ||
| 2071 | * will end up calling bio_endio with any nonzero | ||
| 2072 | * return | ||
| 2073 | */ | ||
| 2074 | if (ret == 0) | ||
| 2075 | __raid56_parity_recover(rbio); | ||
| 2076 | /* | ||
| 2077 | * our rbio has been added to the list of | ||
| 2078 | * rbios that will be handled after the | ||
| 2079 | * current lock owner is done | ||
| 2080 | */ | ||
| 2081 | return 0; | ||
| 2082 | |||
| 2083 | } | ||
| 2084 | |||
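The mirror_num == 3 case above deliberately marks the P stripe (num_stripes - 2) as failed too, so reconstruction must come from Q rather than P. A sketch of that convention as this function uses it (the mirror numbering is inferred from this code, not a documented ABI):

#include <assert.h>

int main(void)
{
	int num_stripes = 6;	/* e.g. 4 data + P + Q */
	int faila = 2;		/* the data stripe the bio mapped to */
	int failb = -1;
	int mirror_num = 3;

	if (mirror_num == 3)
		failb = num_stripes - 2;	/* force P "failed": use Q */

	assert(failb == 4);
	assert(faila != failb);
	return 0;
}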
| 2085 | static void rmw_work(struct btrfs_work *work) | ||
| 2086 | { | ||
| 2087 | struct btrfs_raid_bio *rbio; | ||
| 2088 | |||
| 2089 | rbio = container_of(work, struct btrfs_raid_bio, work); | ||
| 2090 | raid56_rmw_stripe(rbio); | ||
| 2091 | } | ||
| 2092 | |||
| 2093 | static void read_rebuild_work(struct btrfs_work *work) | ||
| 2094 | { | ||
| 2095 | struct btrfs_raid_bio *rbio; | ||
| 2096 | |||
| 2097 | rbio = container_of(work, struct btrfs_raid_bio, work); | ||
| 2098 | __raid56_parity_recover(rbio); | ||
| 2099 | } | ||
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h new file mode 100644 index 000000000000..ea5d73bfdfbe --- /dev/null +++ b/fs/btrfs/raid56.h | |||
| @@ -0,0 +1,51 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Fusion-io All rights reserved. | ||
| 3 | * Copyright (C) 2012 Intel Corp. All rights reserved. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU General Public | ||
| 7 | * License v2 as published by the Free Software Foundation. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 12 | * General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public | ||
| 15 | * License along with this program; if not, write to the | ||
| 16 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 17 | * Boston, MA 021110-1307, USA. | ||
| 18 | */ | ||
| 19 | |||
| 20 | #ifndef __BTRFS_RAID56__ | ||
| 21 | #define __BTRFS_RAID56__ | ||
| 22 | static inline int nr_parity_stripes(struct map_lookup *map) | ||
| 23 | { | ||
| 24 | if (map->type & BTRFS_BLOCK_GROUP_RAID5) | ||
| 25 | return 1; | ||
| 26 | else if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
| 27 | return 2; | ||
| 28 | else | ||
| 29 | return 0; | ||
| 30 | } | ||
| 31 | |||
| 32 | static inline int nr_data_stripes(struct map_lookup *map) | ||
| 33 | { | ||
| 34 | return map->num_stripes - nr_parity_stripes(map); | ||
| 35 | } | ||
| 36 | #define RAID5_P_STRIPE ((u64)-2) | ||
| 37 | #define RAID6_Q_STRIPE ((u64)-1) | ||
| 38 | |||
| 39 | #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ | ||
| 40 | ((x) == RAID6_Q_STRIPE)) | ||
| 41 | |||
| 42 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | ||
| 43 | struct btrfs_bio *bbio, u64 *raid_map, | ||
| 44 | u64 stripe_len, int mirror_num); | ||
| 45 | int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | ||
| 46 | struct btrfs_bio *bbio, u64 *raid_map, | ||
| 47 | u64 stripe_len); | ||
| 48 | |||
| 49 | int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); | ||
| 50 | void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); | ||
| 51 | #endif | ||
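For the raid56.h helpers just added, a quick usage illustration: nr_parity_stripes() keys off the block-group type bits and nr_data_stripes() subtracts it from num_stripes. The sketch re-implements them over a stand-in map with made-up flag values (the real BTRFS_BLOCK_GROUP_* bits live in ctree.h):

#include <assert.h>
#include <stdint.h>

/* stand-in flag bits; the real BTRFS_BLOCK_GROUP_* values differ */
#define BG_RAID5 (1ULL << 7)
#define BG_RAID6 (1ULL << 8)

struct map_lookup_sketch {
	uint64_t type;
	int num_stripes;
};

static int nr_parity(const struct map_lookup_sketch *map)
{
	if (map->type & BG_RAID5)
		return 1;
	if (map->type & BG_RAID6)
		return 2;
	return 0;
}

int main(void)
{
	struct map_lookup_sketch r6 = { BG_RAID6, 6 };

	assert(nr_parity(&r6) == 2);
	assert(r6.num_stripes - nr_parity(&r6) == 4);	/* nr_data_stripes */
	return 0;
}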
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 17c306bf177a..50695dc5e2ab 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
| @@ -3017,7 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode, | |||
| 3017 | } | 3017 | } |
| 3018 | } | 3018 | } |
| 3019 | 3019 | ||
| 3020 | page_start = (u64)page->index << PAGE_CACHE_SHIFT; | 3020 | page_start = page_offset(page); |
| 3021 | page_end = page_start + PAGE_CACHE_SIZE - 1; | 3021 | page_end = page_start + PAGE_CACHE_SIZE - 1; |
| 3022 | 3022 | ||
| 3023 | lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); | 3023 | lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); |
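The relocation.c hunk is a pure cleanup: page_offset(page) is defined in pagemap.h as ((loff_t)page->index << PAGE_CACHE_SHIFT), so the helper and the open-coded shift yield the same byte offset. Numerically, with an assumed 4K page size:

#include <assert.h>
#include <stdint.h>

#define PG_SHIFT 12	/* assuming 4K pages */

int main(void)
{
	uint64_t index = 300;	/* hypothetical page->index */

	assert((index << PG_SHIFT) == 300 * 4096);	/* 1228800 bytes */
	return 0;
}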
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 67783e03d121..53c3501fa4ca 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include "dev-replace.h" | 28 | #include "dev-replace.h" |
| 29 | #include "check-integrity.h" | 29 | #include "check-integrity.h" |
| 30 | #include "rcu-string.h" | 30 | #include "rcu-string.h" |
| 31 | #include "raid56.h" | ||
| 31 | 32 | ||
| 32 | /* | 33 | /* |
| 33 | * This is only the first step towards a full-features scrub. It reads all | 34 | * This is only the first step towards a full-features scrub. It reads all |
| @@ -2254,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
| 2254 | struct btrfs_device *extent_dev; | 2255 | struct btrfs_device *extent_dev; |
| 2255 | int extent_mirror_num; | 2256 | int extent_mirror_num; |
| 2256 | 2257 | ||
| 2258 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
| 2259 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
| 2260 | if (num >= nr_data_stripes(map)) { | ||
| 2261 | return 0; | ||
| 2262 | } | ||
| 2263 | } | ||
| 2264 | |||
| 2257 | nstripes = length; | 2265 | nstripes = length; |
| 2258 | offset = 0; | 2266 | offset = 0; |
| 2259 | do_div(nstripes, map->stripe_len); | 2267 | do_div(nstripes, map->stripe_len); |
| @@ -2708,7 +2716,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, | |||
| 2708 | int ret; | 2716 | int ret; |
| 2709 | struct btrfs_root *root = sctx->dev_root; | 2717 | struct btrfs_root *root = sctx->dev_root; |
| 2710 | 2718 | ||
| 2711 | if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) | 2719 | if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) |
| 2712 | return -EIO; | 2720 | return -EIO; |
| 2713 | 2721 | ||
| 2714 | gen = root->fs_info->last_trans_committed; | 2722 | gen = root->fs_info->last_trans_committed; |
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f4ab7a9260eb..f7a8b861058b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c | |||
| @@ -85,6 +85,7 @@ struct send_ctx { | |||
| 85 | u32 send_max_size; | 85 | u32 send_max_size; |
| 86 | u64 total_send_size; | 86 | u64 total_send_size; |
| 87 | u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; | 87 | u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; |
| 88 | u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ | ||
| 88 | 89 | ||
| 89 | struct vfsmount *mnt; | 90 | struct vfsmount *mnt; |
| 90 | 91 | ||
| @@ -3709,6 +3710,39 @@ out: | |||
| 3709 | return ret; | 3710 | return ret; |
| 3710 | } | 3711 | } |
| 3711 | 3712 | ||
| 3713 | /* | ||
| 3714 | * Send an update extent command to user space. | ||
| 3715 | */ | ||
| 3716 | static int send_update_extent(struct send_ctx *sctx, | ||
| 3717 | u64 offset, u32 len) | ||
| 3718 | { | ||
| 3719 | int ret = 0; | ||
| 3720 | struct fs_path *p; | ||
| 3721 | |||
| 3722 | p = fs_path_alloc(sctx); | ||
| 3723 | if (!p) | ||
| 3724 | return -ENOMEM; | ||
| 3725 | |||
| 3726 | ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); | ||
| 3727 | if (ret < 0) | ||
| 3728 | goto out; | ||
| 3729 | |||
| 3730 | ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); | ||
| 3731 | if (ret < 0) | ||
| 3732 | goto out; | ||
| 3733 | |||
| 3734 | TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); | ||
| 3735 | TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); | ||
| 3736 | TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); | ||
| 3737 | |||
| 3738 | ret = send_cmd(sctx); | ||
| 3739 | |||
| 3740 | tlv_put_failure: | ||
| 3741 | out: | ||
| 3742 | fs_path_free(sctx, p); | ||
| 3743 | return ret; | ||
| 3744 | } | ||
| 3745 | |||
| 3712 | static int send_write_or_clone(struct send_ctx *sctx, | 3746 | static int send_write_or_clone(struct send_ctx *sctx, |
| 3713 | struct btrfs_path *path, | 3747 | struct btrfs_path *path, |
| 3714 | struct btrfs_key *key, | 3748 | struct btrfs_key *key, |
| @@ -3744,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx, | |||
| 3744 | goto out; | 3778 | goto out; |
| 3745 | } | 3779 | } |
| 3746 | 3780 | ||
| 3747 | if (!clone_root) { | 3781 | if (clone_root) { |
| 3782 | ret = send_clone(sctx, offset, len, clone_root); | ||
| 3783 | } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { | ||
| 3784 | ret = send_update_extent(sctx, offset, len); | ||
| 3785 | } else { | ||
| 3748 | while (pos < len) { | 3786 | while (pos < len) { |
| 3749 | l = len - pos; | 3787 | l = len - pos; |
| 3750 | if (l > BTRFS_SEND_READ_SIZE) | 3788 | if (l > BTRFS_SEND_READ_SIZE) |
| @@ -3757,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx, | |||
| 3757 | pos += ret; | 3795 | pos += ret; |
| 3758 | } | 3796 | } |
| 3759 | ret = 0; | 3797 | ret = 0; |
| 3760 | } else { | ||
| 3761 | ret = send_clone(sctx, offset, len, clone_root); | ||
| 3762 | } | 3798 | } |
| 3763 | |||
| 3764 | out: | 3799 | out: |
| 3765 | return ret; | 3800 | return ret; |
| 3766 | } | 3801 | } |
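With the new BTRFS_SEND_FLAG_NO_FILE_DATA, send emits BTRFS_SEND_C_UPDATE_EXTENT records (path, offset, length) in place of file data, and the ioctl entry point below rejects any other flag bit. A hedged user-space sketch of requesting such a stream; the struct and ioctl names follow the uapi header this series introduces, but the field spellings should be verified against the installed linux/btrfs.h:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>	/* the header this series moves the ABI into */

int main(int argc, char **argv)
{
	struct btrfs_ioctl_send_args args;
	int root_fd;

	if (argc < 2)
		return 1;
	root_fd = open(argv[1], O_RDONLY);	/* a read-only subvolume */
	if (root_fd < 0)
		return 1;

	memset(&args, 0, sizeof(args));
	args.send_fd = 1;	/* stream to stdout for the sketch */
	args.flags = BTRFS_SEND_FLAG_NO_FILE_DATA;	/* metadata-only stream */

	if (ioctl(root_fd, BTRFS_IOC_SEND, &args) < 0)
		perror("BTRFS_IOC_SEND");
	return 0;
}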
| @@ -4536,7 +4571,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
| 4536 | struct btrfs_fs_info *fs_info; | 4571 | struct btrfs_fs_info *fs_info; |
| 4537 | struct btrfs_ioctl_send_args *arg = NULL; | 4572 | struct btrfs_ioctl_send_args *arg = NULL; |
| 4538 | struct btrfs_key key; | 4573 | struct btrfs_key key; |
| 4539 | struct file *filp = NULL; | ||
| 4540 | struct send_ctx *sctx = NULL; | 4574 | struct send_ctx *sctx = NULL; |
| 4541 | u32 i; | 4575 | u32 i; |
| 4542 | u64 *clone_sources_tmp = NULL; | 4576 | u64 *clone_sources_tmp = NULL; |
| @@ -4561,6 +4595,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
| 4561 | goto out; | 4595 | goto out; |
| 4562 | } | 4596 | } |
| 4563 | 4597 | ||
| 4598 | if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) { | ||
| 4599 | ret = -EINVAL; | ||
| 4600 | goto out; | ||
| 4601 | } | ||
| 4602 | |||
| 4564 | sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); | 4603 | sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); |
| 4565 | if (!sctx) { | 4604 | if (!sctx) { |
| 4566 | ret = -ENOMEM; | 4605 | ret = -ENOMEM; |
| @@ -4572,6 +4611,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
| 4572 | INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); | 4611 | INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); |
| 4573 | INIT_LIST_HEAD(&sctx->name_cache_list); | 4612 | INIT_LIST_HEAD(&sctx->name_cache_list); |
| 4574 | 4613 | ||
| 4614 | sctx->flags = arg->flags; | ||
| 4615 | |||
| 4575 | sctx->send_filp = fget(arg->send_fd); | 4616 | sctx->send_filp = fget(arg->send_fd); |
| 4576 | if (IS_ERR(sctx->send_filp)) { | 4617 | if (IS_ERR(sctx->send_filp)) { |
| 4577 | ret = PTR_ERR(sctx->send_filp); | 4618 | ret = PTR_ERR(sctx->send_filp); |
| @@ -4673,8 +4714,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
| 4673 | goto out; | 4714 | goto out; |
| 4674 | 4715 | ||
| 4675 | out: | 4716 | out: |
| 4676 | if (filp) | ||
| 4677 | fput(filp); | ||
| 4678 | kfree(arg); | 4717 | kfree(arg); |
| 4679 | vfree(clone_sources_tmp); | 4718 | vfree(clone_sources_tmp); |
| 4680 | 4719 | ||
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 1bf4f32fd4ef..8bb18f7ccaa6 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h | |||
| @@ -86,6 +86,7 @@ enum btrfs_send_cmd { | |||
| 86 | BTRFS_SEND_C_UTIMES, | 86 | BTRFS_SEND_C_UTIMES, |
| 87 | 87 | ||
| 88 | BTRFS_SEND_C_END, | 88 | BTRFS_SEND_C_END, |
| 89 | BTRFS_SEND_C_UPDATE_EXTENT, | ||
| 89 | __BTRFS_SEND_C_MAX, | 90 | __BTRFS_SEND_C_MAX, |
| 90 | }; | 91 | }; |
| 91 | #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) | 92 | #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) |
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d8982e9601d3..68a29a1ea068 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
| @@ -41,13 +41,13 @@ | |||
| 41 | #include <linux/slab.h> | 41 | #include <linux/slab.h> |
| 42 | #include <linux/cleancache.h> | 42 | #include <linux/cleancache.h> |
| 43 | #include <linux/ratelimit.h> | 43 | #include <linux/ratelimit.h> |
| 44 | #include <linux/btrfs.h> | ||
| 44 | #include "compat.h" | 45 | #include "compat.h" |
| 45 | #include "delayed-inode.h" | 46 | #include "delayed-inode.h" |
| 46 | #include "ctree.h" | 47 | #include "ctree.h" |
| 47 | #include "disk-io.h" | 48 | #include "disk-io.h" |
| 48 | #include "transaction.h" | 49 | #include "transaction.h" |
| 49 | #include "btrfs_inode.h" | 50 | #include "btrfs_inode.h" |
| 50 | #include "ioctl.h" | ||
| 51 | #include "print-tree.h" | 51 | #include "print-tree.h" |
| 52 | #include "xattr.h" | 52 | #include "xattr.h" |
| 53 | #include "volumes.h" | 53 | #include "volumes.h" |
| @@ -63,8 +63,7 @@ | |||
| 63 | static const struct super_operations btrfs_super_ops; | 63 | static const struct super_operations btrfs_super_ops; |
| 64 | static struct file_system_type btrfs_fs_type; | 64 | static struct file_system_type btrfs_fs_type; |
| 65 | 65 | ||
| 66 | static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, | 66 | static const char *btrfs_decode_error(int errno, char nbuf[16]) |
| 67 | char nbuf[16]) | ||
| 68 | { | 67 | { |
| 69 | char *errstr = NULL; | 68 | char *errstr = NULL; |
| 70 | 69 | ||
| @@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info) | |||
| 98 | * today we only save the error info into ram. Long term we'll | 97 | * today we only save the error info into ram. Long term we'll |
| 99 | * also send it down to the disk | 98 | * also send it down to the disk |
| 100 | */ | 99 | */ |
| 101 | fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; | 100 | set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); |
| 102 | } | 101 | } |
| 103 | 102 | ||
| 104 | static void save_error_info(struct btrfs_fs_info *fs_info) | 103 | static void save_error_info(struct btrfs_fs_info *fs_info) |
| @@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) | |||
| 114 | if (sb->s_flags & MS_RDONLY) | 113 | if (sb->s_flags & MS_RDONLY) |
| 115 | return; | 114 | return; |
| 116 | 115 | ||
| 117 | if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 116 | if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { |
| 118 | sb->s_flags |= MS_RDONLY; | 117 | sb->s_flags |= MS_RDONLY; |
| 119 | printk(KERN_INFO "btrfs is forced readonly\n"); | 118 | printk(KERN_INFO "btrfs is forced readonly\n"); |
| 120 | /* | 119 | /* |
| @@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, | |||
| 142 | struct super_block *sb = fs_info->sb; | 141 | struct super_block *sb = fs_info->sb; |
| 143 | char nbuf[16]; | 142 | char nbuf[16]; |
| 144 | const char *errstr; | 143 | const char *errstr; |
| 145 | va_list args; | ||
| 146 | va_start(args, fmt); | ||
| 147 | 144 | ||
| 148 | /* | 145 | /* |
| 149 | * Special case: if the error is EROFS, and we're already | 146 | * Special case: if the error is EROFS, and we're already |
| @@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, | |||
| 152 | if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) | 149 | if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) |
| 153 | return; | 150 | return; |
| 154 | 151 | ||
| 155 | errstr = btrfs_decode_error(fs_info, errno, nbuf); | 152 | errstr = btrfs_decode_error(errno, nbuf); |
| 156 | if (fmt) { | 153 | if (fmt) { |
| 157 | struct va_format vaf = { | 154 | struct va_format vaf; |
| 158 | .fmt = fmt, | 155 | va_list args; |
| 159 | .va = &args, | 156 | |
| 160 | }; | 157 | va_start(args, fmt); |
| 158 | vaf.fmt = fmt; | ||
| 159 | vaf.va = &args; | ||
| 161 | 160 | ||
| 162 | printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", | 161 | printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", |
| 163 | sb->s_id, function, line, errstr, &vaf); | 162 | sb->s_id, function, line, errstr, &vaf); |
| 163 | va_end(args); | ||
| 164 | } else { | 164 | } else { |
| 165 | printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", | 165 | printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", |
| 166 | sb->s_id, function, line, errstr); | 166 | sb->s_id, function, line, errstr); |
| @@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, | |||
| 171 | save_error_info(fs_info); | 171 | save_error_info(fs_info); |
| 172 | btrfs_handle_error(fs_info); | 172 | btrfs_handle_error(fs_info); |
| 173 | } | 173 | } |
| 174 | va_end(args); | ||
| 175 | } | 174 | } |
| 176 | 175 | ||
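The __btrfs_std_error() rework above is a correctness fix, not just tidying: the old code ran va_start() at the top yet could take the EROFS early return without a matching va_end(), and set up the va_list even when fmt was NULL. The fix brackets va_start/va_end tightly around the single use. The portable pattern in user-space terms:

#include <stdarg.h>
#include <stdio.h>

static void report(int errnum, const char *fmt, ...)
{
	if (errnum == 0)
		return;		/* early return: no va_start() yet, so safe */

	if (fmt) {
		va_list args;

		va_start(args, fmt);	/* start/end bracket the only use */
		vfprintf(stderr, fmt, args);
		va_end(args);
	}
	fprintf(stderr, ": error %d\n", errnum);
}

int main(void)
{
	report(0, "ignored");
	report(5, "writing %s", "superblock");
	return 0;
}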
| 177 | static const char * const logtypes[] = { | 176 | static const char * const logtypes[] = { |
| @@ -261,7 +260,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, | |||
| 261 | char nbuf[16]; | 260 | char nbuf[16]; |
| 262 | const char *errstr; | 261 | const char *errstr; |
| 263 | 262 | ||
| 264 | errstr = btrfs_decode_error(root->fs_info, errno, nbuf); | 263 | errstr = btrfs_decode_error(errno, nbuf); |
| 265 | btrfs_printk(root->fs_info, | 264 | btrfs_printk(root->fs_info, |
| 266 | "%s:%d: Aborting unused transaction(%s).\n", | 265 | "%s:%d: Aborting unused transaction(%s).\n", |
| 267 | function, line, errstr); | 266 | function, line, errstr); |
| @@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, | |||
| 289 | va_start(args, fmt); | 288 | va_start(args, fmt); |
| 290 | vaf.va = &args; | 289 | vaf.va = &args; |
| 291 | 290 | ||
| 292 | errstr = btrfs_decode_error(fs_info, errno, nbuf); | 291 | errstr = btrfs_decode_error(errno, nbuf); |
| 293 | if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) | 292 | if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) |
| 294 | panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", | 293 | panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", |
| 295 | s_id, function, line, &vaf, errstr); | 294 | s_id, function, line, &vaf, errstr); |
| 296 | 295 | ||
| @@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
| 438 | case Opt_compress_force: | 437 | case Opt_compress_force: |
| 439 | case Opt_compress_force_type: | 438 | case Opt_compress_force_type: |
| 440 | compress_force = true; | 439 | compress_force = true; |
| 440 | /* Fallthrough */ | ||
| 441 | case Opt_compress: | 441 | case Opt_compress: |
| 442 | case Opt_compress_type: | 442 | case Opt_compress_type: |
| 443 | if (token == Opt_compress || | 443 | if (token == Opt_compress || |
| @@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
| 519 | case Opt_alloc_start: | 519 | case Opt_alloc_start: |
| 520 | num = match_strdup(&args[0]); | 520 | num = match_strdup(&args[0]); |
| 521 | if (num) { | 521 | if (num) { |
| 522 | mutex_lock(&info->chunk_mutex); | ||
| 522 | info->alloc_start = memparse(num, NULL); | 523 | info->alloc_start = memparse(num, NULL); |
| 524 | mutex_unlock(&info->chunk_mutex); | ||
| 523 | kfree(num); | 525 | kfree(num); |
| 524 | printk(KERN_INFO | 526 | printk(KERN_INFO |
| 525 | "btrfs: allocations start at %llu\n", | 527 | "btrfs: allocations start at %llu\n", |
| @@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) | |||
| 876 | 878 | ||
| 877 | btrfs_wait_ordered_extents(root, 0); | 879 | btrfs_wait_ordered_extents(root, 0); |
| 878 | 880 | ||
| 879 | trans = btrfs_attach_transaction(root); | 881 | trans = btrfs_attach_transaction_barrier(root); |
| 880 | if (IS_ERR(trans)) { | 882 | if (IS_ERR(trans)) { |
| 881 | /* no transaction, don't bother */ | 883 | /* no transaction, don't bother */ |
| 882 | if (PTR_ERR(trans) == -ENOENT) | 884 | if (PTR_ERR(trans) == -ENOENT) |
| @@ -1200,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, | |||
| 1200 | new_pool_size); | 1202 | new_pool_size); |
| 1201 | } | 1203 | } |
| 1202 | 1204 | ||
| 1205 | static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, | ||
| 1206 | unsigned long old_opts, int flags) | ||
| 1207 | { | ||
| 1208 | set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); | ||
| 1209 | |||
| 1210 | if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && | ||
| 1211 | (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || | ||
| 1212 | (flags & MS_RDONLY))) { | ||
| 1213 | /* wait for any defraggers to finish */ | ||
| 1214 | wait_event(fs_info->transaction_wait, | ||
| 1215 | (atomic_read(&fs_info->defrag_running) == 0)); | ||
| 1216 | if (flags & MS_RDONLY) | ||
| 1217 | sync_filesystem(fs_info->sb); | ||
| 1218 | } | ||
| 1219 | } | ||
| 1220 | |||
| 1221 | static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, | ||
| 1222 | unsigned long old_opts) | ||
| 1223 | { | ||
| 1224 | /* | ||
| 1225 | * We need to clean up all defraggable inodes if autodefrag is | ||
| 1226 | * turned off or the fs is R/O. | ||
| 1227 | */ | ||
| 1228 | if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && | ||
| 1229 | (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || | ||
| 1230 | (fs_info->sb->s_flags & MS_RDONLY))) { | ||
| 1231 | btrfs_cleanup_defrag_inodes(fs_info); | ||
| 1232 | } | ||
| 1233 | |||
| 1234 | clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); | ||
| 1235 | } | ||
| 1236 | |||
| 1203 | static int btrfs_remount(struct super_block *sb, int *flags, char *data) | 1237 | static int btrfs_remount(struct super_block *sb, int *flags, char *data) |
| 1204 | { | 1238 | { |
| 1205 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); | 1239 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); |
| @@ -1213,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
| 1213 | unsigned int old_metadata_ratio = fs_info->metadata_ratio; | 1247 | unsigned int old_metadata_ratio = fs_info->metadata_ratio; |
| 1214 | int ret; | 1248 | int ret; |
| 1215 | 1249 | ||
| 1250 | btrfs_remount_prepare(fs_info, old_opts, *flags); | ||
| 1251 | |||
| 1216 | ret = btrfs_parse_options(root, data); | 1252 | ret = btrfs_parse_options(root, data); |
| 1217 | if (ret) { | 1253 | if (ret) { |
| 1218 | ret = -EINVAL; | 1254 | ret = -EINVAL; |
| @@ -1223,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
| 1223 | fs_info->thread_pool_size, old_thread_pool_size); | 1259 | fs_info->thread_pool_size, old_thread_pool_size); |
| 1224 | 1260 | ||
| 1225 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) | 1261 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) |
| 1226 | return 0; | 1262 | goto out; |
| 1227 | 1263 | ||
| 1228 | if (*flags & MS_RDONLY) { | 1264 | if (*flags & MS_RDONLY) { |
| 1229 | /* | 1265 | /* |
| @@ -1278,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
| 1278 | } | 1314 | } |
| 1279 | sb->s_flags &= ~MS_RDONLY; | 1315 | sb->s_flags &= ~MS_RDONLY; |
| 1280 | } | 1316 | } |
| 1281 | 1317 | out: | |
| 1318 | btrfs_remount_cleanup(fs_info, old_opts); | ||
| 1282 | return 0; | 1319 | return 0; |
| 1283 | 1320 | ||
| 1284 | restore: | 1321 | restore: |
| @@ -1289,10 +1326,13 @@ restore: | |||
| 1289 | fs_info->mount_opt = old_opts; | 1326 | fs_info->mount_opt = old_opts; |
| 1290 | fs_info->compress_type = old_compress_type; | 1327 | fs_info->compress_type = old_compress_type; |
| 1291 | fs_info->max_inline = old_max_inline; | 1328 | fs_info->max_inline = old_max_inline; |
| 1329 | mutex_lock(&fs_info->chunk_mutex); | ||
| 1292 | fs_info->alloc_start = old_alloc_start; | 1330 | fs_info->alloc_start = old_alloc_start; |
| 1331 | mutex_unlock(&fs_info->chunk_mutex); | ||
| 1293 | btrfs_resize_thread_pool(fs_info, | 1332 | btrfs_resize_thread_pool(fs_info, |
| 1294 | old_thread_pool_size, fs_info->thread_pool_size); | 1333 | old_thread_pool_size, fs_info->thread_pool_size); |
| 1295 | fs_info->metadata_ratio = old_metadata_ratio; | 1334 | fs_info->metadata_ratio = old_metadata_ratio; |
| 1335 | btrfs_remount_cleanup(fs_info, old_opts); | ||
| 1296 | return ret; | 1336 | return ret; |
| 1297 | } | 1337 | } |
| 1298 | 1338 | ||
| @@ -1559,7 +1599,7 @@ static int btrfs_freeze(struct super_block *sb) | |||
| 1559 | struct btrfs_trans_handle *trans; | 1599 | struct btrfs_trans_handle *trans; |
| 1560 | struct btrfs_root *root = btrfs_sb(sb)->tree_root; | 1600 | struct btrfs_root *root = btrfs_sb(sb)->tree_root; |
| 1561 | 1601 | ||
| 1562 | trans = btrfs_attach_transaction(root); | 1602 | trans = btrfs_attach_transaction_barrier(root); |
| 1563 | if (IS_ERR(trans)) { | 1603 | if (IS_ERR(trans)) { |
| 1564 | /* no transaction, don't bother */ | 1604 | /* no transaction, don't bother */ |
| 1565 | if (PTR_ERR(trans) == -ENOENT) | 1605 | if (PTR_ERR(trans) == -ENOENT) |
| @@ -1684,10 +1724,14 @@ static int __init init_btrfs_fs(void) | |||
| 1684 | if (err) | 1724 | if (err) |
| 1685 | goto free_delayed_inode; | 1725 | goto free_delayed_inode; |
| 1686 | 1726 | ||
| 1687 | err = btrfs_interface_init(); | 1727 | err = btrfs_delayed_ref_init(); |
| 1688 | if (err) | 1728 | if (err) |
| 1689 | goto free_auto_defrag; | 1729 | goto free_auto_defrag; |
| 1690 | 1730 | ||
| 1731 | err = btrfs_interface_init(); | ||
| 1732 | if (err) | ||
| 1733 | goto free_delayed_ref; | ||
| 1734 | |||
| 1691 | err = register_filesystem(&btrfs_fs_type); | 1735 | err = register_filesystem(&btrfs_fs_type); |
| 1692 | if (err) | 1736 | if (err) |
| 1693 | goto unregister_ioctl; | 1737 | goto unregister_ioctl; |
| @@ -1699,6 +1743,8 @@ static int __init init_btrfs_fs(void) | |||
| 1699 | 1743 | ||
| 1700 | unregister_ioctl: | 1744 | unregister_ioctl: |
| 1701 | btrfs_interface_exit(); | 1745 | btrfs_interface_exit(); |
| 1746 | free_delayed_ref: | ||
| 1747 | btrfs_delayed_ref_exit(); | ||
| 1702 | free_auto_defrag: | 1748 | free_auto_defrag: |
| 1703 | btrfs_auto_defrag_exit(); | 1749 | btrfs_auto_defrag_exit(); |
| 1704 | free_delayed_inode: | 1750 | free_delayed_inode: |
| @@ -1720,6 +1766,7 @@ free_compress: | |||
| 1720 | static void __exit exit_btrfs_fs(void) | 1766 | static void __exit exit_btrfs_fs(void) |
| 1721 | { | 1767 | { |
| 1722 | btrfs_destroy_cachep(); | 1768 | btrfs_destroy_cachep(); |
| 1769 | btrfs_delayed_ref_exit(); | ||
| 1723 | btrfs_auto_defrag_exit(); | 1770 | btrfs_auto_defrag_exit(); |
| 1724 | btrfs_delayed_inode_exit(); | 1771 | btrfs_delayed_inode_exit(); |
| 1725 | ordered_data_exit(); | 1772 | ordered_data_exit(); |
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index daac9ae6d731..5b326cd60a4a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c | |||
| @@ -21,7 +21,6 @@ | |||
| 21 | #include <linux/spinlock.h> | 21 | #include <linux/spinlock.h> |
| 22 | #include <linux/completion.h> | 22 | #include <linux/completion.h> |
| 23 | #include <linux/buffer_head.h> | 23 | #include <linux/buffer_head.h> |
| 24 | #include <linux/module.h> | ||
| 25 | #include <linux/kobject.h> | 24 | #include <linux/kobject.h> |
| 26 | 25 | ||
| 27 | #include "ctree.h" | 26 | #include "ctree.h" |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4c0067c4f76d..e52da6fb1165 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
| @@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction) | |||
| 40 | if (atomic_dec_and_test(&transaction->use_count)) { | 40 | if (atomic_dec_and_test(&transaction->use_count)) { |
| 41 | BUG_ON(!list_empty(&transaction->list)); | 41 | BUG_ON(!list_empty(&transaction->list)); |
| 42 | WARN_ON(transaction->delayed_refs.root.rb_node); | 42 | WARN_ON(transaction->delayed_refs.root.rb_node); |
| 43 | memset(transaction, 0, sizeof(*transaction)); | ||
| 44 | kmem_cache_free(btrfs_transaction_cachep, transaction); | 43 | kmem_cache_free(btrfs_transaction_cachep, transaction); |
| 45 | } | 44 | } |
| 46 | } | 45 | } |
| @@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) | |||
| 51 | root->commit_root = btrfs_root_node(root); | 50 | root->commit_root = btrfs_root_node(root); |
| 52 | } | 51 | } |
| 53 | 52 | ||
| 53 | static inline int can_join_transaction(struct btrfs_transaction *trans, | ||
| 54 | int type) | ||
| 55 | { | ||
| 56 | return !(trans->in_commit && | ||
| 57 | type != TRANS_JOIN && | ||
| 58 | type != TRANS_JOIN_NOLOCK); | ||
| 59 | } | ||
| 60 | |||
| 54 | /* | 61 | /* |
| 55 | * either allocate a new transaction or hop into the existing one | 62 | * either allocate a new transaction or hop into the existing one |
| 56 | */ | 63 | */ |
| @@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type) | |||
| 62 | spin_lock(&fs_info->trans_lock); | 69 | spin_lock(&fs_info->trans_lock); |
| 63 | loop: | 70 | loop: |
| 64 | /* The file system has been taken offline. No new transactions. */ | 71 | /* The file system has been taken offline. No new transactions. */ |
| 65 | if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 72 | if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { |
| 66 | spin_unlock(&fs_info->trans_lock); | 73 | spin_unlock(&fs_info->trans_lock); |
| 67 | return -EROFS; | 74 | return -EROFS; |
| 68 | } | 75 | } |
| @@ -86,6 +93,10 @@ loop: | |||
| 86 | spin_unlock(&fs_info->trans_lock); | 93 | spin_unlock(&fs_info->trans_lock); |
| 87 | return cur_trans->aborted; | 94 | return cur_trans->aborted; |
| 88 | } | 95 | } |
| 96 | if (!can_join_transaction(cur_trans, type)) { | ||
| 97 | spin_unlock(&fs_info->trans_lock); | ||
| 98 | return -EBUSY; | ||
| 99 | } | ||
| 89 | atomic_inc(&cur_trans->use_count); | 100 | atomic_inc(&cur_trans->use_count); |
| 90 | atomic_inc(&cur_trans->num_writers); | 101 | atomic_inc(&cur_trans->num_writers); |
| 91 | cur_trans->num_joined++; | 102 | cur_trans->num_joined++; |
| @@ -113,7 +124,7 @@ loop: | |||
| 113 | */ | 124 | */ |
| 114 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); | 125 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); |
| 115 | goto loop; | 126 | goto loop; |
| 116 | } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 127 | } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { |
| 117 | spin_unlock(&fs_info->trans_lock); | 128 | spin_unlock(&fs_info->trans_lock); |
| 118 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); | 129 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); |
| 119 | return -EROFS; | 130 | return -EROFS; |
| @@ -155,8 +166,12 @@ loop: | |||
| 155 | 166 | ||
| 156 | spin_lock_init(&cur_trans->commit_lock); | 167 | spin_lock_init(&cur_trans->commit_lock); |
| 157 | spin_lock_init(&cur_trans->delayed_refs.lock); | 168 | spin_lock_init(&cur_trans->delayed_refs.lock); |
| 169 | atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); | ||
| 170 | atomic_set(&cur_trans->delayed_refs.ref_seq, 0); | ||
| 171 | init_waitqueue_head(&cur_trans->delayed_refs.wait); | ||
| 158 | 172 | ||
| 159 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); | 173 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); |
| 174 | INIT_LIST_HEAD(&cur_trans->ordered_operations); | ||
| 160 | list_add_tail(&cur_trans->list, &fs_info->trans_list); | 175 | list_add_tail(&cur_trans->list, &fs_info->trans_list); |
| 161 | extent_io_tree_init(&cur_trans->dirty_pages, | 176 | extent_io_tree_init(&cur_trans->dirty_pages, |
| 162 | fs_info->btree_inode->i_mapping); | 177 | fs_info->btree_inode->i_mapping); |
| @@ -301,7 +316,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, | |||
| 301 | int ret; | 316 | int ret; |
| 302 | u64 qgroup_reserved = 0; | 317 | u64 qgroup_reserved = 0; |
| 303 | 318 | ||
| 304 | if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) | 319 | if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) |
| 305 | return ERR_PTR(-EROFS); | 320 | return ERR_PTR(-EROFS); |
| 306 | 321 | ||
| 307 | if (current->journal_info) { | 322 | if (current->journal_info) { |
| @@ -359,8 +374,11 @@ again: | |||
| 359 | 374 | ||
| 360 | do { | 375 | do { |
| 361 | ret = join_transaction(root, type); | 376 | ret = join_transaction(root, type); |
| 362 | if (ret == -EBUSY) | 377 | if (ret == -EBUSY) { |
| 363 | wait_current_trans(root); | 378 | wait_current_trans(root); |
| 379 | if (unlikely(type == TRANS_ATTACH)) | ||
| 380 | ret = -ENOENT; | ||
| 381 | } | ||
| 364 | } while (ret == -EBUSY); | 382 | } while (ret == -EBUSY); |
| 365 | 383 | ||
| 366 | if (ret < 0) { | 384 | if (ret < 0) { |
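With can_join_transaction() refusing new joiners once a commit has started, the retry loop above gains one subtlety: a TRANS_ATTACH caller that hits -EBUSY waits out the commit and then reports -ENOENT, because by that point there is no longer a running transaction to attach to. A hedged sketch of what such a caller observes, following the freeze path shown earlier:

	trans = btrfs_attach_transaction(root);
	if (IS_ERR(trans)) {
		if (PTR_ERR(trans) == -ENOENT)
			return 0;	/* nothing running, nothing to commit */
		return PTR_ERR(trans);	/* e.g. -EROFS after an fs error */
	}
	return btrfs_commit_transaction(trans, root);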
| @@ -382,9 +400,10 @@ again: | |||
| 382 | h->block_rsv = NULL; | 400 | h->block_rsv = NULL; |
| 383 | h->orig_rsv = NULL; | 401 | h->orig_rsv = NULL; |
| 384 | h->aborted = 0; | 402 | h->aborted = 0; |
| 385 | h->qgroup_reserved = qgroup_reserved; | 403 | h->qgroup_reserved = 0; |
| 386 | h->delayed_ref_elem.seq = 0; | 404 | h->delayed_ref_elem.seq = 0; |
| 387 | h->type = type; | 405 | h->type = type; |
| 406 | h->allocating_chunk = false; | ||
| 388 | INIT_LIST_HEAD(&h->qgroup_ref_list); | 407 | INIT_LIST_HEAD(&h->qgroup_ref_list); |
| 389 | INIT_LIST_HEAD(&h->new_bgs); | 408 | INIT_LIST_HEAD(&h->new_bgs); |
| 390 | 409 | ||
| @@ -400,6 +419,7 @@ again: | |||
| 400 | h->block_rsv = &root->fs_info->trans_block_rsv; | 419 | h->block_rsv = &root->fs_info->trans_block_rsv; |
| 401 | h->bytes_reserved = num_bytes; | 420 | h->bytes_reserved = num_bytes; |
| 402 | } | 421 | } |
| 422 | h->qgroup_reserved = qgroup_reserved; | ||
| 403 | 423 | ||
| 404 | got_it: | 424 | got_it: |
| 405 | btrfs_record_root_in_trans(h, root); | 425 | btrfs_record_root_in_trans(h, root); |
| @@ -451,11 +471,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root | |||
| 451 | return start_transaction(root, 0, TRANS_USERSPACE, 0); | 471 | return start_transaction(root, 0, TRANS_USERSPACE, 0); |
| 452 | } | 472 | } |
| 453 | 473 | ||
| 474 | /* | ||
| 475 | * btrfs_attach_transaction() - catch the running transaction | ||
| 476 | * | ||
| 477 | * It is used when we want to commit the current transaction, but | ||
| 478 | * don't want to start a new one. | ||
| 479 | * | ||
| 480 | * Note: If this function returns -ENOENT, it just means there is no | ||
| 481 | * running transaction. But it is possible that an inactive transaction | ||
| 482 | * is still in memory, not fully on disk. If you need to be sure there is no | ||
| 483 | * inactive transaction in the fs when -ENOENT is returned, you should | ||
| 484 | * invoke | ||
| 485 | * btrfs_attach_transaction_barrier() | ||
| 486 | */ | ||
| 454 | struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) | 487 | struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) |
| 455 | { | 488 | { |
| 456 | return start_transaction(root, 0, TRANS_ATTACH, 0); | 489 | return start_transaction(root, 0, TRANS_ATTACH, 0); |
| 457 | } | 490 | } |
| 458 | 491 | ||
| 492 | /* | ||
| 493 | * btrfs_attach_transaction_barrier() - catch the running transaction | ||
| 494 | * | ||
| 495 | * It is similar to the above function; the difference is that this one | ||
| 496 | * will also wait for all inactive transactions until they fully | ||
| 497 | * complete. | ||
| 498 | */ | ||
| 499 | struct btrfs_trans_handle * | ||
| 500 | btrfs_attach_transaction_barrier(struct btrfs_root *root) | ||
| 501 | { | ||
| 502 | struct btrfs_trans_handle *trans; | ||
| 503 | |||
| 504 | trans = start_transaction(root, 0, TRANS_ATTACH, 0); | ||
| 505 | if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) | ||
| 506 | btrfs_wait_for_commit(root, 0); | ||
| 507 | |||
| 508 | return trans; | ||
| 509 | } | ||
| 510 | |||
| 459 | /* wait for a transaction commit to be fully complete */ | 511 | /* wait for a transaction commit to be fully complete */ |
| 460 | static noinline void wait_for_commit(struct btrfs_root *root, | 512 | static noinline void wait_for_commit(struct btrfs_root *root, |
| 461 | struct btrfs_transaction *commit) | 513 | struct btrfs_transaction *commit) |
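The barrier variant exists for callers like btrfs_freeze() (switched over in an earlier hunk): plain attach can return -ENOENT while a just-finished transaction is still being written out, whereas the barrier version calls btrfs_wait_for_commit(root, 0) in that case, so by the time it returns -ENOENT everything really is on disk. The distinction, sketched with the APIs from this patch:

	/* may report "no transaction" while a commit is still in flight */
	trans = btrfs_attach_transaction(root);

	/* reports "no transaction" only after prior commits hit the disk,
	 * which is what freeze/sync style callers actually need */
	trans = btrfs_attach_transaction_barrier(root);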
| @@ -587,7 +639,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
| 587 | if (!list_empty(&trans->new_bgs)) | 639 | if (!list_empty(&trans->new_bgs)) |
| 588 | btrfs_create_pending_block_groups(trans, root); | 640 | btrfs_create_pending_block_groups(trans, root); |
| 589 | 641 | ||
| 590 | while (count < 2) { | 642 | while (count < 1) { |
| 591 | unsigned long cur = trans->delayed_ref_updates; | 643 | unsigned long cur = trans->delayed_ref_updates; |
| 592 | trans->delayed_ref_updates = 0; | 644 | trans->delayed_ref_updates = 0; |
| 593 | if (cur && | 645 | if (cur && |
| @@ -599,6 +651,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
| 599 | } | 651 | } |
| 600 | count++; | 652 | count++; |
| 601 | } | 653 | } |
| 654 | |||
| 602 | btrfs_trans_release_metadata(trans, root); | 655 | btrfs_trans_release_metadata(trans, root); |
| 603 | trans->block_rsv = NULL; | 656 | trans->block_rsv = NULL; |
| 604 | 657 | ||
| @@ -644,12 +697,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
| 644 | btrfs_run_delayed_iputs(root); | 697 | btrfs_run_delayed_iputs(root); |
| 645 | 698 | ||
| 646 | if (trans->aborted || | 699 | if (trans->aborted || |
| 647 | root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 700 | test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) |
| 648 | err = -EIO; | 701 | err = -EIO; |
| 649 | } | ||
| 650 | assert_qgroups_uptodate(trans); | 702 | assert_qgroups_uptodate(trans); |
| 651 | 703 | ||
| 652 | memset(trans, 0, sizeof(*trans)); | ||
| 653 | kmem_cache_free(btrfs_trans_handle_cachep, trans); | 704 | kmem_cache_free(btrfs_trans_handle_cachep, trans); |
| 654 | return err; | 705 | return err; |
| 655 | } | 706 | } |
| @@ -696,7 +747,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
| 696 | struct extent_state *cached_state = NULL; | 747 | struct extent_state *cached_state = NULL; |
| 697 | u64 start = 0; | 748 | u64 start = 0; |
| 698 | u64 end; | 749 | u64 end; |
| 750 | struct blk_plug plug; | ||
| 699 | 751 | ||
| 752 | blk_start_plug(&plug); | ||
| 700 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, | 753 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, |
| 701 | mark, &cached_state)) { | 754 | mark, &cached_state)) { |
| 702 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, | 755 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, |
| @@ -710,6 +763,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
| 710 | } | 763 | } |
| 711 | if (err) | 764 | if (err) |
| 712 | werr = err; | 765 | werr = err; |
| 766 | blk_finish_plug(&plug); | ||
| 713 | return werr; | 767 | return werr; |
| 714 | } | 768 | } |
| 715 | 769 | ||
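Bracketing the write loop with blk_start_plug()/blk_finish_plug() batches the bios generated for each dirty range on a per-task plug list, letting the block layer merge adjacent requests before dispatch instead of issuing them one at a time. The general shape of the pattern (the range helpers below are hypothetical placeholders):

	struct blk_plug plug;

	blk_start_plug(&plug);			/* bios now queue on the task */
	while (next_dirty_range(&start, &end))
		write_range(start, end);	/* submissions accumulate */
	blk_finish_plug(&plug);			/* flush the batch, merged */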
| @@ -960,10 +1014,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, | |||
| 960 | } | 1014 | } |
| 961 | 1015 | ||
| 962 | /* | 1016 | /* |
| 963 | * defrag a given btree. If cacheonly == 1, this won't read from the disk, | 1017 | * defrag a given btree. |
| 964 | * otherwise every leaf in the btree is read and defragged. | 1018 | * Every leaf in the btree is read and defragged. |
| 965 | */ | 1019 | */ |
| 966 | int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | 1020 | int btrfs_defrag_root(struct btrfs_root *root) |
| 967 | { | 1021 | { |
| 968 | struct btrfs_fs_info *info = root->fs_info; | 1022 | struct btrfs_fs_info *info = root->fs_info; |
| 969 | struct btrfs_trans_handle *trans; | 1023 | struct btrfs_trans_handle *trans; |
| @@ -977,7 +1031,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | |||
| 977 | if (IS_ERR(trans)) | 1031 | if (IS_ERR(trans)) |
| 978 | return PTR_ERR(trans); | 1032 | return PTR_ERR(trans); |
| 979 | 1033 | ||
| 980 | ret = btrfs_defrag_leaves(trans, root, cacheonly); | 1034 | ret = btrfs_defrag_leaves(trans, root); |
| 981 | 1035 | ||
| 982 | btrfs_end_transaction(trans, root); | 1036 | btrfs_end_transaction(trans, root); |
| 983 | btrfs_btree_balance_dirty(info->tree_root); | 1037 | btrfs_btree_balance_dirty(info->tree_root); |
| @@ -985,6 +1039,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | |||
| 985 | 1039 | ||
| 986 | if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) | 1040 | if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) |
| 987 | break; | 1041 | break; |
| 1042 | |||
| 1043 | if (btrfs_defrag_cancelled(root->fs_info)) { | ||
| 1044 | printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); | ||
| 1045 | ret = -EAGAIN; | ||
| 1046 | break; | ||
| 1047 | } | ||
| 988 | } | 1048 | } |
| 989 | root->defrag_running = 0; | 1049 | root->defrag_running = 0; |
| 990 | return ret; | 1050 | return ret; |
| @@ -1007,7 +1067,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
| 1007 | struct inode *parent_inode; | 1067 | struct inode *parent_inode; |
| 1008 | struct btrfs_path *path; | 1068 | struct btrfs_path *path; |
| 1009 | struct btrfs_dir_item *dir_item; | 1069 | struct btrfs_dir_item *dir_item; |
| 1010 | struct dentry *parent; | ||
| 1011 | struct dentry *dentry; | 1070 | struct dentry *dentry; |
| 1012 | struct extent_buffer *tmp; | 1071 | struct extent_buffer *tmp; |
| 1013 | struct extent_buffer *old; | 1072 | struct extent_buffer *old; |
| @@ -1022,7 +1081,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
| 1022 | path = btrfs_alloc_path(); | 1081 | path = btrfs_alloc_path(); |
| 1023 | if (!path) { | 1082 | if (!path) { |
| 1024 | ret = pending->error = -ENOMEM; | 1083 | ret = pending->error = -ENOMEM; |
| 1025 | goto path_alloc_fail; | 1084 | return ret; |
| 1026 | } | 1085 | } |
| 1027 | 1086 | ||
| 1028 | new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); | 1087 | new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); |
| @@ -1062,10 +1121,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
| 1062 | 1121 | ||
| 1063 | rsv = trans->block_rsv; | 1122 | rsv = trans->block_rsv; |
| 1064 | trans->block_rsv = &pending->block_rsv; | 1123 | trans->block_rsv = &pending->block_rsv; |
| 1124 | trans->bytes_reserved = trans->block_rsv->reserved; | ||
| 1065 | 1125 | ||
| 1066 | dentry = pending->dentry; | 1126 | dentry = pending->dentry; |
| 1067 | parent = dget_parent(dentry); | 1127 | parent_inode = pending->dir; |
| 1068 | parent_inode = parent->d_inode; | ||
| 1069 | parent_root = BTRFS_I(parent_inode)->root; | 1128 | parent_root = BTRFS_I(parent_inode)->root; |
| 1070 | record_root_in_trans(trans, parent_root); | 1129 | record_root_in_trans(trans, parent_root); |
| 1071 | 1130 | ||
| @@ -1213,14 +1272,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
| 1213 | if (ret) | 1272 | if (ret) |
| 1214 | btrfs_abort_transaction(trans, root, ret); | 1273 | btrfs_abort_transaction(trans, root, ret); |
| 1215 | fail: | 1274 | fail: |
| 1216 | dput(parent); | ||
| 1217 | trans->block_rsv = rsv; | 1275 | trans->block_rsv = rsv; |
| 1276 | trans->bytes_reserved = 0; | ||
| 1218 | no_free_objectid: | 1277 | no_free_objectid: |
| 1219 | kfree(new_root_item); | 1278 | kfree(new_root_item); |
| 1220 | root_item_alloc_fail: | 1279 | root_item_alloc_fail: |
| 1221 | btrfs_free_path(path); | 1280 | btrfs_free_path(path); |
| 1222 | path_alloc_fail: | ||
| 1223 | btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); | ||
| 1224 | return ret; | 1281 | return ret; |
| 1225 | } | 1282 | } |
| 1226 | 1283 | ||
| @@ -1306,13 +1363,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, | |||
| 1306 | struct btrfs_async_commit { | 1363 | struct btrfs_async_commit { |
| 1307 | struct btrfs_trans_handle *newtrans; | 1364 | struct btrfs_trans_handle *newtrans; |
| 1308 | struct btrfs_root *root; | 1365 | struct btrfs_root *root; |
| 1309 | struct delayed_work work; | 1366 | struct work_struct work; |
| 1310 | }; | 1367 | }; |
| 1311 | 1368 | ||
| 1312 | static void do_async_commit(struct work_struct *work) | 1369 | static void do_async_commit(struct work_struct *work) |
| 1313 | { | 1370 | { |
| 1314 | struct btrfs_async_commit *ac = | 1371 | struct btrfs_async_commit *ac = |
| 1315 | container_of(work, struct btrfs_async_commit, work.work); | 1372 | container_of(work, struct btrfs_async_commit, work); |
| 1316 | 1373 | ||
| 1317 | /* | 1374 | /* |
| 1318 | * We've got freeze protection passed with the transaction. | 1375 | * We've got freeze protection passed with the transaction. |
| @@ -1340,7 +1397,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, | |||
| 1340 | if (!ac) | 1397 | if (!ac) |
| 1341 | return -ENOMEM; | 1398 | return -ENOMEM; |
| 1342 | 1399 | ||
| 1343 | INIT_DELAYED_WORK(&ac->work, do_async_commit); | 1400 | INIT_WORK(&ac->work, do_async_commit); |
| 1344 | ac->root = root; | 1401 | ac->root = root; |
| 1345 | ac->newtrans = btrfs_join_transaction(root); | 1402 | ac->newtrans = btrfs_join_transaction(root); |
| 1346 | if (IS_ERR(ac->newtrans)) { | 1403 | if (IS_ERR(ac->newtrans)) { |
| @@ -1364,7 +1421,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, | |||
| 1364 | &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], | 1421 | &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], |
| 1365 | 1, _THIS_IP_); | 1422 | 1, _THIS_IP_); |
| 1366 | 1423 | ||
| 1367 | schedule_delayed_work(&ac->work, 0); | 1424 | schedule_work(&ac->work); |
| 1368 | 1425 | ||
| 1369 | /* wait for transaction to start and unblock */ | 1426 | /* wait for transaction to start and unblock */ |
| 1370 | if (wait_for_unblock) | 1427 | if (wait_for_unblock) |
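The async commit always scheduled its delayed_work with a zero delay, so the patch drops the timer machinery in favor of a plain work_struct plus schedule_work(). The reduced pattern, sketched on a hypothetical container struct:

	struct my_async {			/* hypothetical example */
		struct work_struct work;
		int payload;
	};

	static void my_fn(struct work_struct *work)
	{
		struct my_async *a = container_of(work, struct my_async, work);
		/* ... use a->payload, then free a ... */
	}

	INIT_WORK(&a->work, my_fn);
	schedule_work(&a->work);	/* runs soon on the system workqueue */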
| @@ -1384,6 +1441,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, | |||
| 1384 | struct btrfs_root *root, int err) | 1441 | struct btrfs_root *root, int err) |
| 1385 | { | 1442 | { |
| 1386 | struct btrfs_transaction *cur_trans = trans->transaction; | 1443 | struct btrfs_transaction *cur_trans = trans->transaction; |
| 1444 | DEFINE_WAIT(wait); | ||
| 1387 | 1445 | ||
| 1388 | WARN_ON(trans->use_count > 1); | 1446 | WARN_ON(trans->use_count > 1); |
| 1389 | 1447 | ||
| @@ -1392,8 +1450,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, | |||
| 1392 | spin_lock(&root->fs_info->trans_lock); | 1450 | spin_lock(&root->fs_info->trans_lock); |
| 1393 | list_del_init(&cur_trans->list); | 1451 | list_del_init(&cur_trans->list); |
| 1394 | if (cur_trans == root->fs_info->running_transaction) { | 1452 | if (cur_trans == root->fs_info->running_transaction) { |
| 1453 | root->fs_info->trans_no_join = 1; | ||
| 1454 | spin_unlock(&root->fs_info->trans_lock); | ||
| 1455 | wait_event(cur_trans->writer_wait, | ||
| 1456 | atomic_read(&cur_trans->num_writers) == 1); | ||
| 1457 | |||
| 1458 | spin_lock(&root->fs_info->trans_lock); | ||
| 1395 | root->fs_info->running_transaction = NULL; | 1459 | root->fs_info->running_transaction = NULL; |
| 1396 | root->fs_info->trans_no_join = 0; | ||
| 1397 | } | 1460 | } |
| 1398 | spin_unlock(&root->fs_info->trans_lock); | 1461 | spin_unlock(&root->fs_info->trans_lock); |
| 1399 | 1462 | ||
| @@ -1427,7 +1490,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, | |||
| 1427 | } | 1490 | } |
| 1428 | 1491 | ||
| 1429 | if (flush_on_commit || snap_pending) { | 1492 | if (flush_on_commit || snap_pending) { |
| 1430 | btrfs_start_delalloc_inodes(root, 1); | 1493 | ret = btrfs_start_delalloc_inodes(root, 1); |
| 1494 | if (ret) | ||
| 1495 | return ret; | ||
| 1431 | btrfs_wait_ordered_extents(root, 1); | 1496 | btrfs_wait_ordered_extents(root, 1); |
| 1432 | } | 1497 | } |
| 1433 | 1498 | ||
| @@ -1449,9 +1514,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, | |||
| 1449 | * it here and know for sure that nothing new will be added | 1514 | * it here and know for sure that nothing new will be added |
| 1450 | * to the list | 1515 | * to the list |
| 1451 | */ | 1516 | */ |
| 1452 | btrfs_run_ordered_operations(root, 1); | 1517 | ret = btrfs_run_ordered_operations(trans, root, 1); |
| 1453 | 1518 | ||
| 1454 | return 0; | 1519 | return ret; |
| 1455 | } | 1520 | } |
| 1456 | 1521 | ||
| 1457 | /* | 1522 | /* |
| @@ -1472,27 +1537,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1472 | int should_grow = 0; | 1537 | int should_grow = 0; |
| 1473 | unsigned long now = get_seconds(); | 1538 | unsigned long now = get_seconds(); |
| 1474 | 1539 | ||
| 1475 | ret = btrfs_run_ordered_operations(root, 0); | 1540 | ret = btrfs_run_ordered_operations(trans, root, 0); |
| 1476 | if (ret) { | 1541 | if (ret) { |
| 1477 | btrfs_abort_transaction(trans, root, ret); | 1542 | btrfs_abort_transaction(trans, root, ret); |
| 1478 | goto cleanup_transaction; | 1543 | btrfs_end_transaction(trans, root); |
| 1544 | return ret; | ||
| 1479 | } | 1545 | } |
| 1480 | 1546 | ||
| 1481 | /* Stop the commit early if ->aborted is set */ | 1547 | /* Stop the commit early if ->aborted is set */ |
| 1482 | if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { | 1548 | if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { |
| 1483 | ret = cur_trans->aborted; | 1549 | ret = cur_trans->aborted; |
| 1484 | goto cleanup_transaction; | 1550 | btrfs_end_transaction(trans, root); |
| 1551 | return ret; | ||
| 1485 | } | 1552 | } |
| 1486 | 1553 | ||
| 1487 | /* make a pass through all the delayed refs we have so far | 1554 | /* make a pass through all the delayed refs we have so far |
| 1488 | * any running procs may add more while we are here | 1555 | * any running procs may add more while we are here |
| 1489 | */ | 1556 | */ |
| 1490 | ret = btrfs_run_delayed_refs(trans, root, 0); | 1557 | ret = btrfs_run_delayed_refs(trans, root, 0); |
| 1491 | if (ret) | 1558 | if (ret) { |
| 1492 | goto cleanup_transaction; | 1559 | btrfs_end_transaction(trans, root); |
| 1560 | return ret; | ||
| 1561 | } | ||
| 1493 | 1562 | ||
| 1494 | btrfs_trans_release_metadata(trans, root); | 1563 | btrfs_trans_release_metadata(trans, root); |
| 1495 | trans->block_rsv = NULL; | 1564 | trans->block_rsv = NULL; |
| 1565 | if (trans->qgroup_reserved) { | ||
| 1566 | btrfs_qgroup_free(root, trans->qgroup_reserved); | ||
| 1567 | trans->qgroup_reserved = 0; | ||
| 1568 | } | ||
| 1496 | 1569 | ||
| 1497 | cur_trans = trans->transaction; | 1570 | cur_trans = trans->transaction; |
| 1498 | 1571 | ||
| @@ -1506,8 +1579,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1506 | btrfs_create_pending_block_groups(trans, root); | 1579 | btrfs_create_pending_block_groups(trans, root); |
| 1507 | 1580 | ||
| 1508 | ret = btrfs_run_delayed_refs(trans, root, 0); | 1581 | ret = btrfs_run_delayed_refs(trans, root, 0); |
| 1509 | if (ret) | 1582 | if (ret) { |
| 1510 | goto cleanup_transaction; | 1583 | btrfs_end_transaction(trans, root); |
| 1584 | return ret; | ||
| 1585 | } | ||
| 1511 | 1586 | ||
| 1512 | spin_lock(&cur_trans->commit_lock); | 1587 | spin_lock(&cur_trans->commit_lock); |
| 1513 | if (cur_trans->in_commit) { | 1588 | if (cur_trans->in_commit) { |
| @@ -1771,6 +1846,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1771 | cleanup_transaction: | 1846 | cleanup_transaction: |
| 1772 | btrfs_trans_release_metadata(trans, root); | 1847 | btrfs_trans_release_metadata(trans, root); |
| 1773 | trans->block_rsv = NULL; | 1848 | trans->block_rsv = NULL; |
| 1849 | if (trans->qgroup_reserved) { | ||
| 1850 | btrfs_qgroup_free(root, trans->qgroup_reserved); | ||
| 1851 | trans->qgroup_reserved = 0; | ||
| 1852 | } | ||
| 1774 | btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); | 1853 | btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); |
| 1775 | // WARN_ON(1); | 1854 | // WARN_ON(1); |
| 1776 | if (current->journal_info == trans) | 1855 | if (current->journal_info == trans) |
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0e8aa1e6c287..3c8e0d25c8e4 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h | |||
| @@ -43,6 +43,7 @@ struct btrfs_transaction { | |||
| 43 | wait_queue_head_t writer_wait; | 43 | wait_queue_head_t writer_wait; |
| 44 | wait_queue_head_t commit_wait; | 44 | wait_queue_head_t commit_wait; |
| 45 | struct list_head pending_snapshots; | 45 | struct list_head pending_snapshots; |
| 46 | struct list_head ordered_operations; | ||
| 46 | struct btrfs_delayed_ref_root delayed_refs; | 47 | struct btrfs_delayed_ref_root delayed_refs; |
| 47 | int aborted; | 48 | int aborted; |
| 48 | }; | 49 | }; |
| @@ -68,6 +69,7 @@ struct btrfs_trans_handle { | |||
| 68 | struct btrfs_block_rsv *orig_rsv; | 69 | struct btrfs_block_rsv *orig_rsv; |
| 69 | short aborted; | 70 | short aborted; |
| 70 | short adding_csums; | 71 | short adding_csums; |
| 72 | bool allocating_chunk; | ||
| 71 | enum btrfs_trans_type type; | 73 | enum btrfs_trans_type type; |
| 72 | /* | 74 | /* |
| 73 | * this root is only needed to validate that the root passed to | 75 | * this root is only needed to validate that the root passed to |
| @@ -82,11 +84,13 @@ struct btrfs_trans_handle { | |||
| 82 | 84 | ||
| 83 | struct btrfs_pending_snapshot { | 85 | struct btrfs_pending_snapshot { |
| 84 | struct dentry *dentry; | 86 | struct dentry *dentry; |
| 87 | struct inode *dir; | ||
| 85 | struct btrfs_root *root; | 88 | struct btrfs_root *root; |
| 86 | struct btrfs_root *snap; | 89 | struct btrfs_root *snap; |
| 87 | struct btrfs_qgroup_inherit *inherit; | 90 | struct btrfs_qgroup_inherit *inherit; |
| 88 | /* block reservation for the operation */ | 91 | /* block reservation for the operation */ |
| 89 | struct btrfs_block_rsv block_rsv; | 92 | struct btrfs_block_rsv block_rsv; |
| 93 | u64 qgroup_reserved; | ||
| 90 | /* extra metadata reservation for relocation */ | 94 | /* extra metadata reservation for relocation */ |
| 91 | int error; | 95 | int error; |
| 92 | bool readonly; | 96 | bool readonly; |
| @@ -110,13 +114,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( | |||
| 110 | struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); | 114 | struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); |
| 111 | struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); | 115 | struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); |
| 112 | struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); | 116 | struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); |
| 117 | struct btrfs_trans_handle *btrfs_attach_transaction_barrier( | ||
| 118 | struct btrfs_root *root); | ||
| 113 | struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); | 119 | struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); |
| 114 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); | 120 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); |
| 115 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | 121 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, |
| 116 | struct btrfs_root *root); | 122 | struct btrfs_root *root); |
| 117 | 123 | ||
| 118 | int btrfs_add_dead_root(struct btrfs_root *root); | 124 | int btrfs_add_dead_root(struct btrfs_root *root); |
| 119 | int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); | 125 | int btrfs_defrag_root(struct btrfs_root *root); |
| 120 | int btrfs_clean_old_snapshots(struct btrfs_root *root); | 126 | int btrfs_clean_old_snapshots(struct btrfs_root *root); |
| 121 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | 127 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, |
| 122 | struct btrfs_root *root); | 128 | struct btrfs_root *root); |
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 3b580ee8ab1d..94e05c1f118a 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c | |||
| @@ -23,13 +23,14 @@ | |||
| 23 | #include "transaction.h" | 23 | #include "transaction.h" |
| 24 | #include "locking.h" | 24 | #include "locking.h" |
| 25 | 25 | ||
| 26 | /* defrag all the leaves in a given btree. If cache_only == 1, don't read | 26 | /* |
| 27 | * things from disk, otherwise read all the leaves and try to get key order to | 27 | * Defrag all the leaves in a given btree. |
| 28 | * Read all the leaves and try to get key order to | ||
| 28 | * better reflect disk order | 29 | * better reflect disk order |
| 29 | */ | 30 | */ |
| 30 | 31 | ||
| 31 | int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | 32 | int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, |
| 32 | struct btrfs_root *root, int cache_only) | 33 | struct btrfs_root *root) |
| 33 | { | 34 | { |
| 34 | struct btrfs_path *path = NULL; | 35 | struct btrfs_path *path = NULL; |
| 35 | struct btrfs_key key; | 36 | struct btrfs_key key; |
| @@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | |||
| 41 | u64 last_ret = 0; | 42 | u64 last_ret = 0; |
| 42 | u64 min_trans = 0; | 43 | u64 min_trans = 0; |
| 43 | 44 | ||
| 44 | if (cache_only) | ||
| 45 | goto out; | ||
| 46 | |||
| 47 | if (root->fs_info->extent_root == root) { | 45 | if (root->fs_info->extent_root == root) { |
| 48 | /* | 46 | /* |
| 49 | * there's recursion here right now in the tree locking, | 47 | * there's recursion here right now in the tree locking, |
| @@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | |||
| 86 | } | 84 | } |
| 87 | 85 | ||
| 88 | path->keep_locks = 1; | 86 | path->keep_locks = 1; |
| 89 | if (cache_only) | ||
| 90 | min_trans = root->defrag_trans_start; | ||
| 91 | 87 | ||
| 92 | ret = btrfs_search_forward(root, &key, NULL, path, | 88 | ret = btrfs_search_forward(root, &key, NULL, path, min_trans); |
| 93 | cache_only, min_trans); | ||
| 94 | if (ret < 0) | 89 | if (ret < 0) |
| 95 | goto out; | 90 | goto out; |
| 96 | if (ret > 0) { | 91 | if (ret > 0) { |
| @@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | |||
| 109 | goto out; | 104 | goto out; |
| 110 | } | 105 | } |
| 111 | path->slots[1] = btrfs_header_nritems(path->nodes[1]); | 106 | path->slots[1] = btrfs_header_nritems(path->nodes[1]); |
| 112 | next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, | 107 | next_key_ret = btrfs_find_next_key(root, path, &key, 1, |
| 113 | min_trans); | 108 | min_trans); |
| 114 | ret = btrfs_realloc_node(trans, root, | 109 | ret = btrfs_realloc_node(trans, root, |
| 115 | path->nodes[1], 0, | 110 | path->nodes[1], 0, |
| 116 | cache_only, &last_ret, | 111 | &last_ret, |
| 117 | &root->defrag_progress); | 112 | &root->defrag_progress); |
| 118 | if (ret) { | 113 | if (ret) { |
| 119 | WARN_ON(ret == -EAGAIN); | 114 | WARN_ON(ret == -EAGAIN); |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9027bb1e7466..c7ef569eb22a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
| @@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log, | |||
| 278 | struct walk_control *wc, u64 gen) | 278 | struct walk_control *wc, u64 gen) |
| 279 | { | 279 | { |
| 280 | if (wc->pin) | 280 | if (wc->pin) |
| 281 | btrfs_pin_extent_for_log_replay(wc->trans, | 281 | btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, |
| 282 | log->fs_info->extent_root, | ||
| 283 | eb->start, eb->len); | 282 | eb->start, eb->len); |
| 284 | 283 | ||
| 285 | if (btrfs_buffer_uptodate(eb, gen, 0)) { | 284 | if (btrfs_buffer_uptodate(eb, gen, 0)) { |
| @@ -485,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
| 485 | struct btrfs_key *key) | 484 | struct btrfs_key *key) |
| 486 | { | 485 | { |
| 487 | int found_type; | 486 | int found_type; |
| 488 | u64 mask = root->sectorsize - 1; | ||
| 489 | u64 extent_end; | 487 | u64 extent_end; |
| 490 | u64 start = key->offset; | 488 | u64 start = key->offset; |
| 491 | u64 saved_nbytes; | 489 | u64 saved_nbytes; |
| @@ -502,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
| 502 | extent_end = start + btrfs_file_extent_num_bytes(eb, item); | 500 | extent_end = start + btrfs_file_extent_num_bytes(eb, item); |
| 503 | else if (found_type == BTRFS_FILE_EXTENT_INLINE) { | 501 | else if (found_type == BTRFS_FILE_EXTENT_INLINE) { |
| 504 | size = btrfs_file_extent_inline_len(eb, item); | 502 | size = btrfs_file_extent_inline_len(eb, item); |
| 505 | extent_end = (start + size + mask) & ~mask; | 503 | extent_end = ALIGN(start + size, root->sectorsize); |
| 506 | } else { | 504 | } else { |
| 507 | ret = 0; | 505 | ret = 0; |
| 508 | goto out; | 506 | goto out; |
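The open-coded (start + size + mask) & ~mask rounding is replaced with the kernel's ALIGN() macro, which performs the same power-of-two round-up. A standalone userspace check of the equivalence (ALIGN reproduced here for illustration; btrfs sector sizes are powers of two):

	#include <assert.h>
	#include <stdint.h>

	#define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

	int main(void)
	{
		uint64_t sectorsize = 4096;
		uint64_t mask = sectorsize - 1;

		for (uint64_t v = 0; v < 3 * sectorsize; v += 511)
			assert(((v + mask) & ~mask) == ALIGN(v, sectorsize));
		return 0;
	}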
| @@ -2281,6 +2279,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2281 | unsigned long log_transid = 0; | 2279 | unsigned long log_transid = 0; |
| 2282 | 2280 | ||
| 2283 | mutex_lock(&root->log_mutex); | 2281 | mutex_lock(&root->log_mutex); |
| 2282 | log_transid = root->log_transid; | ||
| 2284 | index1 = root->log_transid % 2; | 2283 | index1 = root->log_transid % 2; |
| 2285 | if (atomic_read(&root->log_commit[index1])) { | 2284 | if (atomic_read(&root->log_commit[index1])) { |
| 2286 | wait_log_commit(trans, root, root->log_transid); | 2285 | wait_log_commit(trans, root, root->log_transid); |
| @@ -2308,11 +2307,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2308 | /* bail out if we need to do a full commit */ | 2307 | /* bail out if we need to do a full commit */ |
| 2309 | if (root->fs_info->last_trans_log_full_commit == trans->transid) { | 2308 | if (root->fs_info->last_trans_log_full_commit == trans->transid) { |
| 2310 | ret = -EAGAIN; | 2309 | ret = -EAGAIN; |
| 2310 | btrfs_free_logged_extents(log, log_transid); | ||
| 2311 | mutex_unlock(&root->log_mutex); | 2311 | mutex_unlock(&root->log_mutex); |
| 2312 | goto out; | 2312 | goto out; |
| 2313 | } | 2313 | } |
| 2314 | 2314 | ||
| 2315 | log_transid = root->log_transid; | ||
| 2316 | if (log_transid % 2 == 0) | 2315 | if (log_transid % 2 == 0) |
| 2317 | mark = EXTENT_DIRTY; | 2316 | mark = EXTENT_DIRTY; |
| 2318 | else | 2317 | else |
| @@ -2324,6 +2323,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2324 | ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); | 2323 | ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); |
| 2325 | if (ret) { | 2324 | if (ret) { |
| 2326 | btrfs_abort_transaction(trans, root, ret); | 2325 | btrfs_abort_transaction(trans, root, ret); |
| 2326 | btrfs_free_logged_extents(log, log_transid); | ||
| 2327 | mutex_unlock(&root->log_mutex); | 2327 | mutex_unlock(&root->log_mutex); |
| 2328 | goto out; | 2328 | goto out; |
| 2329 | } | 2329 | } |
| @@ -2363,6 +2363,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2363 | } | 2363 | } |
| 2364 | root->fs_info->last_trans_log_full_commit = trans->transid; | 2364 | root->fs_info->last_trans_log_full_commit = trans->transid; |
| 2365 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2365 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
| 2366 | btrfs_free_logged_extents(log, log_transid); | ||
| 2366 | mutex_unlock(&log_root_tree->log_mutex); | 2367 | mutex_unlock(&log_root_tree->log_mutex); |
| 2367 | ret = -EAGAIN; | 2368 | ret = -EAGAIN; |
| 2368 | goto out; | 2369 | goto out; |
| @@ -2373,6 +2374,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2373 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2374 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
| 2374 | wait_log_commit(trans, log_root_tree, | 2375 | wait_log_commit(trans, log_root_tree, |
| 2375 | log_root_tree->log_transid); | 2376 | log_root_tree->log_transid); |
| 2377 | btrfs_free_logged_extents(log, log_transid); | ||
| 2376 | mutex_unlock(&log_root_tree->log_mutex); | 2378 | mutex_unlock(&log_root_tree->log_mutex); |
| 2377 | ret = 0; | 2379 | ret = 0; |
| 2378 | goto out; | 2380 | goto out; |
| @@ -2392,6 +2394,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2392 | */ | 2394 | */ |
| 2393 | if (root->fs_info->last_trans_log_full_commit == trans->transid) { | 2395 | if (root->fs_info->last_trans_log_full_commit == trans->transid) { |
| 2394 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2396 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
| 2397 | btrfs_free_logged_extents(log, log_transid); | ||
| 2395 | mutex_unlock(&log_root_tree->log_mutex); | 2398 | mutex_unlock(&log_root_tree->log_mutex); |
| 2396 | ret = -EAGAIN; | 2399 | ret = -EAGAIN; |
| 2397 | goto out_wake_log_root; | 2400 | goto out_wake_log_root; |
| @@ -2402,10 +2405,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2402 | EXTENT_DIRTY | EXTENT_NEW); | 2405 | EXTENT_DIRTY | EXTENT_NEW); |
| 2403 | if (ret) { | 2406 | if (ret) { |
| 2404 | btrfs_abort_transaction(trans, root, ret); | 2407 | btrfs_abort_transaction(trans, root, ret); |
| 2408 | btrfs_free_logged_extents(log, log_transid); | ||
| 2405 | mutex_unlock(&log_root_tree->log_mutex); | 2409 | mutex_unlock(&log_root_tree->log_mutex); |
| 2406 | goto out_wake_log_root; | 2410 | goto out_wake_log_root; |
| 2407 | } | 2411 | } |
| 2408 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2412 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
| 2413 | btrfs_wait_logged_extents(log, log_transid); | ||
| 2409 | 2414 | ||
| 2410 | btrfs_set_super_log_root(root->fs_info->super_for_commit, | 2415 | btrfs_set_super_log_root(root->fs_info->super_for_commit, |
| 2411 | log_root_tree->node->start); | 2416 | log_root_tree->node->start); |
| @@ -2461,8 +2466,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans, | |||
| 2461 | .process_func = process_one_buffer | 2466 | .process_func = process_one_buffer |
| 2462 | }; | 2467 | }; |
| 2463 | 2468 | ||
| 2464 | ret = walk_log_tree(trans, log, &wc); | 2469 | if (trans) { |
| 2465 | BUG_ON(ret); | 2470 | ret = walk_log_tree(trans, log, &wc); |
| 2471 | BUG_ON(ret); | ||
| 2472 | } | ||
| 2466 | 2473 | ||
| 2467 | while (1) { | 2474 | while (1) { |
| 2468 | ret = find_first_extent_bit(&log->dirty_log_pages, | 2475 | ret = find_first_extent_bit(&log->dirty_log_pages, |
| @@ -2475,6 +2482,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, | |||
| 2475 | EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); | 2482 | EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); |
| 2476 | } | 2483 | } |
| 2477 | 2484 | ||
| 2485 | /* | ||
| 2486 | * We may have short-circuited the log tree with the full commit logic | ||
| 2487 | * and left ordered extents on our list, so clear these out to keep us | ||
| 2488 | * from leaking inodes and memory. | ||
| 2489 | */ | ||
| 2490 | btrfs_free_logged_extents(log, 0); | ||
| 2491 | btrfs_free_logged_extents(log, 1); | ||
| 2492 | |||
| 2478 | free_extent_buffer(log->node); | 2493 | free_extent_buffer(log->node); |
| 2479 | kfree(log); | 2494 | kfree(log); |
| 2480 | } | 2495 | } |
| @@ -2724,7 +2739,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, | |||
| 2724 | path->keep_locks = 1; | 2739 | path->keep_locks = 1; |
| 2725 | 2740 | ||
| 2726 | ret = btrfs_search_forward(root, &min_key, &max_key, | 2741 | ret = btrfs_search_forward(root, &min_key, &max_key, |
| 2727 | path, 0, trans->transid); | 2742 | path, trans->transid); |
| 2728 | 2743 | ||
| 2729 | /* | 2744 | /* |
| 2730 | * we didn't find anything from this transaction, see if there | 2745 | * we didn't find anything from this transaction, see if there |
| @@ -3271,16 +3286,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans, | |||
| 3271 | struct btrfs_root *log = root->log_root; | 3286 | struct btrfs_root *log = root->log_root; |
| 3272 | struct btrfs_file_extent_item *fi; | 3287 | struct btrfs_file_extent_item *fi; |
| 3273 | struct extent_buffer *leaf; | 3288 | struct extent_buffer *leaf; |
| 3289 | struct btrfs_ordered_extent *ordered; | ||
| 3274 | struct list_head ordered_sums; | 3290 | struct list_head ordered_sums; |
| 3275 | struct btrfs_map_token token; | 3291 | struct btrfs_map_token token; |
| 3276 | struct btrfs_key key; | 3292 | struct btrfs_key key; |
| 3277 | u64 csum_offset = em->mod_start - em->start; | 3293 | u64 mod_start = em->mod_start; |
| 3278 | u64 csum_len = em->mod_len; | 3294 | u64 mod_len = em->mod_len; |
| 3295 | u64 csum_offset; | ||
| 3296 | u64 csum_len; | ||
| 3279 | u64 extent_offset = em->start - em->orig_start; | 3297 | u64 extent_offset = em->start - em->orig_start; |
| 3280 | u64 block_len; | 3298 | u64 block_len; |
| 3281 | int ret; | 3299 | int ret; |
| 3300 | int index = log->log_transid % 2; | ||
| 3282 | bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; | 3301 | bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; |
| 3283 | 3302 | ||
| 3303 | insert: | ||
| 3284 | INIT_LIST_HEAD(&ordered_sums); | 3304 | INIT_LIST_HEAD(&ordered_sums); |
| 3285 | btrfs_init_map_token(&token); | 3305 | btrfs_init_map_token(&token); |
| 3286 | key.objectid = btrfs_ino(inode); | 3306 | key.objectid = btrfs_ino(inode); |
| @@ -3296,6 +3316,23 @@ static int log_one_extent(struct btrfs_trans_handle *trans, | |||
| 3296 | leaf = path->nodes[0]; | 3316 | leaf = path->nodes[0]; |
| 3297 | fi = btrfs_item_ptr(leaf, path->slots[0], | 3317 | fi = btrfs_item_ptr(leaf, path->slots[0], |
| 3298 | struct btrfs_file_extent_item); | 3318 | struct btrfs_file_extent_item); |
| 3319 | |||
| 3320 | /* | ||
| 3321 | * If we are overwriting an inline extent with a real one then we need | ||
| 3322 | * to just delete the inline extent as it may not be large enough to | ||
| 3323 | * have the entire file_extent_item. | ||
| 3324 | */ | ||
| 3325 | if (ret && btrfs_token_file_extent_type(leaf, fi, &token) == | ||
| 3326 | BTRFS_FILE_EXTENT_INLINE) { | ||
| 3327 | ret = btrfs_del_item(trans, log, path); | ||
| 3328 | btrfs_release_path(path); | ||
| 3329 | if (ret) { | ||
| 3330 | path->really_keep_locks = 0; | ||
| 3331 | return ret; | ||
| 3332 | } | ||
| 3333 | goto insert; | ||
| 3334 | } | ||
| 3335 | |||
| 3299 | btrfs_set_token_file_extent_generation(leaf, fi, em->generation, | 3336 | btrfs_set_token_file_extent_generation(leaf, fi, em->generation, |
| 3300 | &token); | 3337 | &token); |
| 3301 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { | 3338 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { |
| @@ -3362,6 +3399,92 @@ static int log_one_extent(struct btrfs_trans_handle *trans, | |||
| 3362 | csum_len = block_len; | 3399 | csum_len = block_len; |
| 3363 | } | 3400 | } |
| 3364 | 3401 | ||
| 3402 | /* | ||
| 3403 | * First check and see if our csums are on our outstanding ordered | ||
| 3404 | * extents. | ||
| 3405 | */ | ||
| 3406 | again: | ||
| 3407 | spin_lock_irq(&log->log_extents_lock[index]); | ||
| 3408 | list_for_each_entry(ordered, &log->logged_list[index], log_list) { | ||
| 3409 | struct btrfs_ordered_sum *sum; | ||
| 3410 | |||
| 3411 | if (!mod_len) | ||
| 3412 | break; | ||
| 3413 | |||
| 3414 | if (ordered->inode != inode) | ||
| 3415 | continue; | ||
| 3416 | |||
| 3417 | if (ordered->file_offset + ordered->len <= mod_start || | ||
| 3418 | mod_start + mod_len <= ordered->file_offset) | ||
| 3419 | continue; | ||
| 3420 | |||
| 3421 | /* | ||
| 3422 | * We are going to copy all the csums on this ordered extent, so | ||
| 3423 | * go ahead and adjust mod_start and mod_len in case this | ||
| 3424 | * ordered extent has already been logged. | ||
| 3425 | */ | ||
| 3426 | if (ordered->file_offset > mod_start) { | ||
| 3427 | if (ordered->file_offset + ordered->len >= | ||
| 3428 | mod_start + mod_len) | ||
| 3429 | mod_len = ordered->file_offset - mod_start; | ||
| 3430 | /* | ||
| 3431 | * If we have this case | ||
| 3432 | * | ||
| 3433 | * |--------- logged extent ---------| | ||
| 3434 | * |----- ordered extent ----| | ||
| 3435 | * | ||
| 3436 | * Just don't mess with mod_start and mod_len, we'll | ||
| 3437 | * just end up logging more csums than we need and it | ||
| 3438 | * will be ok. | ||
| 3439 | */ | ||
| 3440 | } else { | ||
| 3441 | if (ordered->file_offset + ordered->len < | ||
| 3442 | mod_start + mod_len) { | ||
| 3443 | mod_len = (mod_start + mod_len) - | ||
| 3444 | (ordered->file_offset + ordered->len); | ||
| 3445 | mod_start = ordered->file_offset + | ||
| 3446 | ordered->len; | ||
| 3447 | } else { | ||
| 3448 | mod_len = 0; | ||
| 3449 | } | ||
| 3450 | } | ||
| 3451 | |||
| 3452 | /* | ||
| 3453 | * This keeps us from looping over the above case of an ordered | ||
| 3454 | * extent that falls entirely inside the logged extent. | ||
| 3455 | */ | ||
| 3456 | if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, | ||
| 3457 | &ordered->flags)) | ||
| 3458 | continue; | ||
| 3459 | atomic_inc(&ordered->refs); | ||
| 3460 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
| 3461 | /* | ||
| 3462 | * we've dropped the lock, we must either break or | ||
| 3463 | * start over after this. | ||
| 3464 | */ | ||
| 3465 | |||
| 3466 | wait_event(ordered->wait, ordered->csum_bytes_left == 0); | ||
| 3467 | |||
| 3468 | list_for_each_entry(sum, &ordered->list, list) { | ||
| 3469 | ret = btrfs_csum_file_blocks(trans, log, sum); | ||
| 3470 | if (ret) { | ||
| 3471 | btrfs_put_ordered_extent(ordered); | ||
| 3472 | goto unlocked; | ||
| 3473 | } | ||
| 3474 | } | ||
| 3475 | btrfs_put_ordered_extent(ordered); | ||
| 3476 | goto again; | ||
| 3477 | |||
| 3478 | } | ||
| 3479 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
| 3480 | unlocked: | ||
| 3481 | |||
| 3482 | if (!mod_len || ret) | ||
| 3483 | return ret; | ||
| 3484 | |||
| 3485 | csum_offset = mod_start - em->start; | ||
| 3486 | csum_len = mod_len; | ||
| 3487 | |||
| 3365 | /* block start is already adjusted for the file extent offset. */ | 3488 | /* block start is already adjusted for the file extent offset. */ |
| 3366 | ret = btrfs_lookup_csums_range(log->fs_info->csum_root, | 3489 | ret = btrfs_lookup_csums_range(log->fs_info->csum_root, |
| 3367 | em->block_start + csum_offset, | 3490 | em->block_start + csum_offset, |
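The mod_start/mod_len adjustments above amount to interval subtraction: each ordered extent whose csums get copied is carved out of the not-yet-logged range, but only head and tail overlaps shrink it; an ordered extent strictly inside the range is deliberately left alone, at the cost of logging a few redundant csums. A plain-C restatement of that trimming, runnable in userspace:

	#include <stdint.h>
	#include <stdio.h>

	/* Carve [o_start, o_start + o_len) out of [*mod_start, +*mod_len). */
	static void trim(uint64_t *mod_start, uint64_t *mod_len,
			 uint64_t o_start, uint64_t o_len)
	{
		if (o_start + o_len <= *mod_start ||
		    *mod_start + *mod_len <= o_start)
			return;					/* no overlap */
		if (o_start > *mod_start) {
			if (o_start + o_len >= *mod_start + *mod_len)
				*mod_len = o_start - *mod_start; /* tail cut */
			/* else: ordered extent fully inside, leave as-is */
		} else if (o_start + o_len < *mod_start + *mod_len) {
			*mod_len = (*mod_start + *mod_len) -
				   (o_start + o_len);		/* head cut */
			*mod_start = o_start + o_len;
		} else {
			*mod_len = 0;				/* fully covered */
		}
	}

	int main(void)
	{
		uint64_t s = 0, l = 100;

		trim(&s, &l, 80, 40);	/* tail overlap: range becomes [0, 80) */
		printf("[%llu, +%llu)\n",
		       (unsigned long long)s, (unsigned long long)l);
		return 0;
	}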
| @@ -3393,6 +3516,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, | |||
| 3393 | struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; | 3516 | struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; |
| 3394 | u64 test_gen; | 3517 | u64 test_gen; |
| 3395 | int ret = 0; | 3518 | int ret = 0; |
| 3519 | int num = 0; | ||
| 3396 | 3520 | ||
| 3397 | INIT_LIST_HEAD(&extents); | 3521 | INIT_LIST_HEAD(&extents); |
| 3398 | 3522 | ||
| @@ -3401,16 +3525,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, | |||
| 3401 | 3525 | ||
| 3402 | list_for_each_entry_safe(em, n, &tree->modified_extents, list) { | 3526 | list_for_each_entry_safe(em, n, &tree->modified_extents, list) { |
| 3403 | list_del_init(&em->list); | 3527 | list_del_init(&em->list); |
| 3528 | |||
| 3529 | /* | ||
| 3530 | * Just an arbitrary cap: logging gets really CPU intensive | ||
| 3531 | * once we start getting a lot of extents, and once we have that | ||
| 3532 | * many we are better off just committing since it will be | ||
| 3533 | * faster. | ||
| 3534 | */ | ||
| 3535 | if (++num > 32768) { | ||
| 3536 | list_del_init(&tree->modified_extents); | ||
| 3537 | ret = -EFBIG; | ||
| 3538 | goto process; | ||
| 3539 | } | ||
| 3540 | |||
| 3404 | if (em->generation <= test_gen) | 3541 | if (em->generation <= test_gen) |
| 3405 | continue; | 3542 | continue; |
| 3406 | /* Need a ref to keep it from getting evicted from cache */ | 3543 | /* Need a ref to keep it from getting evicted from cache */ |
| 3407 | atomic_inc(&em->refs); | 3544 | atomic_inc(&em->refs); |
| 3408 | set_bit(EXTENT_FLAG_LOGGING, &em->flags); | 3545 | set_bit(EXTENT_FLAG_LOGGING, &em->flags); |
| 3409 | list_add_tail(&em->list, &extents); | 3546 | list_add_tail(&em->list, &extents); |
| 3547 | num++; | ||
| 3410 | } | 3548 | } |
| 3411 | 3549 | ||
| 3412 | list_sort(NULL, &extents, extent_cmp); | 3550 | list_sort(NULL, &extents, extent_cmp); |
| 3413 | 3551 | ||
| 3552 | process: | ||
| 3414 | while (!list_empty(&extents)) { | 3553 | while (!list_empty(&extents)) { |
| 3415 | em = list_entry(extents.next, struct extent_map, list); | 3554 | em = list_entry(extents.next, struct extent_map, list); |
| 3416 | 3555 | ||
| @@ -3513,6 +3652,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
| 3513 | 3652 | ||
| 3514 | mutex_lock(&BTRFS_I(inode)->log_mutex); | 3653 | mutex_lock(&BTRFS_I(inode)->log_mutex); |
| 3515 | 3654 | ||
| 3655 | btrfs_get_logged_extents(log, inode); | ||
| 3656 | |||
| 3516 | /* | 3657 | /* |
| 3517 | * a brute force approach to making sure we get the most uptodate | 3658 | * a brute force approach to making sure we get the most uptodate |
| 3518 | * copies of everything. | 3659 | * copies of everything. |
| @@ -3558,7 +3699,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
| 3558 | while (1) { | 3699 | while (1) { |
| 3559 | ins_nr = 0; | 3700 | ins_nr = 0; |
| 3560 | ret = btrfs_search_forward(root, &min_key, &max_key, | 3701 | ret = btrfs_search_forward(root, &min_key, &max_key, |
| 3561 | path, 0, trans->transid); | 3702 | path, trans->transid); |
| 3562 | if (ret != 0) | 3703 | if (ret != 0) |
| 3563 | break; | 3704 | break; |
| 3564 | again: | 3705 | again: |
| @@ -3656,6 +3797,8 @@ log_extents: | |||
| 3656 | BTRFS_I(inode)->logged_trans = trans->transid; | 3797 | BTRFS_I(inode)->logged_trans = trans->transid; |
| 3657 | BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; | 3798 | BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; |
| 3658 | out_unlock: | 3799 | out_unlock: |
| 3800 | if (err) | ||
| 3801 | btrfs_free_logged_extents(log, log->log_transid); | ||
| 3659 | mutex_unlock(&BTRFS_I(inode)->log_mutex); | 3802 | mutex_unlock(&BTRFS_I(inode)->log_mutex); |
| 3660 | 3803 | ||
| 3661 | btrfs_free_path(path); | 3804 | btrfs_free_path(path); |
| @@ -3822,7 +3965,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, | |||
| 3822 | end_trans: | 3965 | end_trans: |
| 3823 | dput(old_parent); | 3966 | dput(old_parent); |
| 3824 | if (ret < 0) { | 3967 | if (ret < 0) { |
| 3825 | WARN_ON(ret != -ENOSPC); | ||
| 3826 | root->fs_info->last_trans_log_full_commit = trans->transid; | 3968 | root->fs_info->last_trans_log_full_commit = trans->transid; |
| 3827 | ret = 1; | 3969 | ret = 1; |
| 3828 | } | 3970 | } |
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 99be4c138db6..ddc61cad0080 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c | |||
| @@ -5,7 +5,7 @@ | |||
| 5 | */ | 5 | */ |
| 6 | 6 | ||
| 7 | #include <linux/slab.h> | 7 | #include <linux/slab.h> |
| 8 | #include <linux/module.h> | 8 | #include <linux/export.h> |
| 9 | #include "ulist.h" | 9 | #include "ulist.h" |
| 10 | 10 | ||
| 11 | /* | 11 | /* |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cbb7f4b1672..35bb2d4ed29f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
| @@ -25,6 +25,8 @@ | |||
| 25 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
| 26 | #include <linux/ratelimit.h> | 26 | #include <linux/ratelimit.h> |
| 27 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
| 28 | #include <linux/raid/pq.h> | ||
| 29 | #include <asm/div64.h> | ||
| 28 | #include "compat.h" | 30 | #include "compat.h" |
| 29 | #include "ctree.h" | 31 | #include "ctree.h" |
| 30 | #include "extent_map.h" | 32 | #include "extent_map.h" |
| @@ -32,6 +34,7 @@ | |||
| 32 | #include "transaction.h" | 34 | #include "transaction.h" |
| 33 | #include "print-tree.h" | 35 | #include "print-tree.h" |
| 34 | #include "volumes.h" | 36 | #include "volumes.h" |
| 37 | #include "raid56.h" | ||
| 35 | #include "async-thread.h" | 38 | #include "async-thread.h" |
| 36 | #include "check-integrity.h" | 39 | #include "check-integrity.h" |
| 37 | #include "rcu-string.h" | 40 | #include "rcu-string.h" |
| @@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) | |||
| 647 | new_device->writeable = 0; | 650 | new_device->writeable = 0; |
| 648 | new_device->in_fs_metadata = 0; | 651 | new_device->in_fs_metadata = 0; |
| 649 | new_device->can_discard = 0; | 652 | new_device->can_discard = 0; |
| 653 | spin_lock_init(&new_device->io_lock); | ||
| 650 | list_replace_rcu(&device->dev_list, &new_device->dev_list); | 654 | list_replace_rcu(&device->dev_list, &new_device->dev_list); |
| 651 | 655 | ||
| 652 | call_rcu(&device->rcu, free_device); | 656 | call_rcu(&device->rcu, free_device); |
| @@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
| 792 | return ret; | 796 | return ret; |
| 793 | } | 797 | } |
| 794 | 798 | ||
| 799 | /* | ||
| 800 | * Look for a btrfs signature on a device. This may be called out of the mount path | ||
| 801 | * and we are not allowed to call set_blocksize during the scan. The superblock | ||
| 802 | * is read via the pagecache. | ||
| 803 | */ | ||
| 795 | int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | 804 | int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, |
| 796 | struct btrfs_fs_devices **fs_devices_ret) | 805 | struct btrfs_fs_devices **fs_devices_ret) |
| 797 | { | 806 | { |
| 798 | struct btrfs_super_block *disk_super; | 807 | struct btrfs_super_block *disk_super; |
| 799 | struct block_device *bdev; | 808 | struct block_device *bdev; |
| 800 | struct buffer_head *bh; | 809 | struct page *page; |
| 801 | int ret; | 810 | void *p; |
| 811 | int ret = -EINVAL; | ||
| 802 | u64 devid; | 812 | u64 devid; |
| 803 | u64 transid; | 813 | u64 transid; |
| 804 | u64 total_devices; | 814 | u64 total_devices; |
| 815 | u64 bytenr; | ||
| 816 | pgoff_t index; | ||
| 805 | 817 | ||
| 818 | /* | ||
| 819 | * we would like to check all the supers, but that would make | ||
| 820 | * a btrfs mount succeed after a mkfs from a different FS. | ||
| 821 | * So, we need to add a special mount option to scan for | ||
| 822 | * later supers, using BTRFS_SUPER_MIRROR_MAX instead | ||
| 823 | */ | ||
| 824 | bytenr = btrfs_sb_offset(0); | ||
| 806 | flags |= FMODE_EXCL; | 825 | flags |= FMODE_EXCL; |
| 807 | mutex_lock(&uuid_mutex); | 826 | mutex_lock(&uuid_mutex); |
| 808 | ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); | 827 | |
| 809 | if (ret) | 828 | bdev = blkdev_get_by_path(path, flags, holder); |
| 829 | |||
| 830 | if (IS_ERR(bdev)) { | ||
| 831 | ret = PTR_ERR(bdev); | ||
| 810 | goto error; | 832 | goto error; |
| 811 | disk_super = (struct btrfs_super_block *)bh->b_data; | 833 | } |
| 834 | |||
| 835 | /* make sure our super fits in the device */ | ||
| 836 | if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) | ||
| 837 | goto error_bdev_put; | ||
| 838 | |||
| 839 | /* make sure our super fits in the page */ | ||
| 840 | if (sizeof(*disk_super) > PAGE_CACHE_SIZE) | ||
| 841 | goto error_bdev_put; | ||
| 842 | |||
| 843 | /* make sure our super doesn't straddle pages on disk */ | ||
| 844 | index = bytenr >> PAGE_CACHE_SHIFT; | ||
| 845 | if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) | ||
| 846 | goto error_bdev_put; | ||
| 847 | |||
| 848 | /* pull in the page with our super */ | ||
| 849 | page = read_cache_page_gfp(bdev->bd_inode->i_mapping, | ||
| 850 | index, GFP_NOFS); | ||
| 851 | |||
| 852 | if (IS_ERR_OR_NULL(page)) | ||
| 853 | goto error_bdev_put; | ||
| 854 | |||
| 855 | p = kmap(page); | ||
| 856 | |||
| 857 | /* align our pointer to the offset of the super block */ | ||
| 858 | disk_super = p + (bytenr & ~PAGE_CACHE_MASK); | ||
| 859 | |||
| 860 | if (btrfs_super_bytenr(disk_super) != bytenr || | ||
| 861 | disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) | ||
| 862 | goto error_unmap; | ||
| 863 | |||
| 812 | devid = btrfs_stack_device_id(&disk_super->dev_item); | 864 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
| 813 | transid = btrfs_super_generation(disk_super); | 865 | transid = btrfs_super_generation(disk_super); |
| 814 | total_devices = btrfs_super_num_devices(disk_super); | 866 | total_devices = btrfs_super_num_devices(disk_super); |
| 867 | |||
| 815 | if (disk_super->label[0]) { | 868 | if (disk_super->label[0]) { |
| 816 | if (disk_super->label[BTRFS_LABEL_SIZE - 1]) | 869 | if (disk_super->label[BTRFS_LABEL_SIZE - 1]) |
| 817 | disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; | 870 | disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; |
| @@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | |||
| 819 | } else { | 872 | } else { |
| 820 | printk(KERN_INFO "device fsid %pU ", disk_super->fsid); | 873 | printk(KERN_INFO "device fsid %pU ", disk_super->fsid); |
| 821 | } | 874 | } |
| 875 | |||
| 822 | printk(KERN_CONT "devid %llu transid %llu %s\n", | 876 | printk(KERN_CONT "devid %llu transid %llu %s\n", |
| 823 | (unsigned long long)devid, (unsigned long long)transid, path); | 877 | (unsigned long long)devid, (unsigned long long)transid, path); |
| 878 | |||
| 824 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); | 879 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); |
| 825 | if (!ret && fs_devices_ret) | 880 | if (!ret && fs_devices_ret) |
| 826 | (*fs_devices_ret)->total_devices = total_devices; | 881 | (*fs_devices_ret)->total_devices = total_devices; |
| 827 | brelse(bh); | 882 | |
| 883 | error_unmap: | ||
| 884 | kunmap(page); | ||
| 885 | page_cache_release(page); | ||
| 886 | |||
| 887 | error_bdev_put: | ||
| 828 | blkdev_put(bdev, flags); | 888 | blkdev_put(bdev, flags); |
| 829 | error: | 889 | error: |
| 830 | mutex_unlock(&uuid_mutex); | 890 | mutex_unlock(&uuid_mutex); |
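The new scan path avoids set_blocksize() by pulling the primary superblock through the page cache. Before reading, it verifies three things: the super fits inside the device, it fits inside one page, and it does not straddle a page boundary; only then is the page fetched and the pointer adjusted to the in-page offset. A minimal userspace sketch of that arithmetic (the 4KiB page size, the 4KiB super size, and the 64KiB primary super offset are assumptions mirroring common kernel constants):

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE  4096ULL		/* assumed, arch-dependent */
#define PAGE_SHIFT 12
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define SUPER_SIZE 4096ULL		/* sizeof(struct btrfs_super_block), assumed */

/* Mirror of the guards in btrfs_scan_one_device(): return the in-page
 * offset of the super, or -1 if reading it via the page cache is unsafe. */
static long long super_page_offset(uint64_t bytenr, uint64_t dev_size,
				   uint64_t *index)
{
	/* super must fit in the device */
	if (bytenr + PAGE_SIZE >= dev_size)
		return -1;
	/* super must fit in one page */
	if (SUPER_SIZE > PAGE_SIZE)
		return -1;
	*index = bytenr >> PAGE_SHIFT;
	/* super must not straddle two pages on disk */
	if ((bytenr + SUPER_SIZE - 1) >> PAGE_SHIFT != *index)
		return -1;
	return bytenr & ~PAGE_MASK;
}

int main(void)
{
	uint64_t index;
	long long off = super_page_offset(64 * 1024, 1ULL << 30, &index);

	printf("page index %llu, offset in page %lld\n",
	       (unsigned long long)index, off);
	return 0;
}
```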
| @@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
| 1372 | u64 devid; | 1432 | u64 devid; |
| 1373 | u64 num_devices; | 1433 | u64 num_devices; |
| 1374 | u8 *dev_uuid; | 1434 | u8 *dev_uuid; |
| 1435 | unsigned seq; | ||
| 1375 | int ret = 0; | 1436 | int ret = 0; |
| 1376 | bool clear_super = false; | 1437 | bool clear_super = false; |
| 1377 | 1438 | ||
| 1378 | mutex_lock(&uuid_mutex); | 1439 | mutex_lock(&uuid_mutex); |
| 1379 | 1440 | ||
| 1380 | all_avail = root->fs_info->avail_data_alloc_bits | | 1441 | do { |
| 1381 | root->fs_info->avail_system_alloc_bits | | 1442 | seq = read_seqbegin(&root->fs_info->profiles_lock); |
| 1382 | root->fs_info->avail_metadata_alloc_bits; | 1443 | |
| 1444 | all_avail = root->fs_info->avail_data_alloc_bits | | ||
| 1445 | root->fs_info->avail_system_alloc_bits | | ||
| 1446 | root->fs_info->avail_metadata_alloc_bits; | ||
| 1447 | } while (read_seqretry(&root->fs_info->profiles_lock, seq)); | ||
| 1383 | 1448 | ||
| 1384 | num_devices = root->fs_info->fs_devices->num_devices; | 1449 | num_devices = root->fs_info->fs_devices->num_devices; |
| 1385 | btrfs_dev_replace_lock(&root->fs_info->dev_replace); | 1450 | btrfs_dev_replace_lock(&root->fs_info->dev_replace); |
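Both this hunk and the later btrfs_balance() change replace an unlocked multi-word read of the avail_*_alloc_bits fields with a seqlock reader loop: snapshot the words, then retry if a writer raced in between. A self-contained sketch of the reader side (toy stand-ins for the kernel API; real seqlocks also issue memory barriers, omitted here):

```c
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the kernel seqlock reader API, illustration only. */
struct seqlock { volatile unsigned sequence; };

static unsigned read_seqbegin(struct seqlock *sl)
{
	unsigned seq;

	/* spin while a writer holds the lock (odd sequence) */
	do {
		seq = sl->sequence;
	} while (seq & 1);
	return seq;
}

static int read_seqretry(struct seqlock *sl, unsigned seq)
{
	/* retry if a writer started or completed since read_seqbegin() */
	return sl->sequence != seq;
}

int main(void)
{
	struct seqlock profiles_lock = { 0 };
	uint64_t avail_data = 0x1, avail_sys = 0x2, avail_meta = 0x4;
	uint64_t all_avail;
	unsigned seq;

	do {
		seq = read_seqbegin(&profiles_lock);
		all_avail = avail_data | avail_sys | avail_meta;
	} while (read_seqretry(&profiles_lock, seq));

	printf("all_avail = 0x%llx\n", (unsigned long long)all_avail);
	return 0;
}
```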
| @@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
| 1403 | goto out; | 1468 | goto out; |
| 1404 | } | 1469 | } |
| 1405 | 1470 | ||
| 1471 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && | ||
| 1472 | root->fs_info->fs_devices->rw_devices <= 2) { | ||
| 1473 | printk(KERN_ERR "btrfs: unable to go below two " | ||
| 1474 | "devices on raid5\n"); | ||
| 1475 | ret = -EINVAL; | ||
| 1476 | goto out; | ||
| 1477 | } | ||
| 1478 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && | ||
| 1479 | root->fs_info->fs_devices->rw_devices <= 3) { | ||
| 1480 | printk(KERN_ERR "btrfs: unable to go below three " | ||
| 1481 | "devices on raid6\n"); | ||
| 1482 | ret = -EINVAL; | ||
| 1483 | goto out; | ||
| 1484 | } | ||
| 1485 | |||
| 1406 | if (strcmp(device_path, "missing") == 0) { | 1486 | if (strcmp(device_path, "missing") == 0) { |
| 1407 | struct list_head *devices; | 1487 | struct list_head *devices; |
| 1408 | struct btrfs_device *tmp; | 1488 | struct btrfs_device *tmp; |
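The new RAID5/6 checks extend the existing device-removal floors: whichever profile is in use dictates how many rw devices must remain. A sketch of the resulting floor table (the RAID1/RAID10 floors come from pre-existing checks above this hunk, not shown; block-group bit values are illustrative, not the kernel's):

```c
#include <stdint.h>
#include <stdio.h>

#define BG_RAID1  (1ULL << 0)	/* illustrative bit values */
#define BG_RAID10 (1ULL << 1)
#define BG_RAID5  (1ULL << 2)
#define BG_RAID6  (1ULL << 3)

/* Sketch of the removal floor enforced in btrfs_rm_device(): the
 * minimum number of rw devices that must remain in use. */
static unsigned min_rw_devices(uint64_t all_avail)
{
	if (all_avail & BG_RAID10)
		return 4;
	if (all_avail & BG_RAID6)
		return 3;
	if (all_avail & (BG_RAID1 | BG_RAID5))
		return 2;
	return 1;
}

int main(void)
{
	printf("raid6 floor: %u devices\n", min_rw_devices(BG_RAID6));
	printf("raid5 floor: %u devices\n", min_rw_devices(BG_RAID5));
	return 0;
}
```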
| @@ -2616,7 +2696,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, | |||
| 2616 | chunk_used = btrfs_block_group_used(&cache->item); | 2696 | chunk_used = btrfs_block_group_used(&cache->item); |
| 2617 | 2697 | ||
| 2618 | if (bargs->usage == 0) | 2698 | if (bargs->usage == 0) |
| 2619 | user_thresh = 0; | 2699 | user_thresh = 1; |
| 2620 | else if (bargs->usage > 100) | 2700 | else if (bargs->usage > 100) |
| 2621 | user_thresh = cache->key.offset; | 2701 | user_thresh = cache->key.offset; |
| 2622 | else | 2702 | else |
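The one-byte threshold fixes a subtle dead case: with usage=0 the filter is meant to select completely empty chunks, but the old user_thresh = 0 made the chunk_used < user_thresh test unsatisfiable, so nothing ever matched. A sketch of the matching logic, written here to return nonzero when a chunk passes the filter (the percentage scaling for 1..99 is an assumption based on the surrounding code):

```c
#include <stdint.h>
#include <stdio.h>

/* Sketch of the usage filter's threshold: nonzero means the chunk is
 * considered under-used and should be relocated by the balance. */
static int usage_matches(uint64_t chunk_used, uint64_t chunk_size,
			 uint32_t usage /* percent, 0..100 */)
{
	uint64_t user_thresh;

	if (usage == 0)
		user_thresh = 1;		/* only fully empty chunks */
	else if (usage > 100)
		user_thresh = chunk_size;	/* match everything */
	else
		user_thresh = chunk_size * usage / 100;	/* assumed scaling */

	return chunk_used < user_thresh;
}

int main(void)
{
	/* empty 1GiB chunk, usage=0: matches now; with the old
	 * user_thresh = 0 it could never match */
	printf("%d\n", usage_matches(0, 1ULL << 30, 0));	/* 1 */
	printf("%d\n", usage_matches(4096, 1ULL << 30, 0));	/* 0 */
	return 0;
}
```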
| @@ -2664,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, | |||
| 2664 | return 0; | 2744 | return 0; |
| 2665 | 2745 | ||
| 2666 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | | 2746 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | |
| 2667 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) | 2747 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { |
| 2668 | factor = 2; | 2748 | factor = num_stripes / 2; |
| 2669 | else | 2749 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { |
| 2670 | factor = 1; | 2750 | factor = num_stripes - 1; |
| 2671 | factor = num_stripes / factor; | 2751 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { |
| 2752 | factor = num_stripes - 2; | ||
| 2753 | } else { | ||
| 2754 | factor = num_stripes; | ||
| 2755 | } | ||
| 2672 | 2756 | ||
| 2673 | for (i = 0; i < num_stripes; i++) { | 2757 | for (i = 0; i < num_stripes; i++) { |
| 2674 | stripe = btrfs_stripe_nr(chunk, i); | 2758 | stripe = btrfs_stripe_nr(chunk, i); |
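chunk_drange_filter() now needs to know how many of a chunk's stripes carry distinct data, since that is what divides the chunk length into a per-device stripe length: mirrored profiles halve num_stripes, RAID5/6 subtract their one or two parity stripes, and everything else uses all stripes. A standalone sketch of the factor:

```c
#include <stdint.h>
#include <stdio.h>

#define BG_DUP    (1ULL << 0)	/* illustrative bit values */
#define BG_RAID1  (1ULL << 1)
#define BG_RAID10 (1ULL << 2)
#define BG_RAID5  (1ULL << 3)
#define BG_RAID6  (1ULL << 4)

static int stripe_factor(uint64_t type, int num_stripes)
{
	if (type & (BG_DUP | BG_RAID1 | BG_RAID10))
		return num_stripes / 2;		/* two copies of the data */
	if (type & BG_RAID5)
		return num_stripes - 1;		/* one parity stripe */
	if (type & BG_RAID6)
		return num_stripes - 2;		/* P and Q stripes */
	return num_stripes;			/* raid0/single */
}

int main(void)
{
	/* chunk length / factor gives the per-device stripe length */
	printf("raid6 over 6 devs: %d data stripes\n",
	       stripe_factor(BG_RAID6, 6));	/* 4 */
	return 0;
}
```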
| @@ -2985,6 +3069,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
| 2985 | int mixed = 0; | 3069 | int mixed = 0; |
| 2986 | int ret; | 3070 | int ret; |
| 2987 | u64 num_devices; | 3071 | u64 num_devices; |
| 3072 | unsigned seq; | ||
| 2988 | 3073 | ||
| 2989 | if (btrfs_fs_closing(fs_info) || | 3074 | if (btrfs_fs_closing(fs_info) || |
| 2990 | atomic_read(&fs_info->balance_pause_req) || | 3075 | atomic_read(&fs_info->balance_pause_req) || |
| @@ -3027,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
| 3027 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); | 3112 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); |
| 3028 | else | 3113 | else |
| 3029 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | | 3114 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | |
| 3030 | BTRFS_BLOCK_GROUP_RAID10); | 3115 | BTRFS_BLOCK_GROUP_RAID10 | |
| 3116 | BTRFS_BLOCK_GROUP_RAID5 | | ||
| 3117 | BTRFS_BLOCK_GROUP_RAID6); | ||
| 3031 | 3118 | ||
| 3032 | if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3119 | if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
| 3033 | (!alloc_profile_is_valid(bctl->data.target, 1) || | 3120 | (!alloc_profile_is_valid(bctl->data.target, 1) || |
| @@ -3067,23 +3154,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
| 3067 | 3154 | ||
| 3068 | /* allow to reduce meta or sys integrity only if force set */ | 3155 | /* allow to reduce meta or sys integrity only if force set */ |
| 3069 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | | 3156 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | |
| 3070 | BTRFS_BLOCK_GROUP_RAID10; | 3157 | BTRFS_BLOCK_GROUP_RAID10 | |
| 3071 | if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3158 | BTRFS_BLOCK_GROUP_RAID5 | |
| 3072 | (fs_info->avail_system_alloc_bits & allowed) && | 3159 | BTRFS_BLOCK_GROUP_RAID6; |
| 3073 | !(bctl->sys.target & allowed)) || | 3160 | do { |
| 3074 | ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3161 | seq = read_seqbegin(&fs_info->profiles_lock); |
| 3075 | (fs_info->avail_metadata_alloc_bits & allowed) && | 3162 | |
| 3076 | !(bctl->meta.target & allowed))) { | 3163 | if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
| 3077 | if (bctl->flags & BTRFS_BALANCE_FORCE) { | 3164 | (fs_info->avail_system_alloc_bits & allowed) && |
| 3078 | printk(KERN_INFO "btrfs: force reducing metadata " | 3165 | !(bctl->sys.target & allowed)) || |
| 3079 | "integrity\n"); | 3166 | ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
| 3080 | } else { | 3167 | (fs_info->avail_metadata_alloc_bits & allowed) && |
| 3081 | printk(KERN_ERR "btrfs: balance will reduce metadata " | 3168 | !(bctl->meta.target & allowed))) { |
| 3082 | "integrity, use force if you want this\n"); | 3169 | if (bctl->flags & BTRFS_BALANCE_FORCE) { |
| 3083 | ret = -EINVAL; | 3170 | printk(KERN_INFO "btrfs: force reducing metadata " |
| 3084 | goto out; | 3171 | "integrity\n"); |
| 3172 | } else { | ||
| 3173 | printk(KERN_ERR "btrfs: balance will reduce metadata " | ||
| 3174 | "integrity, use force if you want this\n"); | ||
| 3175 | ret = -EINVAL; | ||
| 3176 | goto out; | ||
| 3177 | } | ||
| 3085 | } | 3178 | } |
| 3086 | } | 3179 | } while (read_seqretry(&fs_info->profiles_lock, seq)); |
| 3087 | 3180 | ||
| 3088 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { | 3181 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { |
| 3089 | int num_tolerated_disk_barrier_failures; | 3182 | int num_tolerated_disk_barrier_failures; |
| @@ -3127,21 +3220,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
| 3127 | mutex_lock(&fs_info->balance_mutex); | 3220 | mutex_lock(&fs_info->balance_mutex); |
| 3128 | atomic_dec(&fs_info->balance_running); | 3221 | atomic_dec(&fs_info->balance_running); |
| 3129 | 3222 | ||
| 3130 | if (bargs) { | ||
| 3131 | memset(bargs, 0, sizeof(*bargs)); | ||
| 3132 | update_ioctl_balance_args(fs_info, 0, bargs); | ||
| 3133 | } | ||
| 3134 | |||
| 3135 | if ((ret && ret != -ECANCELED && ret != -ENOSPC) || | ||
| 3136 | balance_need_close(fs_info)) { | ||
| 3137 | __cancel_balance(fs_info); | ||
| 3138 | } | ||
| 3139 | |||
| 3140 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { | 3223 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { |
| 3141 | fs_info->num_tolerated_disk_barrier_failures = | 3224 | fs_info->num_tolerated_disk_barrier_failures = |
| 3142 | btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); | 3225 | btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); |
| 3143 | } | 3226 | } |
| 3144 | 3227 | ||
| 3228 | if (bargs) { | ||
| 3229 | memset(bargs, 0, sizeof(*bargs)); | ||
| 3230 | update_ioctl_balance_args(fs_info, 0, bargs); | ||
| 3231 | } | ||
| 3232 | |||
| 3145 | wake_up(&fs_info->balance_wait_q); | 3233 | wake_up(&fs_info->balance_wait_q); |
| 3146 | 3234 | ||
| 3147 | return ret; | 3235 | return ret; |
| @@ -3504,13 +3592,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b) | |||
| 3504 | } | 3592 | } |
| 3505 | 3593 | ||
| 3506 | struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { | 3594 | struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { |
| 3507 | { 2, 1, 0, 4, 2, 2 /* raid10 */ }, | 3595 | [BTRFS_RAID_RAID10] = { |
| 3508 | { 1, 1, 2, 2, 2, 2 /* raid1 */ }, | 3596 | .sub_stripes = 2, |
| 3509 | { 1, 2, 1, 1, 1, 2 /* dup */ }, | 3597 | .dev_stripes = 1, |
| 3510 | { 1, 1, 0, 2, 1, 1 /* raid0 */ }, | 3598 | .devs_max = 0, /* 0 == as many as possible */ |
| 3511 | { 1, 1, 1, 1, 1, 1 /* single */ }, | 3599 | .devs_min = 4, |
| 3600 | .devs_increment = 2, | ||
| 3601 | .ncopies = 2, | ||
| 3602 | }, | ||
| 3603 | [BTRFS_RAID_RAID1] = { | ||
| 3604 | .sub_stripes = 1, | ||
| 3605 | .dev_stripes = 1, | ||
| 3606 | .devs_max = 2, | ||
| 3607 | .devs_min = 2, | ||
| 3608 | .devs_increment = 2, | ||
| 3609 | .ncopies = 2, | ||
| 3610 | }, | ||
| 3611 | [BTRFS_RAID_DUP] = { | ||
| 3612 | .sub_stripes = 1, | ||
| 3613 | .dev_stripes = 2, | ||
| 3614 | .devs_max = 1, | ||
| 3615 | .devs_min = 1, | ||
| 3616 | .devs_increment = 1, | ||
| 3617 | .ncopies = 2, | ||
| 3618 | }, | ||
| 3619 | [BTRFS_RAID_RAID0] = { | ||
| 3620 | .sub_stripes = 1, | ||
| 3621 | .dev_stripes = 1, | ||
| 3622 | .devs_max = 0, | ||
| 3623 | .devs_min = 2, | ||
| 3624 | .devs_increment = 1, | ||
| 3625 | .ncopies = 1, | ||
| 3626 | }, | ||
| 3627 | [BTRFS_RAID_SINGLE] = { | ||
| 3628 | .sub_stripes = 1, | ||
| 3629 | .dev_stripes = 1, | ||
| 3630 | .devs_max = 1, | ||
| 3631 | .devs_min = 1, | ||
| 3632 | .devs_increment = 1, | ||
| 3633 | .ncopies = 1, | ||
| 3634 | }, | ||
| 3635 | [BTRFS_RAID_RAID5] = { | ||
| 3636 | .sub_stripes = 1, | ||
| 3637 | .dev_stripes = 1, | ||
| 3638 | .devs_max = 0, | ||
| 3639 | .devs_min = 2, | ||
| 3640 | .devs_increment = 1, | ||
| 3641 | .ncopies = 2, | ||
| 3642 | }, | ||
| 3643 | [BTRFS_RAID_RAID6] = { | ||
| 3644 | .sub_stripes = 1, | ||
| 3645 | .dev_stripes = 1, | ||
| 3646 | .devs_max = 0, | ||
| 3647 | .devs_min = 3, | ||
| 3648 | .devs_increment = 1, | ||
| 3649 | .ncopies = 3, | ||
| 3650 | }, | ||
| 3512 | }; | 3651 | }; |
| 3513 | 3652 | ||
| 3653 | static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) | ||
| 3654 | { | ||
| 3655 | /* TODO allow them to set a preferred stripe size */ | ||
| 3656 | return 64 * 1024; | ||
| 3657 | } | ||
| 3658 | |||
| 3659 | static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) | ||
| 3660 | { | ||
| 3661 | u64 features; | ||
| 3662 | |||
| 3663 | if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) | ||
| 3664 | return; | ||
| 3665 | |||
| 3666 | features = btrfs_super_incompat_flags(info->super_copy); | ||
| 3667 | if (features & BTRFS_FEATURE_INCOMPAT_RAID56) | ||
| 3668 | return; | ||
| 3669 | |||
| 3670 | features |= BTRFS_FEATURE_INCOMPAT_RAID56; | ||
| 3671 | btrfs_set_super_incompat_flags(info->super_copy, features); | ||
| 3672 | printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); | ||
| 3673 | } | ||
| 3674 | |||
| 3514 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | 3675 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, |
| 3515 | struct btrfs_root *extent_root, | 3676 | struct btrfs_root *extent_root, |
| 3516 | struct map_lookup **map_ret, | 3677 | struct map_lookup **map_ret, |
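Replacing the positional array with designated initializers also documents the allocator's contract: clamp the usable device count between devs_min and devs_max (0 meaning unbounded), round down to a multiple of devs_increment, and place dev_stripes stripes per device. A sketch of consuming one row (the exact clamping order in __btrfs_alloc_chunk is an assumption):

```c
#include <stdio.h>

struct raid_attr {
	int sub_stripes;
	int dev_stripes;	/* stripes placed on each device */
	int devs_max;		/* 0 == as many as possible */
	int devs_min;
	int devs_increment;
	int ncopies;		/* copies of each data block */
};

static const struct raid_attr raid6_attr = {
	.sub_stripes = 1, .dev_stripes = 1, .devs_max = 0,
	.devs_min = 3, .devs_increment = 1, .ncopies = 3,
};

/* Return the stripe count for a new chunk, or -1 if too few devices. */
static int stripes_for(const struct raid_attr *a, int avail_devs)
{
	int ndevs = avail_devs;

	if (a->devs_max && ndevs > a->devs_max)
		ndevs = a->devs_max;
	ndevs -= ndevs % a->devs_increment;	/* e.g. keep pairs for raid10 */
	if (ndevs < a->devs_min)
		return -1;
	return ndevs * a->dev_stripes;
}

int main(void)
{
	printf("raid6 on 5 devices: %d stripes\n",
	       stripes_for(&raid6_attr, 5));
	return 0;
}
```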
| @@ -3526,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3526 | struct btrfs_device_info *devices_info = NULL; | 3687 | struct btrfs_device_info *devices_info = NULL; |
| 3527 | u64 total_avail; | 3688 | u64 total_avail; |
| 3528 | int num_stripes; /* total number of stripes to allocate */ | 3689 | int num_stripes; /* total number of stripes to allocate */ |
| 3690 | int data_stripes; /* number of stripes that count for | ||
| 3691 | block group size */ | ||
| 3529 | int sub_stripes; /* sub_stripes info for map */ | 3692 | int sub_stripes; /* sub_stripes info for map */ |
| 3530 | int dev_stripes; /* stripes per dev */ | 3693 | int dev_stripes; /* stripes per dev */ |
| 3531 | int devs_max; /* max devs to use */ | 3694 | int devs_max; /* max devs to use */ |
| @@ -3537,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3537 | u64 max_chunk_size; | 3700 | u64 max_chunk_size; |
| 3538 | u64 stripe_size; | 3701 | u64 stripe_size; |
| 3539 | u64 num_bytes; | 3702 | u64 num_bytes; |
| 3703 | u64 raid_stripe_len = BTRFS_STRIPE_LEN; | ||
| 3540 | int ndevs; | 3704 | int ndevs; |
| 3541 | int i; | 3705 | int i; |
| 3542 | int j; | 3706 | int j; |
| @@ -3631,12 +3795,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3631 | if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) | 3795 | if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) |
| 3632 | continue; | 3796 | continue; |
| 3633 | 3797 | ||
| 3798 | if (ndevs == fs_devices->rw_devices) { | ||
| 3799 | WARN(1, "%s: found more than %llu devices\n", | ||
| 3800 | __func__, fs_devices->rw_devices); | ||
| 3801 | break; | ||
| 3802 | } | ||
| 3634 | devices_info[ndevs].dev_offset = dev_offset; | 3803 | devices_info[ndevs].dev_offset = dev_offset; |
| 3635 | devices_info[ndevs].max_avail = max_avail; | 3804 | devices_info[ndevs].max_avail = max_avail; |
| 3636 | devices_info[ndevs].total_avail = total_avail; | 3805 | devices_info[ndevs].total_avail = total_avail; |
| 3637 | devices_info[ndevs].dev = device; | 3806 | devices_info[ndevs].dev = device; |
| 3638 | ++ndevs; | 3807 | ++ndevs; |
| 3639 | WARN_ON(ndevs > fs_devices->rw_devices); | ||
| 3640 | } | 3808 | } |
| 3641 | 3809 | ||
| 3642 | /* | 3810 | /* |
| @@ -3662,16 +3830,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3662 | stripe_size = devices_info[ndevs-1].max_avail; | 3830 | stripe_size = devices_info[ndevs-1].max_avail; |
| 3663 | num_stripes = ndevs * dev_stripes; | 3831 | num_stripes = ndevs * dev_stripes; |
| 3664 | 3832 | ||
| 3665 | if (stripe_size * ndevs > max_chunk_size * ncopies) { | 3833 | /* |
| 3666 | stripe_size = max_chunk_size * ncopies; | 3834 | * this will have to be fixed for RAID1 and RAID10 over |
| 3667 | do_div(stripe_size, ndevs); | 3835 | * more drives |
| 3836 | */ | ||
| 3837 | data_stripes = num_stripes / ncopies; | ||
| 3838 | |||
| 3839 | if (type & BTRFS_BLOCK_GROUP_RAID5) { | ||
| 3840 | raid_stripe_len = find_raid56_stripe_len(ndevs - 1, | ||
| 3841 | btrfs_super_stripesize(info->super_copy)); | ||
| 3842 | data_stripes = num_stripes - 1; | ||
| 3843 | } | ||
| 3844 | if (type & BTRFS_BLOCK_GROUP_RAID6) { | ||
| 3845 | raid_stripe_len = find_raid56_stripe_len(ndevs - 2, | ||
| 3846 | btrfs_super_stripesize(info->super_copy)); | ||
| 3847 | data_stripes = num_stripes - 2; | ||
| 3848 | } | ||
| 3849 | |||
| 3850 | /* | ||
| 3851 | * Use the number of data stripes to figure out how big this chunk | ||
| 3852 | * is really going to be in terms of logical address space, | ||
| 3853 | * and compare that answer with the max chunk size | ||
| 3854 | */ | ||
| 3855 | if (stripe_size * data_stripes > max_chunk_size) { | ||
| 3856 | u64 mask = (1ULL << 24) - 1; | ||
| 3857 | stripe_size = max_chunk_size; | ||
| 3858 | do_div(stripe_size, data_stripes); | ||
| 3859 | |||
| 3860 | /* bump the answer up to a 16MB boundary */ | ||
| 3861 | stripe_size = (stripe_size + mask) & ~mask; | ||
| 3862 | |||
| 3863 | /* but don't go higher than the limits we found | ||
| 3864 | * while searching for free extents | ||
| 3865 | */ | ||
| 3866 | if (stripe_size > devices_info[ndevs-1].max_avail) | ||
| 3867 | stripe_size = devices_info[ndevs-1].max_avail; | ||
| 3668 | } | 3868 | } |
| 3669 | 3869 | ||
| 3670 | do_div(stripe_size, dev_stripes); | 3870 | do_div(stripe_size, dev_stripes); |
| 3671 | 3871 | ||
| 3672 | /* align to BTRFS_STRIPE_LEN */ | 3872 | /* align to BTRFS_STRIPE_LEN */ |
| 3673 | do_div(stripe_size, BTRFS_STRIPE_LEN); | 3873 | do_div(stripe_size, raid_stripe_len); |
| 3674 | stripe_size *= BTRFS_STRIPE_LEN; | 3874 | stripe_size *= raid_stripe_len; |
| 3675 | 3875 | ||
| 3676 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); | 3876 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); |
| 3677 | if (!map) { | 3877 | if (!map) { |
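The sizing now reasons in logical bytes: stripe_size * data_stripes is the chunk's logical size, and when that exceeds the cap the per-device stripe is recomputed from the cap, rounded up to a 16MiB boundary, clamped to the smallest participating device's free space, then aligned down to the RAID stripe length. A worked sketch of the arithmetic (dev_stripes is taken as 1 here, so the intermediate division by dev_stripes drops out):

```c
#include <stdint.h>
#include <stdio.h>

static uint64_t clamp_stripe_size(uint64_t stripe_size, uint64_t max_avail,
				  uint64_t max_chunk_size, int data_stripes,
				  uint64_t raid_stripe_len)
{
	const uint64_t mask = (1ULL << 24) - 1;	/* 16MiB - 1 */

	if (stripe_size * data_stripes > max_chunk_size) {
		stripe_size = max_chunk_size / data_stripes;
		/* bump up to a 16MiB boundary ... */
		stripe_size = (stripe_size + mask) & ~mask;
		/* ... but never past the free space we actually found */
		if (stripe_size > max_avail)
			stripe_size = max_avail;
	}
	/* align to the RAID5/6 stripe length (64KiB by default) */
	stripe_size -= stripe_size % raid_stripe_len;
	return stripe_size;
}

int main(void)
{
	/* 6-device raid6 (4 data stripes), 10GiB cap, 100GiB free/dev */
	uint64_t sz = clamp_stripe_size(100ULL << 30, 100ULL << 30,
					10ULL << 30, 4, 64 * 1024);
	printf("stripe_size = %llu MiB\n",
	       (unsigned long long)(sz >> 20));	/* 2560 MiB */
	return 0;
}
```

The block group's logical size then follows as num_bytes = stripe_size * data_stripes, exactly as the hunk computes it.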
| @@ -3689,14 +3889,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3689 | } | 3889 | } |
| 3690 | } | 3890 | } |
| 3691 | map->sector_size = extent_root->sectorsize; | 3891 | map->sector_size = extent_root->sectorsize; |
| 3692 | map->stripe_len = BTRFS_STRIPE_LEN; | 3892 | map->stripe_len = raid_stripe_len; |
| 3693 | map->io_align = BTRFS_STRIPE_LEN; | 3893 | map->io_align = raid_stripe_len; |
| 3694 | map->io_width = BTRFS_STRIPE_LEN; | 3894 | map->io_width = raid_stripe_len; |
| 3695 | map->type = type; | 3895 | map->type = type; |
| 3696 | map->sub_stripes = sub_stripes; | 3896 | map->sub_stripes = sub_stripes; |
| 3697 | 3897 | ||
| 3698 | *map_ret = map; | 3898 | *map_ret = map; |
| 3699 | num_bytes = stripe_size * (num_stripes / ncopies); | 3899 | num_bytes = stripe_size * data_stripes; |
| 3700 | 3900 | ||
| 3701 | *stripe_size_out = stripe_size; | 3901 | *stripe_size_out = stripe_size; |
| 3702 | *num_bytes_out = num_bytes; | 3902 | *num_bytes_out = num_bytes; |
| @@ -3718,15 +3918,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3718 | write_lock(&em_tree->lock); | 3918 | write_lock(&em_tree->lock); |
| 3719 | ret = add_extent_mapping(em_tree, em); | 3919 | ret = add_extent_mapping(em_tree, em); |
| 3720 | write_unlock(&em_tree->lock); | 3920 | write_unlock(&em_tree->lock); |
| 3721 | free_extent_map(em); | 3921 | if (ret) { |
| 3722 | if (ret) | 3922 | free_extent_map(em); |
| 3723 | goto error; | ||
| 3724 | |||
| 3725 | ret = btrfs_make_block_group(trans, extent_root, 0, type, | ||
| 3726 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, | ||
| 3727 | start, num_bytes); | ||
| 3728 | if (ret) | ||
| 3729 | goto error; | 3923 | goto error; |
| 3924 | } | ||
| 3730 | 3925 | ||
| 3731 | for (i = 0; i < map->num_stripes; ++i) { | 3926 | for (i = 0; i < map->num_stripes; ++i) { |
| 3732 | struct btrfs_device *device; | 3927 | struct btrfs_device *device; |
| @@ -3739,15 +3934,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3739 | info->chunk_root->root_key.objectid, | 3934 | info->chunk_root->root_key.objectid, |
| 3740 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, | 3935 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, |
| 3741 | start, dev_offset, stripe_size); | 3936 | start, dev_offset, stripe_size); |
| 3742 | if (ret) { | 3937 | if (ret) |
| 3743 | btrfs_abort_transaction(trans, extent_root, ret); | 3938 | goto error_dev_extent; |
| 3744 | goto error; | 3939 | } |
| 3745 | } | 3940 | |
| 3941 | ret = btrfs_make_block_group(trans, extent_root, 0, type, | ||
| 3942 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, | ||
| 3943 | start, num_bytes); | ||
| 3944 | if (ret) { | ||
| 3945 | i = map->num_stripes - 1; | ||
| 3946 | goto error_dev_extent; | ||
| 3746 | } | 3947 | } |
| 3747 | 3948 | ||
| 3949 | free_extent_map(em); | ||
| 3950 | check_raid56_incompat_flag(extent_root->fs_info, type); | ||
| 3951 | |||
| 3748 | kfree(devices_info); | 3952 | kfree(devices_info); |
| 3749 | return 0; | 3953 | return 0; |
| 3750 | 3954 | ||
| 3955 | error_dev_extent: | ||
| 3956 | for (; i >= 0; i--) { | ||
| 3957 | struct btrfs_device *device; | ||
| 3958 | int err; | ||
| 3959 | |||
| 3960 | device = map->stripes[i].dev; | ||
| 3961 | err = btrfs_free_dev_extent(trans, device, start); | ||
| 3962 | if (err) { | ||
| 3963 | btrfs_abort_transaction(trans, extent_root, err); | ||
| 3964 | break; | ||
| 3965 | } | ||
| 3966 | } | ||
| 3967 | write_lock(&em_tree->lock); | ||
| 3968 | remove_extent_mapping(em_tree, em); | ||
| 3969 | write_unlock(&em_tree->lock); | ||
| 3970 | |||
| 3971 | /* One for our allocation */ | ||
| 3972 | free_extent_map(em); | ||
| 3973 | /* One for the tree reference */ | ||
| 3974 | free_extent_map(em); | ||
| 3751 | error: | 3975 | error: |
| 3752 | kfree(map); | 3976 | kfree(map); |
| 3753 | kfree(devices_info); | 3977 | kfree(devices_info); |
| @@ -3887,10 +4111,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, | |||
| 3887 | if (ret) | 4111 | if (ret) |
| 3888 | return ret; | 4112 | return ret; |
| 3889 | 4113 | ||
| 3890 | alloc_profile = BTRFS_BLOCK_GROUP_METADATA | | 4114 | alloc_profile = btrfs_get_alloc_profile(extent_root, 0); |
| 3891 | fs_info->avail_metadata_alloc_bits; | ||
| 3892 | alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); | ||
| 3893 | |||
| 3894 | ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, | 4115 | ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, |
| 3895 | &stripe_size, chunk_offset, alloc_profile); | 4116 | &stripe_size, chunk_offset, alloc_profile); |
| 3896 | if (ret) | 4117 | if (ret) |
| @@ -3898,10 +4119,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, | |||
| 3898 | 4119 | ||
| 3899 | sys_chunk_offset = chunk_offset + chunk_size; | 4120 | sys_chunk_offset = chunk_offset + chunk_size; |
| 3900 | 4121 | ||
| 3901 | alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | | 4122 | alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); |
| 3902 | fs_info->avail_system_alloc_bits; | ||
| 3903 | alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); | ||
| 3904 | |||
| 3905 | ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, | 4123 | ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, |
| 3906 | &sys_chunk_size, &sys_stripe_size, | 4124 | &sys_chunk_size, &sys_stripe_size, |
| 3907 | sys_chunk_offset, alloc_profile); | 4125 | sys_chunk_offset, alloc_profile); |
| @@ -4014,6 +4232,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) | |||
| 4014 | ret = map->num_stripes; | 4232 | ret = map->num_stripes; |
| 4015 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) | 4233 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
| 4016 | ret = map->sub_stripes; | 4234 | ret = map->sub_stripes; |
| 4235 | else if (map->type & BTRFS_BLOCK_GROUP_RAID5) | ||
| 4236 | ret = 2; | ||
| 4237 | else if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
| 4238 | ret = 3; | ||
| 4017 | else | 4239 | else |
| 4018 | ret = 1; | 4240 | ret = 1; |
| 4019 | free_extent_map(em); | 4241 | free_extent_map(em); |
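For btrfs_num_copies(), RAID5 and RAID6 report 2 and 3 respectively: a block can be read directly, rebuilt from P, and (for RAID6) rebuilt from Q, even though no literal second copy exists on disk. A sketch of the mapping (profile bit values illustrative):

```c
#include <stdint.h>
#include <stdio.h>

#define BG_RAID1  (1ULL << 0)	/* illustrative bit values */
#define BG_DUP    (1ULL << 1)
#define BG_RAID10 (1ULL << 2)
#define BG_RAID5  (1ULL << 3)
#define BG_RAID6  (1ULL << 4)

static int num_copies(uint64_t type, int num_stripes, int sub_stripes)
{
	if (type & (BG_DUP | BG_RAID1))
		return num_stripes;	/* every stripe is a full copy */
	if (type & BG_RAID10)
		return sub_stripes;
	if (type & BG_RAID5)
		return 2;	/* data, or rebuild from parity */
	if (type & BG_RAID6)
		return 3;	/* data, P rebuild, or Q rebuild */
	return 1;
}

int main(void)
{
	printf("raid6 copies: %d\n", num_copies(BG_RAID6, 6, 1));
	return 0;
}
```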
| @@ -4026,6 +4248,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) | |||
| 4026 | return ret; | 4248 | return ret; |
| 4027 | } | 4249 | } |
| 4028 | 4250 | ||
| 4251 | unsigned long btrfs_full_stripe_len(struct btrfs_root *root, | ||
| 4252 | struct btrfs_mapping_tree *map_tree, | ||
| 4253 | u64 logical) | ||
| 4254 | { | ||
| 4255 | struct extent_map *em; | ||
| 4256 | struct map_lookup *map; | ||
| 4257 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
| 4258 | unsigned long len = root->sectorsize; | ||
| 4259 | |||
| 4260 | read_lock(&em_tree->lock); | ||
| 4261 | em = lookup_extent_mapping(em_tree, logical, len); | ||
| 4262 | read_unlock(&em_tree->lock); | ||
| 4263 | BUG_ON(!em); | ||
| 4264 | |||
| 4265 | BUG_ON(em->start > logical || em->start + em->len < logical); | ||
| 4266 | map = (struct map_lookup *)em->bdev; | ||
| 4267 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
| 4268 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
| 4269 | len = map->stripe_len * nr_data_stripes(map); | ||
| 4270 | } | ||
| 4271 | free_extent_map(em); | ||
| 4272 | return len; | ||
| 4273 | } | ||
| 4274 | |||
| 4275 | int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, | ||
| 4276 | u64 logical, u64 len, int mirror_num) | ||
| 4277 | { | ||
| 4278 | struct extent_map *em; | ||
| 4279 | struct map_lookup *map; | ||
| 4280 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
| 4281 | int ret = 0; | ||
| 4282 | |||
| 4283 | read_lock(&em_tree->lock); | ||
| 4284 | em = lookup_extent_mapping(em_tree, logical, len); | ||
| 4285 | read_unlock(&em_tree->lock); | ||
| 4286 | BUG_ON(!em); | ||
| 4287 | |||
| 4288 | BUG_ON(em->start > logical || em->start + em->len < logical); | ||
| 4289 | map = (struct map_lookup *)em->bdev; | ||
| 4290 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
| 4291 | BTRFS_BLOCK_GROUP_RAID6)) | ||
| 4292 | ret = 1; | ||
| 4293 | free_extent_map(em); | ||
| 4294 | return ret; | ||
| 4295 | } | ||
| 4296 | |||
| 4029 | static int find_live_mirror(struct btrfs_fs_info *fs_info, | 4297 | static int find_live_mirror(struct btrfs_fs_info *fs_info, |
| 4030 | struct map_lookup *map, int first, int num, | 4298 | struct map_lookup *map, int first, int num, |
| 4031 | int optimal, int dev_replace_is_ongoing) | 4299 | int optimal, int dev_replace_is_ongoing) |
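Both new helpers lean on nr_data_stripes(), which lives in the new raid56 code rather than this hunk; presumably it subtracts the parity stripes (one for RAID5, two for RAID6) from num_stripes, so a full stripe covers stripe_len * nr_data_stripes bytes of logical address space. A sketch under that assumption:

```c
#include <stdint.h>
#include <stdio.h>

#define BG_RAID5 (1ULL << 0)	/* illustrative bit values */
#define BG_RAID6 (1ULL << 1)

struct map_lookup { uint64_t type; int num_stripes; uint64_t stripe_len; };

static int nr_parity_stripes(const struct map_lookup *map)
{
	if (map->type & BG_RAID5)
		return 1;
	if (map->type & BG_RAID6)
		return 2;
	return 0;
}

static int nr_data_stripes(const struct map_lookup *map)
{
	return map->num_stripes - nr_parity_stripes(map);
}

int main(void)
{
	struct map_lookup map = { BG_RAID6, 6, 64 * 1024 };

	printf("full stripe: %llu KiB\n", (unsigned long long)
	       (map.stripe_len * nr_data_stripes(&map) / 1024)); /* 256 */
	return 0;
}
```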
| @@ -4063,10 +4331,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, | |||
| 4063 | return optimal; | 4331 | return optimal; |
| 4064 | } | 4332 | } |
| 4065 | 4333 | ||
| 4334 | static inline int parity_smaller(u64 a, u64 b) | ||
| 4335 | { | ||
| 4336 | return a > b; | ||
| 4337 | } | ||
| 4338 | |||
| 4339 | /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ | ||
| 4340 | static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) | ||
| 4341 | { | ||
| 4342 | struct btrfs_bio_stripe s; | ||
| 4343 | int i; | ||
| 4344 | u64 l; | ||
| 4345 | int again = 1; | ||
| 4346 | |||
| 4347 | while (again) { | ||
| 4348 | again = 0; | ||
| 4349 | for (i = 0; i < bbio->num_stripes - 1; i++) { | ||
| 4350 | if (parity_smaller(raid_map[i], raid_map[i+1])) { | ||
| 4351 | s = bbio->stripes[i]; | ||
| 4352 | l = raid_map[i]; | ||
| 4353 | bbio->stripes[i] = bbio->stripes[i+1]; | ||
| 4354 | raid_map[i] = raid_map[i+1]; | ||
| 4355 | bbio->stripes[i+1] = s; | ||
| 4356 | raid_map[i+1] = l; | ||
| 4357 | again = 1; | ||
| 4358 | } | ||
| 4359 | } | ||
| 4360 | } | ||
| 4361 | } | ||
| 4362 | |||
| 4066 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | 4363 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, |
| 4067 | u64 logical, u64 *length, | 4364 | u64 logical, u64 *length, |
| 4068 | struct btrfs_bio **bbio_ret, | 4365 | struct btrfs_bio **bbio_ret, |
| 4069 | int mirror_num) | 4366 | int mirror_num, u64 **raid_map_ret) |
| 4070 | { | 4367 | { |
| 4071 | struct extent_map *em; | 4368 | struct extent_map *em; |
| 4072 | struct map_lookup *map; | 4369 | struct map_lookup *map; |
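The bubble sort works because of the parity sentinels in raid_map: RAID5_P_STRIPE and RAID6_Q_STRIPE are assumed here to be (u64)-2 and (u64)-1, the two largest 64-bit values, so sorting the map ascending necessarily pushes P and then Q behind every real logical address. A standalone sketch:

```c
#include <stdint.h>
#include <stdio.h>

#define RAID5_P_STRIPE ((uint64_t)-2)	/* assumed sentinel values */
#define RAID6_Q_STRIPE ((uint64_t)-1)

/* Bubble-sort stripe ids by raid_map, mirroring sort_parity_stripes(). */
static void sort_stripes(int *stripe, uint64_t *raid_map, int n)
{
	int again = 1;

	while (again) {
		again = 0;
		for (int i = 0; i < n - 1; i++) {
			if (raid_map[i] > raid_map[i + 1]) {
				int s = stripe[i];
				uint64_t l = raid_map[i];

				stripe[i] = stripe[i + 1];
				raid_map[i] = raid_map[i + 1];
				stripe[i + 1] = s;
				raid_map[i + 1] = l;
				again = 1;
			}
		}
	}
}

int main(void)
{
	/* rotated 4-disk raid6 stripe-set laid out as [d1, P, Q, d0] */
	uint64_t raid_map[4] = { 65536, RAID5_P_STRIPE, RAID6_Q_STRIPE, 0 };
	int stripe[4] = { 0, 1, 2, 3 };

	sort_stripes(stripe, raid_map, 4);
	for (int i = 0; i < 4; i++)
		printf("slot %d -> disk %d\n", i, stripe[i]);
	return 0;
}
```

With the parity slots always last, the raid56 code can treat stripes[0..nr_data-1] as data and the tail as P and Q.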
| @@ -4078,6 +4375,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4078 | u64 stripe_nr; | 4375 | u64 stripe_nr; |
| 4079 | u64 stripe_nr_orig; | 4376 | u64 stripe_nr_orig; |
| 4080 | u64 stripe_nr_end; | 4377 | u64 stripe_nr_end; |
| 4378 | u64 stripe_len; | ||
| 4379 | u64 *raid_map = NULL; | ||
| 4081 | int stripe_index; | 4380 | int stripe_index; |
| 4082 | int i; | 4381 | int i; |
| 4083 | int ret = 0; | 4382 | int ret = 0; |
| @@ -4089,6 +4388,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4089 | int num_alloc_stripes; | 4388 | int num_alloc_stripes; |
| 4090 | int patch_the_first_stripe_for_dev_replace = 0; | 4389 | int patch_the_first_stripe_for_dev_replace = 0; |
| 4091 | u64 physical_to_patch_in_first_stripe = 0; | 4390 | u64 physical_to_patch_in_first_stripe = 0; |
| 4391 | u64 raid56_full_stripe_start = (u64)-1; | ||
| 4092 | 4392 | ||
| 4093 | read_lock(&em_tree->lock); | 4393 | read_lock(&em_tree->lock); |
| 4094 | em = lookup_extent_mapping(em_tree, logical, *length); | 4394 | em = lookup_extent_mapping(em_tree, logical, *length); |
| @@ -4105,29 +4405,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4105 | map = (struct map_lookup *)em->bdev; | 4405 | map = (struct map_lookup *)em->bdev; |
| 4106 | offset = logical - em->start; | 4406 | offset = logical - em->start; |
| 4107 | 4407 | ||
| 4408 | if (mirror_num > map->num_stripes) | ||
| 4409 | mirror_num = 0; | ||
| 4410 | |||
| 4411 | stripe_len = map->stripe_len; | ||
| 4108 | stripe_nr = offset; | 4412 | stripe_nr = offset; |
| 4109 | /* | 4413 | /* |
| 4110 | * stripe_nr counts the total number of stripes we have to stride | 4414 | * stripe_nr counts the total number of stripes we have to stride |
| 4111 | * to get to this block | 4415 | * to get to this block |
| 4112 | */ | 4416 | */ |
| 4113 | do_div(stripe_nr, map->stripe_len); | 4417 | do_div(stripe_nr, stripe_len); |
| 4114 | 4418 | ||
| 4115 | stripe_offset = stripe_nr * map->stripe_len; | 4419 | stripe_offset = stripe_nr * stripe_len; |
| 4116 | BUG_ON(offset < stripe_offset); | 4420 | BUG_ON(offset < stripe_offset); |
| 4117 | 4421 | ||
| 4118 | /* stripe_offset is the offset of this block in its stripe*/ | 4422 | /* stripe_offset is the offset of this block in its stripe*/ |
| 4119 | stripe_offset = offset - stripe_offset; | 4423 | stripe_offset = offset - stripe_offset; |
| 4120 | 4424 | ||
| 4121 | if (rw & REQ_DISCARD) | 4425 | /* if we're here for raid56, we need to know the stripe aligned start */ |
| 4426 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
| 4427 | unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); | ||
| 4428 | raid56_full_stripe_start = offset; | ||
| 4429 | |||
| 4430 | /* allow a write of a full stripe, but make sure we don't | ||
| 4431 | * allow straddling of stripes | ||
| 4432 | */ | ||
| 4433 | do_div(raid56_full_stripe_start, full_stripe_len); | ||
| 4434 | raid56_full_stripe_start *= full_stripe_len; | ||
| 4435 | } | ||
| 4436 | |||
| 4437 | if (rw & REQ_DISCARD) { | ||
| 4438 | /* we don't discard raid56 yet */ | ||
| 4439 | if (map->type & | ||
| 4440 | (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
| 4441 | ret = -EOPNOTSUPP; | ||
| 4442 | goto out; | ||
| 4443 | } | ||
| 4122 | *length = min_t(u64, em->len - offset, *length); | 4444 | *length = min_t(u64, em->len - offset, *length); |
| 4123 | else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { | 4445 | } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { |
| 4124 | /* we limit the length of each bio to what fits in a stripe */ | 4446 | u64 max_len; |
| 4125 | *length = min_t(u64, em->len - offset, | 4447 | /* For writes to RAID[56], allow a full stripe set across all disks. |
| 4126 | map->stripe_len - stripe_offset); | 4448 | For other RAID types and for RAID[56] reads, just allow a single |
| 4449 | stripe (on a single disk). */ | ||
| 4450 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && | ||
| 4451 | (rw & REQ_WRITE)) { | ||
| 4452 | max_len = stripe_len * nr_data_stripes(map) - | ||
| 4453 | (offset - raid56_full_stripe_start); | ||
| 4454 | } else { | ||
| 4455 | /* we limit the length of each bio to what fits in a stripe */ | ||
| 4456 | max_len = stripe_len - stripe_offset; | ||
| 4457 | } | ||
| 4458 | *length = min_t(u64, em->len - offset, max_len); | ||
| 4127 | } else { | 4459 | } else { |
| 4128 | *length = em->len - offset; | 4460 | *length = em->len - offset; |
| 4129 | } | 4461 | } |
| 4130 | 4462 | ||
| 4463 | /* This is for when we're called from btrfs_merge_bio_hook() and all | ||
| 4464 | it cares about is the length */ | ||
| 4131 | if (!bbio_ret) | 4465 | if (!bbio_ret) |
| 4132 | goto out; | 4466 | goto out; |
| 4133 | 4467 | ||
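For RAID5/6 the mapping first snaps the chunk-relative offset down to the start of its full stripe (the set of data stripes sharing one parity strip), and a write may then extend from the original offset to the end of that full stripe. A worked example of the arithmetic:

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stripe_len = 64 * 1024;
	int nr_data = 4;			/* raid6 over 6 devices */
	uint64_t full_stripe_len = stripe_len * nr_data;	/* 256KiB */
	uint64_t offset = 700 * 1024;		/* offset within the chunk */

	/* round down to the full-stripe boundary, as the do_div pair does */
	uint64_t full_stripe_start = offset / full_stripe_len;
	full_stripe_start *= full_stripe_len;

	/* max length of one write bio: to the end of this full stripe */
	uint64_t max_len = full_stripe_len - (offset - full_stripe_start);

	printf("full stripe starts at %llu KiB, write may span %llu KiB\n",
	       (unsigned long long)(full_stripe_start / 1024),
	       (unsigned long long)(max_len / 1024));	/* 512, 68 */
	return 0;
}
```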
| @@ -4160,7 +4494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4160 | u64 physical_of_found = 0; | 4494 | u64 physical_of_found = 0; |
| 4161 | 4495 | ||
| 4162 | ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, | 4496 | ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, |
| 4163 | logical, &tmp_length, &tmp_bbio, 0); | 4497 | logical, &tmp_length, &tmp_bbio, 0, NULL); |
| 4164 | if (ret) { | 4498 | if (ret) { |
| 4165 | WARN_ON(tmp_bbio != NULL); | 4499 | WARN_ON(tmp_bbio != NULL); |
| 4166 | goto out; | 4500 | goto out; |
| @@ -4221,11 +4555,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4221 | num_stripes = 1; | 4555 | num_stripes = 1; |
| 4222 | stripe_index = 0; | 4556 | stripe_index = 0; |
| 4223 | stripe_nr_orig = stripe_nr; | 4557 | stripe_nr_orig = stripe_nr; |
| 4224 | stripe_nr_end = (offset + *length + map->stripe_len - 1) & | 4558 | stripe_nr_end = ALIGN(offset + *length, map->stripe_len); |
| 4225 | (~(map->stripe_len - 1)); | ||
| 4226 | do_div(stripe_nr_end, map->stripe_len); | 4559 | do_div(stripe_nr_end, map->stripe_len); |
| 4227 | stripe_end_offset = stripe_nr_end * map->stripe_len - | 4560 | stripe_end_offset = stripe_nr_end * map->stripe_len - |
| 4228 | (offset + *length); | 4561 | (offset + *length); |
| 4562 | |||
| 4229 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 4563 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
| 4230 | if (rw & REQ_DISCARD) | 4564 | if (rw & REQ_DISCARD) |
| 4231 | num_stripes = min_t(u64, map->num_stripes, | 4565 | num_stripes = min_t(u64, map->num_stripes, |
| @@ -4276,6 +4610,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4276 | dev_replace_is_ongoing); | 4610 | dev_replace_is_ongoing); |
| 4277 | mirror_num = stripe_index - old_stripe_index + 1; | 4611 | mirror_num = stripe_index - old_stripe_index + 1; |
| 4278 | } | 4612 | } |
| 4613 | |||
| 4614 | } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
| 4615 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
| 4616 | u64 tmp; | ||
| 4617 | |||
| 4618 | if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) | ||
| 4619 | && raid_map_ret) { | ||
| 4620 | int i, rot; | ||
| 4621 | |||
| 4622 | /* push stripe_nr back to the start of the full stripe */ | ||
| 4623 | stripe_nr = raid56_full_stripe_start; | ||
| 4624 | do_div(stripe_nr, stripe_len); | ||
| 4625 | |||
| 4626 | stripe_index = do_div(stripe_nr, nr_data_stripes(map)); | ||
| 4627 | |||
| 4628 | /* RAID[56] write or recovery. Return all stripes */ | ||
| 4629 | num_stripes = map->num_stripes; | ||
| 4630 | max_errors = nr_parity_stripes(map); | ||
| 4631 | |||
| 4632 | raid_map = kmalloc(sizeof(u64) * num_stripes, | ||
| 4633 | GFP_NOFS); | ||
| 4634 | if (!raid_map) { | ||
| 4635 | ret = -ENOMEM; | ||
| 4636 | goto out; | ||
| 4637 | } | ||
| 4638 | |||
| 4639 | /* Work out the disk rotation on this stripe-set */ | ||
| 4640 | tmp = stripe_nr; | ||
| 4641 | rot = do_div(tmp, num_stripes); | ||
| 4642 | |||
| 4643 | /* Fill in the logical address of each stripe */ | ||
| 4644 | tmp = stripe_nr * nr_data_stripes(map); | ||
| 4645 | for (i = 0; i < nr_data_stripes(map); i++) | ||
| 4646 | raid_map[(i+rot) % num_stripes] = | ||
| 4647 | em->start + (tmp + i) * map->stripe_len; | ||
| 4648 | |||
| 4649 | raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; | ||
| 4650 | if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
| 4651 | raid_map[(i+rot+1) % num_stripes] = | ||
| 4652 | RAID6_Q_STRIPE; | ||
| 4653 | |||
| 4654 | *length = map->stripe_len; | ||
| 4655 | stripe_index = 0; | ||
| 4656 | stripe_offset = 0; | ||
| 4657 | } else { | ||
| 4658 | /* | ||
| 4659 | * Mirror #0 or #1 means the original data block. | ||
| 4660 | * Mirror #2 is RAID5 parity block. | ||
| 4661 | * Mirror #3 is RAID6 Q block. | ||
| 4662 | */ | ||
| 4663 | stripe_index = do_div(stripe_nr, nr_data_stripes(map)); | ||
| 4664 | if (mirror_num > 1) | ||
| 4665 | stripe_index = nr_data_stripes(map) + | ||
| 4666 | mirror_num - 2; | ||
| 4667 | |||
| 4668 | /* We distribute the parity blocks across stripes */ | ||
| 4669 | tmp = stripe_nr + stripe_index; | ||
| 4670 | stripe_index = do_div(tmp, map->num_stripes); | ||
| 4671 | } | ||
| 4279 | } else { | 4672 | } else { |
| 4280 | /* | 4673 | /* |
| 4281 | * after this do_div call, stripe_nr is the number of stripes | 4674 | * after this do_div call, stripe_nr is the number of stripes |
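The write/recovery branch returns the whole stripe-set along with raid_map, which records the logical address served by each physical slot; parity positions get sentinel values, and rot = stripe_nr % num_stripes rotates parity across devices from one stripe-set to the next. A standalone sketch that prints the resulting layout (sentinel values assumed as above):

```c
#include <stdint.h>
#include <stdio.h>

#define RAID5_P_STRIPE ((uint64_t)-2)	/* assumed sentinel values */
#define RAID6_Q_STRIPE ((uint64_t)-1)

int main(void)
{
	const int num_stripes = 4, nr_data = 2;		/* 4-disk raid6 */
	const uint64_t stripe_len = 64 * 1024;
	const uint64_t em_start = 0;	/* chunk's logical start */

	for (uint64_t stripe_nr = 0; stripe_nr < 4; stripe_nr++) {
		uint64_t raid_map[4];
		int i, rot = stripe_nr % num_stripes;	/* disk rotation */
		uint64_t tmp = stripe_nr * nr_data;

		/* fill in the logical address of each data stripe */
		for (i = 0; i < nr_data; i++)
			raid_map[(i + rot) % num_stripes] =
				em_start + (tmp + i) * stripe_len;
		/* parity slots follow the data, wrapping around */
		raid_map[(i + rot) % num_stripes] = RAID5_P_STRIPE;
		raid_map[(i + rot + 1) % num_stripes] = RAID6_Q_STRIPE;

		printf("stripe-set %llu:", (unsigned long long)stripe_nr);
		for (i = 0; i < num_stripes; i++) {
			if (raid_map[i] == RAID5_P_STRIPE)
				printf("     P");
			else if (raid_map[i] == RAID6_Q_STRIPE)
				printf("     Q");
			else
				printf(" %4lluK", (unsigned long long)
				       (raid_map[i] / 1024));
		}
		printf("\n");
	}
	return 0;
}
```

Running it shows P and Q stepping one disk to the right on each successive stripe-set, which is what keeps parity I/O from concentrating on a single device.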
| @@ -4384,8 +4777,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4384 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { | 4777 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { |
| 4385 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | | 4778 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | |
| 4386 | BTRFS_BLOCK_GROUP_RAID10 | | 4779 | BTRFS_BLOCK_GROUP_RAID10 | |
| 4780 | BTRFS_BLOCK_GROUP_RAID5 | | ||
| 4387 | BTRFS_BLOCK_GROUP_DUP)) { | 4781 | BTRFS_BLOCK_GROUP_DUP)) { |
| 4388 | max_errors = 1; | 4782 | max_errors = 1; |
| 4783 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { | ||
| 4784 | max_errors = 2; | ||
| 4389 | } | 4785 | } |
| 4390 | } | 4786 | } |
| 4391 | 4787 | ||
| @@ -4486,6 +4882,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4486 | bbio->stripes[0].physical = physical_to_patch_in_first_stripe; | 4882 | bbio->stripes[0].physical = physical_to_patch_in_first_stripe; |
| 4487 | bbio->mirror_num = map->num_stripes + 1; | 4883 | bbio->mirror_num = map->num_stripes + 1; |
| 4488 | } | 4884 | } |
| 4885 | if (raid_map) { | ||
| 4886 | sort_parity_stripes(bbio, raid_map); | ||
| 4887 | *raid_map_ret = raid_map; | ||
| 4888 | } | ||
| 4489 | out: | 4889 | out: |
| 4490 | if (dev_replace_is_ongoing) | 4890 | if (dev_replace_is_ongoing) |
| 4491 | btrfs_dev_replace_unlock(dev_replace); | 4891 | btrfs_dev_replace_unlock(dev_replace); |
| @@ -4498,7 +4898,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4498 | struct btrfs_bio **bbio_ret, int mirror_num) | 4898 | struct btrfs_bio **bbio_ret, int mirror_num) |
| 4499 | { | 4899 | { |
| 4500 | return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, | 4900 | return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, |
| 4501 | mirror_num); | 4901 | mirror_num, NULL); |
| 4502 | } | 4902 | } |
| 4503 | 4903 | ||
| 4504 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | 4904 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, |
| @@ -4512,6 +4912,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
| 4512 | u64 bytenr; | 4912 | u64 bytenr; |
| 4513 | u64 length; | 4913 | u64 length; |
| 4514 | u64 stripe_nr; | 4914 | u64 stripe_nr; |
| 4915 | u64 rmap_len; | ||
| 4515 | int i, j, nr = 0; | 4916 | int i, j, nr = 0; |
| 4516 | 4917 | ||
| 4517 | read_lock(&em_tree->lock); | 4918 | read_lock(&em_tree->lock); |
| @@ -4522,10 +4923,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
| 4522 | map = (struct map_lookup *)em->bdev; | 4923 | map = (struct map_lookup *)em->bdev; |
| 4523 | 4924 | ||
| 4524 | length = em->len; | 4925 | length = em->len; |
| 4926 | rmap_len = map->stripe_len; | ||
| 4927 | |||
| 4525 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) | 4928 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
| 4526 | do_div(length, map->num_stripes / map->sub_stripes); | 4929 | do_div(length, map->num_stripes / map->sub_stripes); |
| 4527 | else if (map->type & BTRFS_BLOCK_GROUP_RAID0) | 4930 | else if (map->type & BTRFS_BLOCK_GROUP_RAID0) |
| 4528 | do_div(length, map->num_stripes); | 4931 | do_div(length, map->num_stripes); |
| 4932 | else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
| 4933 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
| 4934 | do_div(length, nr_data_stripes(map)); | ||
| 4935 | rmap_len = map->stripe_len * nr_data_stripes(map); | ||
| 4936 | } | ||
| 4529 | 4937 | ||
| 4530 | buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); | 4938 | buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); |
| 4531 | BUG_ON(!buf); /* -ENOMEM */ | 4939 | BUG_ON(!buf); /* -ENOMEM */ |
| @@ -4545,8 +4953,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
| 4545 | do_div(stripe_nr, map->sub_stripes); | 4953 | do_div(stripe_nr, map->sub_stripes); |
| 4546 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 4954 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
| 4547 | stripe_nr = stripe_nr * map->num_stripes + i; | 4955 | stripe_nr = stripe_nr * map->num_stripes + i; |
| 4548 | } | 4956 | } /* else if RAID[56], multiply by nr_data_stripes(). |
| 4549 | bytenr = chunk_start + stripe_nr * map->stripe_len; | 4957 | * Alternatively, just use rmap_len below instead of |
| 4958 | * map->stripe_len */ | ||
| 4959 | |||
| 4960 | bytenr = chunk_start + stripe_nr * rmap_len; | ||
| 4550 | WARN_ON(nr >= map->num_stripes); | 4961 | WARN_ON(nr >= map->num_stripes); |
| 4551 | for (j = 0; j < nr; j++) { | 4962 | for (j = 0; j < nr; j++) { |
| 4552 | if (buf[j] == bytenr) | 4963 | if (buf[j] == bytenr) |
| @@ -4560,7 +4971,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
| 4560 | 4971 | ||
| 4561 | *logical = buf; | 4972 | *logical = buf; |
| 4562 | *naddrs = nr; | 4973 | *naddrs = nr; |
| 4563 | *stripe_len = map->stripe_len; | 4974 | *stripe_len = rmap_len; |
| 4564 | 4975 | ||
| 4565 | free_extent_map(em); | 4976 | free_extent_map(em); |
| 4566 | return 0; | 4977 | return 0; |
| @@ -4634,7 +5045,7 @@ static void btrfs_end_bio(struct bio *bio, int err) | |||
| 4634 | bio->bi_bdev = (struct block_device *) | 5045 | bio->bi_bdev = (struct block_device *) |
| 4635 | (unsigned long)bbio->mirror_num; | 5046 | (unsigned long)bbio->mirror_num; |
| 4636 | /* only send an error to the higher layers if it is | 5047 | /* only send an error to the higher layers if it is |
| 4637 | * beyond the tolerance of the multi-bio | 5048 | * beyond the tolerance of the btrfs bio |
| 4638 | */ | 5049 | */ |
| 4639 | if (atomic_read(&bbio->error) > bbio->max_errors) { | 5050 | if (atomic_read(&bbio->error) > bbio->max_errors) { |
| 4640 | err = -EIO; | 5051 | err = -EIO; |
| @@ -4668,13 +5079,18 @@ struct async_sched { | |||
| 4668 | * This will add one bio to the pending list for a device and make sure | 5079 | * This will add one bio to the pending list for a device and make sure |
| 4669 | * the work struct is scheduled. | 5080 | * the work struct is scheduled. |
| 4670 | */ | 5081 | */ |
| 4671 | static noinline void schedule_bio(struct btrfs_root *root, | 5082 | noinline void btrfs_schedule_bio(struct btrfs_root *root, |
| 4672 | struct btrfs_device *device, | 5083 | struct btrfs_device *device, |
| 4673 | int rw, struct bio *bio) | 5084 | int rw, struct bio *bio) |
| 4674 | { | 5085 | { |
| 4675 | int should_queue = 1; | 5086 | int should_queue = 1; |
| 4676 | struct btrfs_pending_bios *pending_bios; | 5087 | struct btrfs_pending_bios *pending_bios; |
| 4677 | 5088 | ||
| 5089 | if (device->missing || !device->bdev) { | ||
| 5090 | bio_endio(bio, -EIO); | ||
| 5091 | return; | ||
| 5092 | } | ||
| 5093 | |||
| 4678 | /* don't bother with additional async steps for reads, right now */ | 5094 | /* don't bother with additional async steps for reads, right now */ |
| 4679 | if (!(rw & REQ_WRITE)) { | 5095 | if (!(rw & REQ_WRITE)) { |
| 4680 | bio_get(bio); | 5096 | bio_get(bio); |
| @@ -4772,7 +5188,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, | |||
| 4772 | #endif | 5188 | #endif |
| 4773 | bio->bi_bdev = dev->bdev; | 5189 | bio->bi_bdev = dev->bdev; |
| 4774 | if (async) | 5190 | if (async) |
| 4775 | schedule_bio(root, dev, rw, bio); | 5191 | btrfs_schedule_bio(root, dev, rw, bio); |
| 4776 | else | 5192 | else |
| 4777 | btrfsic_submit_bio(rw, bio); | 5193 | btrfsic_submit_bio(rw, bio); |
| 4778 | } | 5194 | } |
| @@ -4831,6 +5247,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
| 4831 | u64 logical = (u64)bio->bi_sector << 9; | 5247 | u64 logical = (u64)bio->bi_sector << 9; |
| 4832 | u64 length = 0; | 5248 | u64 length = 0; |
| 4833 | u64 map_length; | 5249 | u64 map_length; |
| 5250 | u64 *raid_map = NULL; | ||
| 4834 | int ret; | 5251 | int ret; |
| 4835 | int dev_nr = 0; | 5252 | int dev_nr = 0; |
| 4836 | int total_devs = 1; | 5253 | int total_devs = 1; |
| @@ -4839,12 +5256,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
| 4839 | length = bio->bi_size; | 5256 | length = bio->bi_size; |
| 4840 | map_length = length; | 5257 | map_length = length; |
| 4841 | 5258 | ||
| 4842 | ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, | 5259 | ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, |
| 4843 | mirror_num); | 5260 | mirror_num, &raid_map); |
| 4844 | if (ret) | 5261 | if (ret) /* -ENOMEM */ |
| 4845 | return ret; | 5262 | return ret; |
| 4846 | 5263 | ||
| 4847 | total_devs = bbio->num_stripes; | 5264 | total_devs = bbio->num_stripes; |
| 5265 | bbio->orig_bio = first_bio; | ||
| 5266 | bbio->private = first_bio->bi_private; | ||
| 5267 | bbio->end_io = first_bio->bi_end_io; | ||
| 5268 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); | ||
| 5269 | |||
| 5270 | if (raid_map) { | ||
| 5271 | /* In this case, map_length has been set to the length of | ||
| 5272 | a single stripe, not the whole write */ | ||
| 5273 | if (rw & WRITE) { | ||
| 5274 | return raid56_parity_write(root, bio, bbio, | ||
| 5275 | raid_map, map_length); | ||
| 5276 | } else { | ||
| 5277 | return raid56_parity_recover(root, bio, bbio, | ||
| 5278 | raid_map, map_length, | ||
| 5279 | mirror_num); | ||
| 5280 | } | ||
| 5281 | } | ||
| 5282 | |||
| 4848 | if (map_length < length) { | 5283 | if (map_length < length) { |
| 4849 | printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " | 5284 | printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " |
| 4850 | "len %llu\n", (unsigned long long)logical, | 5285 | "len %llu\n", (unsigned long long)logical, |
| @@ -4853,11 +5288,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
| 4853 | BUG(); | 5288 | BUG(); |
| 4854 | } | 5289 | } |
| 4855 | 5290 | ||
| 4856 | bbio->orig_bio = first_bio; | ||
| 4857 | bbio->private = first_bio->bi_private; | ||
| 4858 | bbio->end_io = first_bio->bi_end_io; | ||
| 4859 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); | ||
| 4860 | |||
| 4861 | while (dev_nr < total_devs) { | 5291 | while (dev_nr < total_devs) { |
| 4862 | dev = bbio->stripes[dev_nr].dev; | 5292 | dev = bbio->stripes[dev_nr].dev; |
| 4863 | if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { | 5293 | if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { |
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d3c3939ac751..062d8604d35b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h | |||
| @@ -21,8 +21,8 @@ | |||
| 21 | 21 | ||
| 22 | #include <linux/bio.h> | 22 | #include <linux/bio.h> |
| 23 | #include <linux/sort.h> | 23 | #include <linux/sort.h> |
| 24 | #include <linux/btrfs.h> | ||
| 24 | #include "async-thread.h" | 25 | #include "async-thread.h" |
| 25 | #include "ioctl.h" | ||
| 26 | 26 | ||
| 27 | #define BTRFS_STRIPE_LEN (64 * 1024) | 27 | #define BTRFS_STRIPE_LEN (64 * 1024) |
| 28 | 28 | ||
| @@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, | |||
| 321 | void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, | 321 | void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, |
| 322 | struct btrfs_device *tgtdev); | 322 | struct btrfs_device *tgtdev); |
| 323 | int btrfs_scratch_superblock(struct btrfs_device *device); | 323 | int btrfs_scratch_superblock(struct btrfs_device *device); |
| 324 | 324 | void btrfs_schedule_bio(struct btrfs_root *root, | |
| 325 | struct btrfs_device *device, | ||
| 326 | int rw, struct bio *bio); | ||
| 327 | int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, | ||
| 328 | u64 logical, u64 len, int mirror_num); | ||
| 329 | unsigned long btrfs_full_stripe_len(struct btrfs_root *root, | ||
| 330 | struct btrfs_mapping_tree *map_tree, | ||
| 331 | u64 logical); | ||
| 325 | static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, | 332 | static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, |
| 326 | int index) | 333 | int index) |
| 327 | { | 334 | { |
