author	Linus Torvalds <torvalds@linux-foundation.org>	2015-11-06 20:17:13 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-11-06 20:17:13 -0500
commit	27eb427bdc0960ad64b72da03e3596c801e7a9e9 (patch)
tree	4170a265e99d455ca53d26a19e59330e3277fccd
parent	713009809681e5a7871e96e6992692c805b4480b (diff)
parent	2959a32a858a2c44bbbce83d19c158d54cc5998a (diff)
Merge branch 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason:
 "We have a lot of subvolume quota improvements in here, along with big
  piles of cleanups from Dave Sterba and Anand Jain and others.

  Josef pitched in a batch of allocator fixes based on production use
  here at FB. We found that mount -o ssd_spread greatly improved our
  performance on hardware raid5/6, but it exposed some CPU bottlenecks
  in the allocator. These patches make a huge difference"

* 'for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (100 commits)
  Btrfs: fix hole punching when using the no-holes feature
  Btrfs: find_free_extent: Do not erroneously skip LOOP_CACHING_WAIT state
  btrfs: Fix a data space underflow warning
  btrfs: qgroup: Fix a rebase bug which will cause qgroup double free
  btrfs: qgroup: Fix a race in delayed_ref which leads to abort trans
  btrfs: clear PF_NOFREEZE in cleaner_kthread()
  btrfs: qgroup: Don't copy extent buffer to do qgroup rescan
  btrfs: add balance filters limits, stripes and usage to supported mask
  btrfs: extend balance filter usage to take minimum and maximum
  btrfs: add balance filter for stripes
  btrfs: extend balance filter limit to take minimum and maximum
  btrfs: fix use after free iterating extrefs
  btrfs: check unsupported filters in balance arguments
  Btrfs: fix regression running delayed references when using qgroups
  Btrfs: fix regression when running delayed references
  Btrfs: don't do extra bitmap search in one bit case
  Btrfs: keep track of largest extent in bitmaps
  Btrfs: don't keep trying to build clusters if we are fragmented
  Btrfs: cut down on loops through the allocator
  Btrfs: don't continue setting up space cache when enospc
  ...
-rw-r--r--	fs/btrfs/backref.c	6
-rw-r--r--	fs/btrfs/check-integrity.c	8
-rw-r--r--	fs/btrfs/compression.c	97
-rw-r--r--	fs/btrfs/ctree.c	10
-rw-r--r--	fs/btrfs/ctree.h	180
-rw-r--r--	fs/btrfs/delayed-inode.c	4
-rw-r--r--	fs/btrfs/delayed-ref.c	190
-rw-r--r--	fs/btrfs/delayed-ref.h	23
-rw-r--r--	fs/btrfs/dev-replace.c	55
-rw-r--r--	fs/btrfs/disk-io.c	166
-rw-r--r--	fs/btrfs/disk-io.h	2
-rw-r--r--	fs/btrfs/extent-tree.c	572
-rw-r--r--	fs/btrfs/extent_io.c	155
-rw-r--r--	fs/btrfs/extent_io.h	19
-rw-r--r--	fs/btrfs/file.c	228
-rw-r--r--	fs/btrfs/free-space-cache.c	79
-rw-r--r--	fs/btrfs/free-space-cache.h	1
-rw-r--r--	fs/btrfs/inode-item.c	2
-rw-r--r--	fs/btrfs/inode-map.c	6
-rw-r--r--	fs/btrfs/inode.c	194
-rw-r--r--	fs/btrfs/ioctl.c	280
-rw-r--r--	fs/btrfs/locking.c	12
-rw-r--r--	fs/btrfs/ordered-data.c	70
-rw-r--r--	fs/btrfs/ordered-data.h	2
-rw-r--r--	fs/btrfs/props.c	13
-rw-r--r--	fs/btrfs/qgroup.c	229
-rw-r--r--	fs/btrfs/qgroup.h	31
-rw-r--r--	fs/btrfs/raid56.c	6
-rw-r--r--	fs/btrfs/reada.c	8
-rw-r--r--	fs/btrfs/relocation.c	26
-rw-r--r--	fs/btrfs/root-tree.c	11
-rw-r--r--	fs/btrfs/scrub.c	42
-rw-r--r--	fs/btrfs/send.c	212
-rw-r--r--	fs/btrfs/super.c	57
-rw-r--r--	fs/btrfs/sysfs.c	52
-rw-r--r--	fs/btrfs/sysfs.h	4
-rw-r--r--	fs/btrfs/tests/free-space-tests.c	22
-rw-r--r--	fs/btrfs/transaction.c	129
-rw-r--r--	fs/btrfs/transaction.h	20
-rw-r--r--	fs/btrfs/tree-log.c	24
-rw-r--r--	fs/btrfs/volumes.c	427
-rw-r--r--	fs/btrfs/volumes.h	18
-rw-r--r--	include/trace/events/btrfs.h	113
-rw-r--r--	include/uapi/linux/btrfs.h	31
44 files changed, 2768 insertions, 1068 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 9a2ec79e8cfb..6dcdb2ec9211 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -362,6 +362,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 		goto out;
 	}
 
+	if (btrfs_test_is_dummy_root(root)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+		ret = -ENOENT;
+		goto out;
+	}
+
 	if (path->search_commit_root)
 		root_level = btrfs_header_level(root->commit_root);
 	else if (time_seq == (u64)-1)
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 541fbfaed276..0340c57bf377 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -667,7 +667,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 	selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
 	if (NULL == selected_super) {
 		printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
-		return -1;
+		return -ENOMEM;
 	}
 
 	list_for_each_entry(device, dev_head, dev_list) {
@@ -845,8 +845,8 @@ static int btrfsic_process_superblock_dev_mirror(
 	superblock_tmp->never_written = 0;
 	superblock_tmp->mirror_num = 1 + superblock_mirror_num;
 	if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
-		printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
-			      " @%llu (%s/%llu/%d)\n",
+		btrfs_info_in_rcu(device->dev_root->fs_info,
+		    "new initial S-block (bdev %p, %s) @%llu (%s/%llu/%d)",
 			      superblock_bdev,
 			      rcu_str_deref(device->name), dev_bytenr,
 			      dev_state->name, dev_bytenr,
@@ -1660,7 +1660,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 					 sizeof(*block_ctx->pagev)) *
 					num_pages, GFP_NOFS);
 	if (!block_ctx->mem_to_free)
-		return -1;
+		return -ENOMEM;
 	block_ctx->datav = block_ctx->mem_to_free;
 	block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
 	for (i = 0; i < num_pages; i++) {
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 57ee8ca29b06..97b049ad0594 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -745,11 +745,13 @@ out:
 	return ret;
 }
 
-static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
-static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
-static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
-static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
-static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+static struct {
+	struct list_head idle_ws;
+	spinlock_t ws_lock;
+	int num_ws;
+	atomic_t alloc_ws;
+	wait_queue_head_t ws_wait;
+} btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
 
 static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 	&btrfs_zlib_compress,
@@ -761,10 +763,10 @@ void __init btrfs_init_compress(void)
 	int i;
 
 	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
-		INIT_LIST_HEAD(&comp_idle_workspace[i]);
-		spin_lock_init(&comp_workspace_lock[i]);
-		atomic_set(&comp_alloc_workspace[i], 0);
-		init_waitqueue_head(&comp_workspace_wait[i]);
+		INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
+		spin_lock_init(&btrfs_comp_ws[i].ws_lock);
+		atomic_set(&btrfs_comp_ws[i].alloc_ws, 0);
+		init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
 	}
 }
 
@@ -778,38 +780,38 @@ static struct list_head *find_workspace(int type)
 	int cpus = num_online_cpus();
 	int idx = type - 1;
 
-	struct list_head *idle_workspace = &comp_idle_workspace[idx];
-	spinlock_t *workspace_lock = &comp_workspace_lock[idx];
-	atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
-	wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
-	int *num_workspace = &comp_num_workspace[idx];
+	struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
+	spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
+	atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+	wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
+	int *num_ws = &btrfs_comp_ws[idx].num_ws;
 again:
-	spin_lock(workspace_lock);
-	if (!list_empty(idle_workspace)) {
-		workspace = idle_workspace->next;
+	spin_lock(ws_lock);
+	if (!list_empty(idle_ws)) {
+		workspace = idle_ws->next;
 		list_del(workspace);
-		(*num_workspace)--;
-		spin_unlock(workspace_lock);
+		(*num_ws)--;
+		spin_unlock(ws_lock);
 		return workspace;
 
 	}
-	if (atomic_read(alloc_workspace) > cpus) {
+	if (atomic_read(alloc_ws) > cpus) {
 		DEFINE_WAIT(wait);
 
-		spin_unlock(workspace_lock);
-		prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
-		if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+		spin_unlock(ws_lock);
+		prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
+		if (atomic_read(alloc_ws) > cpus && !*num_ws)
 			schedule();
-		finish_wait(workspace_wait, &wait);
+		finish_wait(ws_wait, &wait);
 		goto again;
 	}
-	atomic_inc(alloc_workspace);
-	spin_unlock(workspace_lock);
+	atomic_inc(alloc_ws);
+	spin_unlock(ws_lock);
 
 	workspace = btrfs_compress_op[idx]->alloc_workspace();
 	if (IS_ERR(workspace)) {
-		atomic_dec(alloc_workspace);
-		wake_up(workspace_wait);
+		atomic_dec(alloc_ws);
+		wake_up(ws_wait);
 	}
 	return workspace;
 }
@@ -821,27 +823,30 @@ again:
 static void free_workspace(int type, struct list_head *workspace)
 {
 	int idx = type - 1;
-	struct list_head *idle_workspace = &comp_idle_workspace[idx];
-	spinlock_t *workspace_lock = &comp_workspace_lock[idx];
-	atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
-	wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
-	int *num_workspace = &comp_num_workspace[idx];
+	struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
+	spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
+	atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws;
+	wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
+	int *num_ws = &btrfs_comp_ws[idx].num_ws;
 
-	spin_lock(workspace_lock);
-	if (*num_workspace < num_online_cpus()) {
-		list_add(workspace, idle_workspace);
-		(*num_workspace)++;
-		spin_unlock(workspace_lock);
+	spin_lock(ws_lock);
+	if (*num_ws < num_online_cpus()) {
+		list_add(workspace, idle_ws);
+		(*num_ws)++;
+		spin_unlock(ws_lock);
 		goto wake;
 	}
-	spin_unlock(workspace_lock);
+	spin_unlock(ws_lock);
 
 	btrfs_compress_op[idx]->free_workspace(workspace);
-	atomic_dec(alloc_workspace);
+	atomic_dec(alloc_ws);
 wake:
+	/*
+	 * Make sure counter is updated before we wake up waiters.
+	 */
 	smp_mb();
-	if (waitqueue_active(workspace_wait))
-		wake_up(workspace_wait);
+	if (waitqueue_active(ws_wait))
+		wake_up(ws_wait);
 }
846 851
847/* 852/*
@@ -853,11 +858,11 @@ static void free_workspaces(void)
 	int i;
 
 	for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
-		while (!list_empty(&comp_idle_workspace[i])) {
-			workspace = comp_idle_workspace[i].next;
+		while (!list_empty(&btrfs_comp_ws[i].idle_ws)) {
+			workspace = btrfs_comp_ws[i].idle_ws.next;
 			list_del(workspace);
 			btrfs_compress_op[i]->free_workspace(workspace);
-			atomic_dec(&comp_alloc_workspace[i]);
+			atomic_dec(&btrfs_comp_ws[i].alloc_ws);
 		}
 	}
 }
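
Note on the compression.c hunks above: five parallel per-type arrays collapse into a single array of structs, so all bookkeeping for one compression type is reached through one index instead of five. A minimal stand-alone sketch of the same refactoring pattern, with illustrative names rather than the btrfs ones:

#include <stdatomic.h>
#include <stdio.h>

#define NTYPES 2	/* stand-in for BTRFS_COMPRESS_TYPES */

/* One struct per type keeps related fields together, replacing
 * several parallel arrays that each needed their own indexing. */
static struct {
	int num_ws;		/* idle workspaces currently cached */
	atomic_int alloc_ws;	/* total workspaces allocated */
} ws[NTYPES];

int main(void)
{
	for (int i = 0; i < NTYPES; i++) {
		ws[i].num_ws = 0;
		atomic_store(&ws[i].alloc_ws, 0);
	}
	printf("initialized %d workspace slots\n", NTYPES);
	return 0;
}
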
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5f745eadf77d..5b8e235c4b6d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			return ret;
 		if (refs == 0) {
 			ret = -EROFS;
-			btrfs_std_error(root->fs_info, ret);
+			btrfs_std_error(root->fs_info, ret, NULL);
 			return ret;
 		}
 	} else {
@@ -1927,7 +1927,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		child = read_node_slot(root, mid, 0);
 		if (!child) {
 			ret = -EROFS;
-			btrfs_std_error(root->fs_info, ret);
+			btrfs_std_error(root->fs_info, ret, NULL);
 			goto enospc;
 		}
 
@@ -2030,7 +2030,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		 */
 		if (!left) {
 			ret = -EROFS;
-			btrfs_std_error(root->fs_info, ret);
+			btrfs_std_error(root->fs_info, ret, NULL);
 			goto enospc;
 		}
 		wret = balance_node_right(trans, root, mid, left);
@@ -4940,8 +4940,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 {
 	struct extent_buffer *leaf;
 	struct btrfs_item *item;
-	int last_off;
-	int dsize = 0;
+	u32 last_off;
+	u32 dsize = 0;
 	int ret = 0;
 	int wret;
 	int i;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 938efe33be80..a2e73f6053a8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -823,8 +823,18 @@ struct btrfs_disk_balance_args {
 	 */
 	__le64 profiles;
 
-	/* usage filter */
-	__le64 usage;
+	/*
+	 * usage filter
+	 * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
+	 * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
+	 */
+	union {
+		__le64 usage;
+		struct {
+			__le32 usage_min;
+			__le32 usage_max;
+		};
+	};
 
 	/* devid filter */
 	__le64 devid;
@@ -846,10 +856,27 @@ struct btrfs_disk_balance_args {
 	/* BTRFS_BALANCE_ARGS_* */
 	__le64 flags;
 
-	/* BTRFS_BALANCE_ARGS_LIMIT value */
-	__le64 limit;
+	/*
+	 * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
+	 * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extend version can use minimum
+	 * and maximum
+	 */
+	union {
+		__le64 limit;
+		struct {
+			__le32 limit_min;
+			__le32 limit_max;
+		};
+	};
 
-	__le64 unused[7];
+	/*
+	 * Process chunks that cross stripes_min..stripes_max devices,
+	 * BTRFS_BALANCE_ARGS_STRIPES_RANGE
+	 */
+	__le32 stripes_min;
+	__le32 stripes_max;
+
+	__le64 unused[6];
 } __attribute__ ((__packed__));
 
 /*
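
Note on the two hunks above: each union overlays two __le32 range bounds on the old __le64 field, and the two new stripes bounds consume exactly the one __le64 dropped from unused[], so sizeof(struct btrfs_disk_balance_args) does not change and the on-disk item layout stays compatible. A compile-time sketch of that size argument, using a stand-in struct rather than the real one:

#include <stdint.h>

typedef uint32_t __le32;	/* stand-ins for kernel types */
typedef uint64_t __le64;

/* Mirrors only the changed tail of the balance args (illustration). */
struct balance_args_tail {
	union {
		__le64 limit;
		struct {
			__le32 limit_min;
			__le32 limit_max;
		};
	};
	__le32 stripes_min;
	__le32 stripes_max;
	__le64 unused[6];
} __attribute__((__packed__));

/* union (8) + stripes pair (8) + unused[6] (48) == the old
 * __le64 limit (8) + unused[7] (56): the format size is stable. */
_Static_assert(sizeof(struct balance_args_tail) == 8 + 8 + 6 * 8,
	       "disk format size must not change");
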
@@ -1154,6 +1181,10 @@ struct btrfs_space_info {
 				   delalloc/allocations */
 	u64 bytes_readonly;	/* total bytes that are read only */
 
+	u64 max_extent_size;	/* This will hold the maximum extent size of
+				   the space info if we had an ENOSPC in the
+				   allocator. */
+
 	unsigned int full:1;	/* indicates that we cannot allocate any more
 				   chunks for this space */
 	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */
@@ -1228,6 +1259,9 @@ struct btrfs_free_cluster {
 	/* first extent starting offset */
 	u64 window_start;
 
+	/* We did a full search and couldn't create a cluster */
+	bool fragmented;
+
 	struct btrfs_block_group_cache *block_group;
 	/*
 	 * when a cluster is allocated from a block group, we put the
@@ -1943,6 +1977,9 @@ struct btrfs_root {
 	int send_in_progress;
 	struct btrfs_subvolume_writers *subv_writers;
 	atomic_t will_be_snapshoted;
+
+	/* For qgroup metadata space reserve */
+	atomic_t qgroup_meta_rsv;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -2145,6 +2182,8 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	(1 << 22)
 #define BTRFS_MOUNT_RESCAN_UUID_TREE		(1 << 23)
+#define BTRFS_MOUNT_FRAGMENT_DATA		(1 << 24)
+#define BTRFS_MOUNT_FRAGMENT_METADATA		(1 << 25)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(8192)
@@ -2169,6 +2208,18 @@ struct btrfs_ioctl_defrag_range_args {
 	btrfs_clear_opt(root->fs_info->mount_opt, opt);			\
 }
 
+#ifdef CONFIG_BTRFS_DEBUG
+static inline int
+btrfs_should_fragment_free_space(struct btrfs_root *root,
+				 struct btrfs_block_group_cache *block_group)
+{
+	return (btrfs_test_opt(root, FRAGMENT_METADATA) &&
+		block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
+	       (btrfs_test_opt(root, FRAGMENT_DATA) &&
+		block_group->flags & BTRFS_BLOCK_GROUP_DATA);
+}
+#endif
+
 /*
  * Requests for changes that need to be done during transaction commit.
  *
@@ -3379,7 +3430,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     u64 root_objectid, u64 owner,
-				     u64 offset, struct btrfs_key *ins);
+				     u64 offset, u64 ram_bytes,
+				     struct btrfs_key *ins);
 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   u64 root_objectid, u64 owner, u64 offset,
@@ -3398,7 +3450,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
-		      u64 owner, u64 offset, int no_quota);
+		      u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len,
 			       int delalloc);
@@ -3411,7 +3463,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset, int no_quota);
+			 u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
@@ -3449,8 +3501,11 @@ enum btrfs_reserve_flush_enum {
 	BTRFS_RESERVE_FLUSH_ALL,
 };
 
-int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes);
-void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len);
+int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len);
+void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
+					    u64 len);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root);
 void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
@@ -3466,8 +3521,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 				      u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
-void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len);
+void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len);
 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
 					      unsigned short type);
@@ -4004,8 +4059,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 /* sysfs.c */
 int btrfs_init_sysfs(void);
 void btrfs_exit_sysfs(void);
-int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info);
-void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info);
+int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info);
+void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info);
 
 /* xattr.c */
 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -4039,14 +4094,102 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 #define btrfs_info(fs_info, fmt, args...) \
 	btrfs_printk(fs_info, KERN_INFO fmt, ##args)
 
+/*
+ * Wrappers that use printk_in_rcu
+ */
+#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk_in_rcu
+ */
+#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
+
+/*
+ * Wrappers that use a ratelimited printk
+ */
+#define btrfs_emerg_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
 #ifdef DEBUG
 #define btrfs_debug(fs_info, fmt, args...) \
 	btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+	btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+	btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
 #else
 #define btrfs_debug(fs_info, fmt, args...) \
 	no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
+	no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
+	no_printk(KERN_DEBUG fmt, ##args)
+#define btrfs_debug_rl(fs_info, fmt, args...) \
+	no_printk(KERN_DEBUG fmt, ##args)
 #endif
 
+#define btrfs_printk_in_rcu(fs_info, fmt, args...)	\
+do {							\
+	rcu_read_lock();				\
+	btrfs_printk(fs_info, fmt, ##args);		\
+	rcu_read_unlock();				\
+} while (0)
+
+#define btrfs_printk_ratelimited(fs_info, fmt, args...)	\
+do {							\
+	static DEFINE_RATELIMIT_STATE(_rs,		\
+		DEFAULT_RATELIMIT_INTERVAL,		\
+		DEFAULT_RATELIMIT_BURST);		\
+	if (__ratelimit(&_rs))				\
+		btrfs_printk(fs_info, fmt, ##args);	\
+} while (0)
+
+#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...)	\
+do {							\
+	rcu_read_lock();				\
+	btrfs_printk_ratelimited(fs_info, fmt, ##args);	\
+	rcu_read_unlock();				\
+} while (0)
+
 #ifdef CONFIG_BTRFS_ASSERT
 
 __cold
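
Note on the hunk above: the _in_rcu wrappers hold rcu_read_lock() across the whole btrfs_printk() call so RCU-protected strings (typically device names fetched with rcu_str_deref()) stay valid for the duration of the print, and the _rl variants add a per-callsite ratelimit through the static DEFINE_RATELIMIT_STATE. A usage fragment for illustration only, in kernel context:

/* device->name is RCU-protected, so the _in_rcu variant keeps the
 * read-side critical section open across the whole format call. */
btrfs_info_in_rcu(fs_info, "device %s ready",
		  rcu_str_deref(device->name));

/* The _rl variants rate-limit per call site via the static _rs. */
btrfs_err_rl(fs_info, "csum failed on %llu", (unsigned long long)start);
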
@@ -4127,14 +4270,7 @@ do { \
 			  __LINE__, (errno));			\
 } while (0)
 
-#define btrfs_std_error(fs_info, errno)				\
-do {								\
-	if ((errno))						\
-		__btrfs_std_error((fs_info), __func__,		\
-				  __LINE__, (errno), NULL);	\
-} while (0)
-
-#define btrfs_error(fs_info, errno, fmt, args...)		\
+#define btrfs_std_error(fs_info, errno, fmt, args...)		\
 do {								\
 	__btrfs_std_error((fs_info), __func__, __LINE__,	\
 			  (errno), fmt, ##args);		\
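
With btrfs_error() folded into btrfs_std_error(), every caller now supplies a format string or NULL, as the other hunks in this series show. Two example calls taken from this very diff:

/* No extra message: pass NULL, as the ctree.c hunks above do. */
btrfs_std_error(root->fs_info, -EROFS, NULL);

/* With a message, as in the log-replay failure path in disk-io.c. */
btrfs_std_error(tree_root->fs_info, ret, "Failed to recover log tree");
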
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index a2ae42720a6a..e0941fbb913c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -463,6 +463,10 @@ static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
 static void finish_one_item(struct btrfs_delayed_root *delayed_root)
 {
 	int seq = atomic_inc_return(&delayed_root->items_seq);
+
+	/*
+	 * atomic_dec_return implies a barrier for waitqueue_active
+	 */
 	if ((atomic_dec_return(&delayed_root->items) <
 	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
 	    waitqueue_active(&delayed_root->wait))
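
The comment added here records why the lockless waitqueue_active() check is safe: the full barrier implied by atomic_dec_return() orders the counter update before the wakeup test, pairing with the barrier inside prepare_to_wait() on the sleeping side. A sketch of the two halves of that pattern (fragment, not the exact btrfs code):

/* Waker: the state change must be globally visible before testing
 * for sleepers, or a task just about to sleep can be missed.  Here
 * the barrier is the one implied by atomic_dec_return(). */
if (atomic_dec_return(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND &&
    waitqueue_active(&delayed_root->wait))
	wake_up(&delayed_root->wait);

/* Sleeper: prepare_to_wait() queues the task and supplies the
 * matching barrier before the condition is re-checked. */
prepare_to_wait(&delayed_root->wait, &wait, TASK_UNINTERRUPTIBLE);
if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_BACKGROUND)
	schedule();
finish_wait(&delayed_root->wait, &wait);
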
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ac3e81da6d4e..e06dd75ad13f 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -197,6 +197,119 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
 	trans->delayed_ref_updates--;
 }
 
+static bool merge_ref(struct btrfs_trans_handle *trans,
+		      struct btrfs_delayed_ref_root *delayed_refs,
+		      struct btrfs_delayed_ref_head *head,
+		      struct btrfs_delayed_ref_node *ref,
+		      u64 seq)
+{
+	struct btrfs_delayed_ref_node *next;
+	bool done = false;
+
+	next = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+				list);
+	while (!done && &next->list != &head->ref_list) {
+		int mod;
+		struct btrfs_delayed_ref_node *next2;
+
+		next2 = list_next_entry(next, list);
+
+		if (next == ref)
+			goto next;
+
+		if (seq && next->seq >= seq)
+			goto next;
+
+		if (next->type != ref->type)
+			goto next;
+
+		if ((ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+		     ref->type == BTRFS_SHARED_BLOCK_REF_KEY) &&
+		    comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref),
+				   btrfs_delayed_node_to_tree_ref(next),
+				   ref->type))
+			goto next;
+		if ((ref->type == BTRFS_EXTENT_DATA_REF_KEY ||
+		     ref->type == BTRFS_SHARED_DATA_REF_KEY) &&
+		    comp_data_refs(btrfs_delayed_node_to_data_ref(ref),
+				   btrfs_delayed_node_to_data_ref(next)))
+			goto next;
+
+		if (ref->action == next->action) {
+			mod = next->ref_mod;
+		} else {
+			if (ref->ref_mod < next->ref_mod) {
+				swap(ref, next);
+				done = true;
+			}
+			mod = -next->ref_mod;
+		}
+
+		drop_delayed_ref(trans, delayed_refs, head, next);
+		ref->ref_mod += mod;
+		if (ref->ref_mod == 0) {
+			drop_delayed_ref(trans, delayed_refs, head, ref);
+			done = true;
+		} else {
+			/*
+			 * Can't have multiples of the same ref on a tree block.
+			 */
+			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+		}
+next:
+		next = next2;
+	}
+
+	return done;
+}
+
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_delayed_ref_root *delayed_refs,
+			      struct btrfs_delayed_ref_head *head)
+{
+	struct btrfs_delayed_ref_node *ref;
+	u64 seq = 0;
+
+	assert_spin_locked(&head->lock);
+
+	if (list_empty(&head->ref_list))
+		return;
+
+	/* We don't have too many refs to merge for data. */
+	if (head->is_data)
+		return;
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (!list_empty(&fs_info->tree_mod_seq_list)) {
+		struct seq_list *elem;
+
+		elem = list_first_entry(&fs_info->tree_mod_seq_list,
+					struct seq_list, list);
+		seq = elem->seq;
+	}
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+
+	ref = list_first_entry(&head->ref_list, struct btrfs_delayed_ref_node,
+			       list);
+	while (&ref->list != &head->ref_list) {
+		if (seq && ref->seq >= seq)
+			goto next;
+
+		if (merge_ref(trans, delayed_refs, head, ref, seq)) {
+			if (list_empty(&head->ref_list))
+				break;
+			ref = list_first_entry(&head->ref_list,
+					       struct btrfs_delayed_ref_node,
+					       list);
+			continue;
+		}
+next:
+		ref = list_next_entry(ref, list);
+	}
+}
+
 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
 			    struct btrfs_delayed_ref_root *delayed_refs,
 			    u64 seq)
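
The arithmetic in merge_ref() is easiest to see with one concrete case: an ADD ref and a matching DROP ref for the same extent have opposite actions, so mod becomes -next->ref_mod, the merged ref_mod drops to 0, and both nodes are discarded before the extent tree is ever touched. A stand-alone walk-through of that path with hypothetical stand-in types, not kernel code:

#include <stdio.h>

struct node { int action; int ref_mod; };

int main(void)
{
	struct node ref  = { .action = 1 /* ADD */,  .ref_mod = 1 };
	struct node next = { .action = 2 /* DROP */, .ref_mod = 1 };
	int mod;

	if (ref.action == next.action)
		mod = next.ref_mod;	/* same action: mods add up */
	else
		mod = -next.ref_mod;	/* opposite action: they cancel */

	ref.ref_mod += mod;
	/* 1 + (-1) == 0: both delayed refs can simply be dropped. */
	printf("merged ref_mod = %d\n", ref.ref_mod);
	return 0;
}
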
@@ -292,8 +405,7 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
 	exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node,
 			   list);
 	/* No need to compare bytenr nor is_head */
-	if (exist->type != ref->type || exist->no_quota != ref->no_quota ||
-	    exist->seq != ref->seq)
+	if (exist->type != ref->type || exist->seq != ref->seq)
 		goto add_tail;
 
 	if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -423,7 +535,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		  struct btrfs_trans_handle *trans,
 		  struct btrfs_delayed_ref_node *ref,
 		  struct btrfs_qgroup_extent_record *qrecord,
-		  u64 bytenr, u64 num_bytes, int action, int is_data)
+		  u64 bytenr, u64 num_bytes, u64 ref_root, u64 reserved,
+		  int action, int is_data)
 {
 	struct btrfs_delayed_ref_head *existing;
 	struct btrfs_delayed_ref_head *head_ref = NULL;
@@ -432,6 +545,9 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	int count_mod = 1;
 	int must_insert_reserved = 0;
 
+	/* If reserved is provided, it must be a data extent. */
+	BUG_ON(!is_data && reserved);
+
 	/*
 	 * the head node stores the sum of all the mods, so dropping a ref
 	 * should drop the sum in the head node by one.
@@ -476,9 +592,16 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	INIT_LIST_HEAD(&head_ref->ref_list);
 	head_ref->processing = 0;
 	head_ref->total_ref_mod = count_mod;
+	head_ref->qgroup_reserved = 0;
+	head_ref->qgroup_ref_root = 0;
 
 	/* Record qgroup extent info if provided */
 	if (qrecord) {
+		if (ref_root && reserved) {
+			head_ref->qgroup_ref_root = ref_root;
+			head_ref->qgroup_reserved = reserved;
+		}
+
 		qrecord->bytenr = bytenr;
 		qrecord->num_bytes = num_bytes;
 		qrecord->old_roots = NULL;
@@ -497,6 +620,8 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	existing = htree_insert(&delayed_refs->href_root,
 				&head_ref->href_node);
 	if (existing) {
+		WARN_ON(ref_root && reserved && existing->qgroup_ref_root
+			&& existing->qgroup_reserved);
 		update_existing_head_ref(delayed_refs, &existing->node, ref);
 		/*
 		 * we've updated the existing ref, free the newly
@@ -524,7 +649,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		     struct btrfs_delayed_ref_head *head_ref,
 		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
 		     u64 num_bytes, u64 parent, u64 ref_root, int level,
-		     int action, int no_quota)
+		     int action)
 {
 	struct btrfs_delayed_tree_ref *full_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -546,7 +671,6 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	ref->action = action;
 	ref->is_head = 0;
 	ref->in_tree = 1;
-	ref->no_quota = no_quota;
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_tree_ref(ref);
@@ -579,7 +703,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		     struct btrfs_delayed_ref_head *head_ref,
 		     struct btrfs_delayed_ref_node *ref, u64 bytenr,
 		     u64 num_bytes, u64 parent, u64 ref_root, u64 owner,
-		     u64 offset, int action, int no_quota)
+		     u64 offset, int action)
 {
 	struct btrfs_delayed_data_ref *full_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -602,7 +726,6 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	ref->action = action;
 	ref->is_head = 0;
 	ref->in_tree = 1;
-	ref->no_quota = no_quota;
 	ref->seq = seq;
 
 	full_ref = btrfs_delayed_node_to_data_ref(ref);
@@ -633,17 +756,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op,
-			       int no_quota)
+			       struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_qgroup_extent_record *record = NULL;
 
-	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
-		no_quota = 0;
-
 	BUG_ON(extent_op && extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
 	if (!ref)
@@ -669,11 +788,10 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 	 * the spin lock
 	 */
 	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
-					bytenr, num_bytes, action, 0);
+					bytenr, num_bytes, 0, 0, action, 0);
 
 	add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr,
-			     num_bytes, parent, ref_root, level, action,
-			     no_quota);
+			     num_bytes, parent, ref_root, level, action);
 	spin_unlock(&delayed_refs->lock);
 
 	return 0;
@@ -693,18 +811,14 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
-			       u64 owner, u64 offset, int action,
-			       struct btrfs_delayed_extent_op *extent_op,
-			       int no_quota)
+			       u64 owner, u64 offset, u64 reserved, int action,
+			       struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_delayed_data_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_qgroup_extent_record *record = NULL;
 
-	if (!is_fstree(ref_root) || !fs_info->quota_enabled)
-		no_quota = 0;
-
 	BUG_ON(extent_op && !extent_op->is_data);
 	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
 	if (!ref)
@@ -736,16 +850,44 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	 * the spin lock
 	 */
 	head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record,
-					bytenr, num_bytes, action, 1);
+					bytenr, num_bytes, ref_root, reserved,
+					action, 1);
 
 	add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr,
 			     num_bytes, parent, ref_root, owner, offset,
-			     action, no_quota);
+			     action);
 	spin_unlock(&delayed_refs->lock);
 
 	return 0;
 }
 
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+				     struct btrfs_trans_handle *trans,
+				     u64 ref_root, u64 bytenr, u64 num_bytes)
+{
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct btrfs_delayed_ref_head *ref_head;
+	int ret = 0;
+
+	if (!fs_info->quota_enabled || !is_fstree(ref_root))
+		return 0;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	spin_lock(&delayed_refs->lock);
+	ref_head = find_ref_head(&delayed_refs->href_root, bytenr, 0);
+	if (!ref_head) {
+		ret = -ENOENT;
+		goto out;
+	}
+	WARN_ON(ref_head->qgroup_reserved || ref_head->qgroup_ref_root);
+	ref_head->qgroup_ref_root = ref_root;
+	ref_head->qgroup_reserved = num_bytes;
+out:
+	spin_unlock(&delayed_refs->lock);
+	return ret;
+}
+
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
@@ -764,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 	spin_lock(&delayed_refs->lock);
 
 	add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr,
-			     num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+			     num_bytes, 0, 0, BTRFS_UPDATE_DELAYED_HEAD,
 			     extent_op->is_data);
 
 	spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 13fb5e6090fe..00ed02cbf3e9 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -68,7 +68,6 @@ struct btrfs_delayed_ref_node {
 
 	unsigned int action:8;
 	unsigned int type:8;
-	unsigned int no_quota:1;
 	/* is this node still in the rbtree? */
 	unsigned int is_head:1;
 	unsigned int in_tree:1;
@@ -113,6 +112,17 @@ struct btrfs_delayed_ref_head {
 	int total_ref_mod;
 
 	/*
+	 * For qgroup reserved space freeing.
+	 *
+	 * ref_root and reserved will be recorded after
+	 * BTRFS_ADD_DELAYED_EXTENT is called.
+	 * And will be used to free reserved qgroup space at
+	 * run_delayed_refs() time.
+	 */
+	u64 qgroup_ref_root;
+	u64 qgroup_reserved;
+
+	/*
 	 * when a new extent is allocated, it is just reserved in memory
 	 * The actual extent isn't inserted into the extent allocation tree
 	 * until the delayed ref is processed. must_insert_reserved is
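
Pieced together from the delayed-ref.c hunks above, the lifecycle these two fields implement looks roughly like this (simplified sketch, not verbatim kernel code):

/* 1. A writer reserves qgroup space for a new data extent.
 * 2. When the delayed ref head is created (or found afterwards via
 *    btrfs_add_delayed_qgroup_reserve()), the owning root and the
 *    reserved byte count are stashed on it:
 *        ref_head->qgroup_ref_root = ref_root;
 *        ref_head->qgroup_reserved = num_bytes;
 * 3. When run_delayed_refs() finally processes the head, the stashed
 *    amount is released back to the qgroup, so the reservation cannot
 *    leak even though the ref runs long after the write.
 */
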
@@ -233,15 +243,16 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op,
-			       int no_quota);
+			       struct btrfs_delayed_extent_op *extent_op);
 int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
-			       u64 owner, u64 offset, int action,
-			       struct btrfs_delayed_extent_op *extent_op,
-			       int no_quota);
+			       u64 owner, u64 offset, u64 reserved, int action,
+			       struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_qgroup_reserve(struct btrfs_fs_info *fs_info,
+				     struct btrfs_trans_handle *trans,
+				     u64 ref_root, u64 bytenr, u64 num_bytes);
 int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
 				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index e54dd5905cee..1e668fb7dd4c 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -327,19 +327,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	    args->start.tgtdev_name[0] == '\0')
 		return -EINVAL;
 
-	/*
-	 * Here we commit the transaction to make sure commit_total_bytes
-	 * of all the devices are updated.
-	 */
-	trans = btrfs_attach_transaction(root);
-	if (!IS_ERR(trans)) {
-		ret = btrfs_commit_transaction(trans, root);
-		if (ret)
-			return ret;
-	} else if (PTR_ERR(trans) != -ENOENT) {
-		return PTR_ERR(trans);
-	}
-
 	/* the disk copy procedure reuses the scrub code */
 	mutex_lock(&fs_info->volume_mutex);
 	ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
@@ -356,6 +343,19 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	if (ret)
 		return ret;
 
+	/*
+	 * Here we commit the transaction to make sure commit_total_bytes
+	 * of all the devices are updated.
+	 */
+	trans = btrfs_attach_transaction(root);
+	if (!IS_ERR(trans)) {
+		ret = btrfs_commit_transaction(trans, root);
+		if (ret)
+			return ret;
+	} else if (PTR_ERR(trans) != -ENOENT) {
+		return PTR_ERR(trans);
+	}
+
 	btrfs_dev_replace_lock(dev_replace);
 	switch (dev_replace->replace_state) {
 	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -375,12 +375,8 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	WARN_ON(!tgt_device);
 	dev_replace->tgtdev = tgt_device;
 
-	ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device);
-	if (ret)
-		btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
-
-	printk_in_rcu(KERN_INFO
-		      "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
+	btrfs_info_in_rcu(root->fs_info,
+		      "dev_replace from %s (devid %llu) to %s started",
 		      src_device->missing ? "<missing disk>" :
 			rcu_str_deref(src_device->name),
 		      src_device->devid,
@@ -401,6 +397,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
 	args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 	btrfs_dev_replace_unlock(dev_replace);
 
+	ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
+	if (ret)
+		btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret);
+
 	btrfs_wait_ordered_roots(root->fs_info, -1);
 
 	/* force writing the updated state information to disk */
406 /* force writing the updated state information to disk */ 406 /* force writing the updated state information to disk */
@@ -454,8 +454,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
 static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
 {
 	clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
-	if (waitqueue_active(&fs_info->replace_wait))
-		wake_up(&fs_info->replace_wait);
+	wake_up(&fs_info->replace_wait);
 }
 
 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -523,8 +522,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 						src_device,
 						tgt_device);
 	} else {
-		printk_in_rcu(KERN_ERR
-			      "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
+		btrfs_err_in_rcu(root->fs_info,
+			      "btrfs_scrub_dev(%s, %llu, %s) failed %d",
 			      src_device->missing ? "<missing disk>" :
 			      rcu_str_deref(src_device->name),
 			      src_device->devid,
@@ -540,8 +539,8 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 		return scrub_ret;
 	}
 
-	printk_in_rcu(KERN_INFO
-		      "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
+	btrfs_info_in_rcu(root->fs_info,
+		      "dev_replace from %s (devid %llu) to %s finished",
 		      src_device->missing ? "<missing disk>" :
 		      rcu_str_deref(src_device->name),
 		      src_device->devid,
@@ -586,7 +585,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	mutex_unlock(&uuid_mutex);
 
 	/* replace the sysfs entry */
-	btrfs_kobj_rm_device(fs_info->fs_devices, src_device);
+	btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
 	btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
 
 	/* write back the superblocks */
@@ -809,8 +808,8 @@ static int btrfs_dev_replace_kthread(void *data)
 	progress = status_args->status.progress_1000;
 	kfree(status_args);
 	progress = div_u64(progress, 10);
-	printk_in_rcu(KERN_INFO
-		      "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
+	btrfs_info_in_rcu(fs_info,
+		      "continuing dev_replace from %s (devid %llu) to %s @%u%%",
 		      dev_replace->srcdev->missing ? "<missing disk>" :
 		      rcu_str_deref(dev_replace->srcdev->name),
 		      dev_replace->srcdev->devid,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1e60d00d4ea7..2d4667594681 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -319,9 +319,9 @@ static int csum_tree_block(struct btrfs_fs_info *fs_info,
 		memcpy(&found, result, csum_size);
 
 		read_extent_buffer(buf, &val, 0, csum_size);
-		printk_ratelimited(KERN_WARNING
-			"BTRFS: %s checksum verify failed on %llu wanted %X found %X "
-			"level %d\n",
+		btrfs_warn_rl(fs_info,
+			"%s checksum verify failed on %llu wanted %X found %X "
+			"level %d",
 			fs_info->sb->s_id, buf->start,
 			val, found, btrfs_header_level(buf));
 		if (result != (char *)&inline_result)
@@ -368,9 +368,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 		ret = 0;
 		goto out;
 	}
-	printk_ratelimited(KERN_ERR
-		"BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
-			eb->fs_info->sb->s_id, eb->start,
+	btrfs_err_rl(eb->fs_info,
+		"parent transid verify failed on %llu wanted %llu found %llu",
+			eb->start,
 			parent_transid, btrfs_header_generation(eb));
 	ret = 1;
 
@@ -629,15 +629,14 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != eb->start) {
-		printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
-			       "%llu %llu\n",
-			       eb->fs_info->sb->s_id, found_start, eb->start);
+		btrfs_err_rl(eb->fs_info, "bad tree block start %llu %llu",
+			     found_start, eb->start);
 		ret = -EIO;
 		goto err;
 	}
 	if (check_tree_block_fsid(root->fs_info, eb)) {
-		printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
-			       eb->fs_info->sb->s_id, eb->start);
+		btrfs_err_rl(eb->fs_info, "bad fsid on block %llu",
+			     eb->start);
 		ret = -EIO;
 		goto err;
 	}
@@ -802,6 +801,9 @@ static void run_one_async_done(struct btrfs_work *work)
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
+	/*
+	 * atomic_dec_return implies a barrier for waitqueue_active
+	 */
 	if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
 	    waitqueue_active(&fs_info->async_submit_wait))
 		wake_up(&fs_info->async_submit_wait);
@@ -1265,6 +1267,7 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
1265 atomic_set(&root->orphan_inodes, 0); 1267 atomic_set(&root->orphan_inodes, 0);
1266 atomic_set(&root->refs, 1); 1268 atomic_set(&root->refs, 1);
1267 atomic_set(&root->will_be_snapshoted, 0); 1269 atomic_set(&root->will_be_snapshoted, 0);
1270 atomic_set(&root->qgroup_meta_rsv, 0);
1268 root->log_transid = 0; 1271 root->log_transid = 0;
1269 root->log_transid_committed = -1; 1272 root->log_transid_committed = -1;
1270 root->last_log_commit = 0; 1273 root->last_log_commit = 0;
@@ -1759,6 +1762,7 @@ static int cleaner_kthread(void *arg)
1759 int again; 1762 int again;
1760 struct btrfs_trans_handle *trans; 1763 struct btrfs_trans_handle *trans;
1761 1764
1765 set_freezable();
1762 do { 1766 do {
1763 again = 0; 1767 again = 0;
1764 1768
@@ -2348,8 +2352,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2348 u64 bytenr = btrfs_super_log_root(disk_super); 2352 u64 bytenr = btrfs_super_log_root(disk_super);
2349 2353
2350 if (fs_devices->rw_devices == 0) { 2354 if (fs_devices->rw_devices == 0) {
2351 printk(KERN_WARNING "BTRFS: log replay required " 2355 btrfs_warn(fs_info, "log replay required on RO media");
2352 "on RO media\n");
2353 return -EIO; 2356 return -EIO;
2354 } 2357 }
2355 2358
@@ -2364,12 +2367,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2364 log_tree_root->node = read_tree_block(tree_root, bytenr, 2367 log_tree_root->node = read_tree_block(tree_root, bytenr,
2365 fs_info->generation + 1); 2368 fs_info->generation + 1);
2366 if (IS_ERR(log_tree_root->node)) { 2369 if (IS_ERR(log_tree_root->node)) {
2367 printk(KERN_ERR "BTRFS: failed to read log tree\n"); 2370 btrfs_warn(fs_info, "failed to read log tree");
2368 ret = PTR_ERR(log_tree_root->node); 2371 ret = PTR_ERR(log_tree_root->node);
2369 kfree(log_tree_root); 2372 kfree(log_tree_root);
2370 return ret; 2373 return ret;
2371 } else if (!extent_buffer_uptodate(log_tree_root->node)) { 2374 } else if (!extent_buffer_uptodate(log_tree_root->node)) {
2372 printk(KERN_ERR "BTRFS: failed to read log tree\n"); 2375 btrfs_err(fs_info, "failed to read log tree");
2373 free_extent_buffer(log_tree_root->node); 2376 free_extent_buffer(log_tree_root->node);
2374 kfree(log_tree_root); 2377 kfree(log_tree_root);
2375 return -EIO; 2378 return -EIO;
@@ -2377,7 +2380,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2377 /* returns with log_tree_root freed on success */ 2380 /* returns with log_tree_root freed on success */
2378 ret = btrfs_recover_log_trees(log_tree_root); 2381 ret = btrfs_recover_log_trees(log_tree_root);
2379 if (ret) { 2382 if (ret) {
2380 btrfs_error(tree_root->fs_info, ret, 2383 btrfs_std_error(tree_root->fs_info, ret,
2381 "Failed to recover log tree"); 2384 "Failed to recover log tree");
2382 free_extent_buffer(log_tree_root->node); 2385 free_extent_buffer(log_tree_root->node);
2383 kfree(log_tree_root); 2386 kfree(log_tree_root);
@@ -2653,8 +2656,8 @@ int open_ctree(struct super_block *sb,
2653 * Read super block and check the signature bytes only 2656 * Read super block and check the signature bytes only
2654 */ 2657 */
2655 bh = btrfs_read_dev_super(fs_devices->latest_bdev); 2658 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
2656 if (!bh) { 2659 if (IS_ERR(bh)) {
2657 err = -EINVAL; 2660 err = PTR_ERR(bh);
2658 goto fail_alloc; 2661 goto fail_alloc;
2659 } 2662 }
2660 2663
@@ -2937,7 +2940,7 @@ retry_root_backup:
2937 goto fail_fsdev_sysfs; 2940 goto fail_fsdev_sysfs;
2938 } 2941 }
2939 2942
2940 ret = btrfs_sysfs_add_one(fs_info); 2943 ret = btrfs_sysfs_add_mounted(fs_info);
2941 if (ret) { 2944 if (ret) {
2942 pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); 2945 pr_err("BTRFS: failed to init sysfs interface: %d\n", ret);
2943 goto fail_fsdev_sysfs; 2946 goto fail_fsdev_sysfs;
@@ -3117,7 +3120,7 @@ fail_cleaner:
3117 filemap_write_and_wait(fs_info->btree_inode->i_mapping); 3120 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3118 3121
3119fail_sysfs: 3122fail_sysfs:
3120 btrfs_sysfs_remove_one(fs_info); 3123 btrfs_sysfs_remove_mounted(fs_info);
3121 3124
3122fail_fsdev_sysfs: 3125fail_fsdev_sysfs:
3123 btrfs_sysfs_remove_fsid(fs_info->fs_devices); 3126 btrfs_sysfs_remove_fsid(fs_info->fs_devices);
@@ -3179,8 +3182,8 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
3179 struct btrfs_device *device = (struct btrfs_device *) 3182 struct btrfs_device *device = (struct btrfs_device *)
3180 bh->b_private; 3183 bh->b_private;
3181 3184
3182 printk_ratelimited_in_rcu(KERN_WARNING "BTRFS: lost page write due to " 3185 btrfs_warn_rl_in_rcu(device->dev_root->fs_info,
3183 "I/O error on %s\n", 3186 "lost page write due to IO error on %s",
3184 rcu_str_deref(device->name)); 3187 rcu_str_deref(device->name));
3185 /* note, we don't set_buffer_write_io_error because we have 3188 /* note, we don't set_buffer_write_io_error because we have
3186 * our own ways of dealing with the IO errors 3189 * our own ways of dealing with the IO errors
@@ -3192,6 +3195,37 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
3192 put_bh(bh); 3195 put_bh(bh);
3193} 3196}
3194 3197
3198int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
3199 struct buffer_head **bh_ret)
3200{
3201 struct buffer_head *bh;
3202 struct btrfs_super_block *super;
3203 u64 bytenr;
3204
3205 bytenr = btrfs_sb_offset(copy_num);
3206 if (bytenr + BTRFS_SUPER_INFO_SIZE >= i_size_read(bdev->bd_inode))
3207 return -EINVAL;
3208
3209 bh = __bread(bdev, bytenr / 4096, BTRFS_SUPER_INFO_SIZE);
3210 /*
3211 * If we fail to read from the underlying devices, as of now
3212 * the best option we have is to mark it EIO.
3213 */
3214 if (!bh)
3215 return -EIO;
3216
3217 super = (struct btrfs_super_block *)bh->b_data;
3218 if (btrfs_super_bytenr(super) != bytenr ||
3219 btrfs_super_magic(super) != BTRFS_MAGIC) {
3220 brelse(bh);
3221 return -EINVAL;
3222 }
3223
3224 *bh_ret = bh;
3225 return 0;
3226}
3227
3228
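The new helper concentrates the per-copy validation in one place: bounds-check the mirror offset, read the copy, then reject it unless both the recorded bytenr and the magic match. As a standalone illustration, a minimal userspace sketch under stated assumptions: the mirror-offset formula, the field offsets (48 for bytenr, 64 for magic) and the magic constant are modeled on the on-disk format, not taken from this patch, and main() is supplied by the follow-up sketch after btrfs_read_dev_super below.

#include <stdint.h>
#include <string.h>
#include <unistd.h>

#define SUPER_INFO_SIZE    4096
#define SUPER_INFO_OFFSET  (64 * 1024)
#define SUPER_MIRROR_SHIFT 12
#define SUPER_MAGIC 0x4D5F53665248425FULL	/* "_BHRfS_M", little-endian */

static uint64_t sb_offset(int mirror)
{
	uint64_t start = 16 * 1024;

	return mirror ? start << (SUPER_MIRROR_SHIFT * mirror)
		      : SUPER_INFO_OFFSET;
}

/* Returns 0 and fills buf on success, a negative errno-style value otherwise. */
static int read_one_super(int fd, int copy_num, uint64_t dev_size, char *buf)
{
	uint64_t bytenr = sb_offset(copy_num);
	uint64_t found_bytenr, magic;

	if (bytenr + SUPER_INFO_SIZE >= dev_size)
		return -22;	/* -EINVAL: this copy lies past device end */
	if (pread(fd, buf, SUPER_INFO_SIZE, bytenr) != SUPER_INFO_SIZE)
		return -5;	/* -EIO: the media read itself failed */

	memcpy(&found_bytenr, buf + 48, sizeof(found_bytenr));
	memcpy(&magic, buf + 64, sizeof(magic));
	if (found_bytenr != bytenr || magic != SUPER_MAGIC)
		return -22;	/* stale, foreign, or zeroed superblock */
	return 0;
}
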
3195struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) 3229struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
3196{ 3230{
3197 struct buffer_head *bh; 3231 struct buffer_head *bh;
@@ -3199,7 +3233,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
3199 struct btrfs_super_block *super; 3233 struct btrfs_super_block *super;
3200 int i; 3234 int i;
3201 u64 transid = 0; 3235 u64 transid = 0;
3202 u64 bytenr; 3236 int ret = -EINVAL;
3203 3237
3204 /* we would like to check all the supers, but that would make 3238 /* we would like to check all the supers, but that would make
3205 * a btrfs mount succeed after a mkfs from a different FS. 3239 * a btrfs mount succeed after a mkfs from a different FS.
@@ -3207,21 +3241,11 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
3207 * later supers, using BTRFS_SUPER_MIRROR_MAX instead 3241 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
3208 */ 3242 */
3209 for (i = 0; i < 1; i++) { 3243 for (i = 0; i < 1; i++) {
3210 bytenr = btrfs_sb_offset(i); 3244 ret = btrfs_read_dev_one_super(bdev, i, &bh);
3211 if (bytenr + BTRFS_SUPER_INFO_SIZE >= 3245 if (ret)
3212 i_size_read(bdev->bd_inode))
3213 break;
3214 bh = __bread(bdev, bytenr / 4096,
3215 BTRFS_SUPER_INFO_SIZE);
3216 if (!bh)
3217 continue; 3246 continue;
3218 3247
3219 super = (struct btrfs_super_block *)bh->b_data; 3248 super = (struct btrfs_super_block *)bh->b_data;
3220 if (btrfs_super_bytenr(super) != bytenr ||
3221 btrfs_super_magic(super) != BTRFS_MAGIC) {
3222 brelse(bh);
3223 continue;
3224 }
3225 3249
3226 if (!latest || btrfs_super_generation(super) > transid) { 3250 if (!latest || btrfs_super_generation(super) > transid) {
3227 brelse(latest); 3251 brelse(latest);
@@ -3231,6 +3255,10 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
3231 brelse(bh); 3255 brelse(bh);
3232 } 3256 }
3233 } 3257 }
3258
3259 if (!latest)
3260 return ERR_PTR(ret);
3261
3234 return latest; 3262 return latest;
3235} 3263}
3236 3264
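With the helper factored out, the rewritten caller reduces to a keep-the-newest loop: iterate the copies (the kernel deliberately checks only copy 0 here), keep whichever validates with the highest generation, and hand back the last error only when no copy was usable — the same shape as returning ERR_PTR(ret) when latest stays NULL. A sketch completing the one above; it reuses read_one_super() and the includes from the previous sketch, and the generation field offset of 72 is an assumption.

static int read_dev_super(int fd, uint64_t dev_size, char *latest)
{
	char buf[SUPER_INFO_SIZE];
	uint64_t best_gen = 0, gen;
	int i, found = 0, ret = -22;	/* -EINVAL if nothing validates */

	for (i = 0; i < 1; i++) {
		ret = read_one_super(fd, i, dev_size, buf);
		if (ret)
			continue;
		memcpy(&gen, buf + 72, sizeof(gen));	/* offset: assumption */
		if (!found || gen > best_gen) {
			best_gen = gen;
			memcpy(latest, buf, SUPER_INFO_SIZE);
			found = 1;
		}
	}
	return found ? 0 : ret;	/* propagate the last error only if none hit */
}
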
@@ -3299,8 +3327,9 @@ static int write_dev_supers(struct btrfs_device *device,
3299 bh = __getblk(device->bdev, bytenr / 4096, 3327 bh = __getblk(device->bdev, bytenr / 4096,
3300 BTRFS_SUPER_INFO_SIZE); 3328 BTRFS_SUPER_INFO_SIZE);
3301 if (!bh) { 3329 if (!bh) {
3302 printk(KERN_ERR "BTRFS: couldn't get super " 3330 btrfs_err(device->dev_root->fs_info,
3303 "buffer head for bytenr %Lu\n", bytenr); 3331 "couldn't get super buffer head for bytenr %llu",
3332 bytenr);
3304 errors++; 3333 errors++;
3305 continue; 3334 continue;
3306 } 3335 }
@@ -3449,22 +3478,31 @@ static int barrier_all_devices(struct btrfs_fs_info *info)
3449 3478
3450int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) 3479int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
3451{ 3480{
3452 if ((flags & (BTRFS_BLOCK_GROUP_DUP | 3481 int raid_type;
3453 BTRFS_BLOCK_GROUP_RAID0 | 3482 int min_tolerated = INT_MAX;
3454 BTRFS_AVAIL_ALLOC_BIT_SINGLE)) ||
3455 ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0))
3456 return 0;
3457 3483
3458 if (flags & (BTRFS_BLOCK_GROUP_RAID1 | 3484 if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
3459 BTRFS_BLOCK_GROUP_RAID5 | 3485 (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
3460 BTRFS_BLOCK_GROUP_RAID10)) 3486 min_tolerated = min(min_tolerated,
3461 return 1; 3487 btrfs_raid_array[BTRFS_RAID_SINGLE].
3488 tolerated_failures);
3462 3489
3463 if (flags & BTRFS_BLOCK_GROUP_RAID6) 3490 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3464 return 2; 3491 if (raid_type == BTRFS_RAID_SINGLE)
3492 continue;
3493 if (!(flags & btrfs_raid_group[raid_type]))
3494 continue;
3495 min_tolerated = min(min_tolerated,
3496 btrfs_raid_array[raid_type].
3497 tolerated_failures);
3498 }
3465 3499
3466 pr_warn("BTRFS: unknown raid type: %llu\n", flags); 3500 if (min_tolerated == INT_MAX) {
3467 return 0; 3501 pr_warn("BTRFS: unknown raid flag: %llu\n", flags);
3502 min_tolerated = 0;
3503 }
3504
3505 return min_tolerated;
3468} 3506}
3469 3507
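The rewrite replaces the per-profile if-chain with a table walk: take the minimum tolerated-failure count over every profile present in the flags, and fall back to 0 for unknown bits. A self-contained model of that walk — the profile bits and tolerated values here are assumptions patterned on btrfs_raid_array, not kernel definitions:

#include <limits.h>
#include <stdio.h>

enum { RAID10, RAID1, DUP, RAID0, SINGLE, RAID5, RAID6, NR_RAID_TYPES };

static const struct { unsigned long long bit; int tolerated; } raid_table[] = {
	[RAID10] = { 1ULL << 0, 1 },
	[RAID1]  = { 1ULL << 1, 1 },
	[DUP]    = { 1ULL << 2, 0 },
	[RAID0]  = { 1ULL << 3, 0 },
	[SINGLE] = { 1ULL << 4, 0 },
	[RAID5]  = { 1ULL << 5, 1 },
	[RAID6]  = { 1ULL << 6, 2 },
};

static int num_tolerated_failures(unsigned long long flags)
{
	int i, min_tolerated = INT_MAX;

	/* The weakest profile present bounds the whole filesystem. */
	for (i = 0; i < NR_RAID_TYPES; i++)
		if (flags & raid_table[i].bit)
			min_tolerated = min_tolerated < raid_table[i].tolerated ?
					min_tolerated : raid_table[i].tolerated;

	if (min_tolerated == INT_MAX)	/* unknown flag: be conservative */
		min_tolerated = 0;
	return min_tolerated;
}

int main(void)
{
	/* RAID6 data plus RAID1 metadata: the weaker profile wins. */
	printf("%d\n", num_tolerated_failures(raid_table[RAID6].bit |
					      raid_table[RAID1].bit)); /* 1 */
	return 0;
}
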
3470int btrfs_calc_num_tolerated_disk_barrier_failures( 3508int btrfs_calc_num_tolerated_disk_barrier_failures(
@@ -3548,7 +3586,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
3548 if (ret) { 3586 if (ret) {
3549 mutex_unlock( 3587 mutex_unlock(
3550 &root->fs_info->fs_devices->device_list_mutex); 3588 &root->fs_info->fs_devices->device_list_mutex);
3551 btrfs_error(root->fs_info, ret, 3589 btrfs_std_error(root->fs_info, ret,
3552 "errors while submitting device barriers."); 3590 "errors while submitting device barriers.");
3553 return ret; 3591 return ret;
3554 } 3592 }
@@ -3588,7 +3626,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
3588 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3626 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3589 3627
3590 /* FUA is masked off if unsupported and can't be the reason */ 3628 /* FUA is masked off if unsupported and can't be the reason */
3591 btrfs_error(root->fs_info, -EIO, 3629 btrfs_std_error(root->fs_info, -EIO,
3592 "%d errors while writing supers", total_errors); 3630 "%d errors while writing supers", total_errors);
3593 return -EIO; 3631 return -EIO;
3594 } 3632 }
@@ -3606,7 +3644,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
3606 } 3644 }
3607 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3645 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3608 if (total_errors > max_errors) { 3646 if (total_errors > max_errors) {
3609 btrfs_error(root->fs_info, -EIO, 3647 btrfs_std_error(root->fs_info, -EIO,
3610 "%d errors while writing supers", total_errors); 3648 "%d errors while writing supers", total_errors);
3611 return -EIO; 3649 return -EIO;
3612 } 3650 }
@@ -3792,7 +3830,7 @@ void close_ctree(struct btrfs_root *root)
3792 percpu_counter_sum(&fs_info->delalloc_bytes)); 3830 percpu_counter_sum(&fs_info->delalloc_bytes));
3793 } 3831 }
3794 3832
3795 btrfs_sysfs_remove_one(fs_info); 3833 btrfs_sysfs_remove_mounted(fs_info);
3796 btrfs_sysfs_remove_fsid(fs_info->fs_devices); 3834 btrfs_sysfs_remove_fsid(fs_info->fs_devices);
3797 3835
3798 btrfs_free_fs_roots(fs_info); 3836 btrfs_free_fs_roots(fs_info);
@@ -4290,25 +4328,6 @@ again:
4290 return 0; 4328 return 0;
4291} 4329}
4292 4330
4293static void btrfs_free_pending_ordered(struct btrfs_transaction *cur_trans,
4294 struct btrfs_fs_info *fs_info)
4295{
4296 struct btrfs_ordered_extent *ordered;
4297
4298 spin_lock(&fs_info->trans_lock);
4299 while (!list_empty(&cur_trans->pending_ordered)) {
4300 ordered = list_first_entry(&cur_trans->pending_ordered,
4301 struct btrfs_ordered_extent,
4302 trans_list);
4303 list_del_init(&ordered->trans_list);
4304 spin_unlock(&fs_info->trans_lock);
4305
4306 btrfs_put_ordered_extent(ordered);
4307 spin_lock(&fs_info->trans_lock);
4308 }
4309 spin_unlock(&fs_info->trans_lock);
4310}
4311
4312void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, 4331void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4313 struct btrfs_root *root) 4332 struct btrfs_root *root)
4314{ 4333{
@@ -4320,7 +4339,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
4320 cur_trans->state = TRANS_STATE_UNBLOCKED; 4339 cur_trans->state = TRANS_STATE_UNBLOCKED;
4321 wake_up(&root->fs_info->transaction_wait); 4340 wake_up(&root->fs_info->transaction_wait);
4322 4341
4323 btrfs_free_pending_ordered(cur_trans, root->fs_info);
4324 btrfs_destroy_delayed_inodes(root); 4342 btrfs_destroy_delayed_inodes(root);
4325 btrfs_assert_delayed_root_empty(root); 4343 btrfs_assert_delayed_root_empty(root);
4326 4344
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bdfb479ea859..adeb31830b9c 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -60,6 +60,8 @@ void close_ctree(struct btrfs_root *root);
60int write_ctree_super(struct btrfs_trans_handle *trans, 60int write_ctree_super(struct btrfs_trans_handle *trans,
61 struct btrfs_root *root, int max_mirrors); 61 struct btrfs_root *root, int max_mirrors);
62struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); 62struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
63int btrfs_read_dev_one_super(struct block_device *bdev, int copy_num,
64 struct buffer_head **bh_ret);
63int btrfs_commit_super(struct btrfs_root *root); 65int btrfs_commit_super(struct btrfs_root *root);
64struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info, 66struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
65 u64 bytenr); 67 u64 bytenr);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 601d7d45d164..99a8e57da8a1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -95,8 +95,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
95 struct btrfs_root *root, 95 struct btrfs_root *root,
96 u64 parent, u64 root_objectid, 96 u64 parent, u64 root_objectid,
97 u64 flags, struct btrfs_disk_key *key, 97 u64 flags, struct btrfs_disk_key *key,
98 int level, struct btrfs_key *ins, 98 int level, struct btrfs_key *ins);
99 int no_quota);
100static int do_chunk_alloc(struct btrfs_trans_handle *trans, 99static int do_chunk_alloc(struct btrfs_trans_handle *trans,
101 struct btrfs_root *extent_root, u64 flags, 100 struct btrfs_root *extent_root, u64 flags,
102 int force); 101 int force);
@@ -332,6 +331,27 @@ static void put_caching_control(struct btrfs_caching_control *ctl)
332 kfree(ctl); 331 kfree(ctl);
333} 332}
334 333
334#ifdef CONFIG_BTRFS_DEBUG
335static void fragment_free_space(struct btrfs_root *root,
336 struct btrfs_block_group_cache *block_group)
337{
338 u64 start = block_group->key.objectid;
339 u64 len = block_group->key.offset;
340 u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
341 root->nodesize : root->sectorsize;
342 u64 step = chunk << 1;
343
344 while (len > chunk) {
345 btrfs_remove_free_space(block_group, start, chunk);
346 start += step;
347 if (len < step)
348 len = 0;
349 else
350 len -= step;
351 }
352}
353#endif
354
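fragment_free_space() walks the block group in chunk-sized steps, removing every other chunk so roughly half the free space survives as a comb of holes (which is why the callers below account bytes_used >> 1 as consumed). A runnable model of just that stepping arithmetic; the chunk size and the printout are illustrative:

#include <stdint.h>
#include <stdio.h>

static void fragment(uint64_t start, uint64_t len, uint64_t chunk)
{
	uint64_t step = chunk << 1;	/* remove one chunk, keep the next */

	while (len > chunk) {
		printf("remove free space [%llu, %llu)\n",
		       (unsigned long long)start,
		       (unsigned long long)(start + chunk));
		start += step;
		len = len < step ? 0 : len - step;
	}
}

int main(void)
{
	fragment(0, 64 * 1024, 16 * 1024);	/* 16 KiB nodesize example */
	return 0;
}
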
335/* 355/*
336 * this is only called by cache_block_group, since we could have freed extents 356 * this is only called by cache_block_group, since we could have freed extents
337 * we need to check the pinned_extents for any extents that can't be used yet 357 * we need to check the pinned_extents for any extents that can't be used yet
@@ -388,6 +408,7 @@ static noinline void caching_thread(struct btrfs_work *work)
388 u64 last = 0; 408 u64 last = 0;
389 u32 nritems; 409 u32 nritems;
390 int ret = -ENOMEM; 410 int ret = -ENOMEM;
411 bool wakeup = true;
391 412
392 caching_ctl = container_of(work, struct btrfs_caching_control, work); 413 caching_ctl = container_of(work, struct btrfs_caching_control, work);
393 block_group = caching_ctl->block_group; 414 block_group = caching_ctl->block_group;
@@ -400,6 +421,15 @@ static noinline void caching_thread(struct btrfs_work *work)
400 421
401 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 422 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
402 423
424#ifdef CONFIG_BTRFS_DEBUG
425 /*
426 * If we're fragmenting we don't want to make anybody think we can
427 * allocate from this block group until we've had a chance to fragment
428 * the free space.
429 */
430 if (btrfs_should_fragment_free_space(extent_root, block_group))
431 wakeup = false;
432#endif
403 /* 433 /*
404 * We don't want to deadlock with somebody trying to allocate a new 434 * We don't want to deadlock with somebody trying to allocate a new
405 * extent for the extent root while also trying to search the extent 435 * extent for the extent root while also trying to search the extent
@@ -441,7 +471,8 @@ next:
441 471
442 if (need_resched() || 472 if (need_resched() ||
443 rwsem_is_contended(&fs_info->commit_root_sem)) { 473 rwsem_is_contended(&fs_info->commit_root_sem)) {
444 caching_ctl->progress = last; 474 if (wakeup)
475 caching_ctl->progress = last;
445 btrfs_release_path(path); 476 btrfs_release_path(path);
446 up_read(&fs_info->commit_root_sem); 477 up_read(&fs_info->commit_root_sem);
447 mutex_unlock(&caching_ctl->mutex); 478 mutex_unlock(&caching_ctl->mutex);
@@ -464,7 +495,8 @@ next:
464 key.offset = 0; 495 key.offset = 0;
465 key.type = BTRFS_EXTENT_ITEM_KEY; 496 key.type = BTRFS_EXTENT_ITEM_KEY;
466 497
467 caching_ctl->progress = last; 498 if (wakeup)
499 caching_ctl->progress = last;
468 btrfs_release_path(path); 500 btrfs_release_path(path);
469 goto next; 501 goto next;
470 } 502 }
@@ -491,7 +523,8 @@ next:
491 523
492 if (total_found > (1024 * 1024 * 2)) { 524 if (total_found > (1024 * 1024 * 2)) {
493 total_found = 0; 525 total_found = 0;
494 wake_up(&caching_ctl->wait); 526 if (wakeup)
527 wake_up(&caching_ctl->wait);
495 } 528 }
496 } 529 }
497 path->slots[0]++; 530 path->slots[0]++;
@@ -501,13 +534,27 @@ next:
501 total_found += add_new_free_space(block_group, fs_info, last, 534 total_found += add_new_free_space(block_group, fs_info, last,
502 block_group->key.objectid + 535 block_group->key.objectid +
503 block_group->key.offset); 536 block_group->key.offset);
504 caching_ctl->progress = (u64)-1;
505
506 spin_lock(&block_group->lock); 537 spin_lock(&block_group->lock);
507 block_group->caching_ctl = NULL; 538 block_group->caching_ctl = NULL;
508 block_group->cached = BTRFS_CACHE_FINISHED; 539 block_group->cached = BTRFS_CACHE_FINISHED;
509 spin_unlock(&block_group->lock); 540 spin_unlock(&block_group->lock);
510 541
542#ifdef CONFIG_BTRFS_DEBUG
543 if (btrfs_should_fragment_free_space(extent_root, block_group)) {
544 u64 bytes_used;
545
546 spin_lock(&block_group->space_info->lock);
547 spin_lock(&block_group->lock);
548 bytes_used = block_group->key.offset -
549 btrfs_block_group_used(&block_group->item);
550 block_group->space_info->bytes_used += bytes_used >> 1;
551 spin_unlock(&block_group->lock);
552 spin_unlock(&block_group->space_info->lock);
553 fragment_free_space(extent_root, block_group);
554 }
555#endif
556
557 caching_ctl->progress = (u64)-1;
511err: 558err:
512 btrfs_free_path(path); 559 btrfs_free_path(path);
513 up_read(&fs_info->commit_root_sem); 560 up_read(&fs_info->commit_root_sem);
@@ -607,6 +654,22 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
607 } 654 }
608 } 655 }
609 spin_unlock(&cache->lock); 656 spin_unlock(&cache->lock);
657#ifdef CONFIG_BTRFS_DEBUG
658 if (ret == 1 &&
659 btrfs_should_fragment_free_space(fs_info->extent_root,
660 cache)) {
661 u64 bytes_used;
662
663 spin_lock(&cache->space_info->lock);
664 spin_lock(&cache->lock);
665 bytes_used = cache->key.offset -
666 btrfs_block_group_used(&cache->item);
667 cache->space_info->bytes_used += bytes_used >> 1;
668 spin_unlock(&cache->lock);
669 spin_unlock(&cache->space_info->lock);
670 fragment_free_space(fs_info->extent_root, cache);
671 }
672#endif
610 mutex_unlock(&caching_ctl->mutex); 673 mutex_unlock(&caching_ctl->mutex);
611 674
612 wake_up(&caching_ctl->wait); 675 wake_up(&caching_ctl->wait);
@@ -2009,8 +2072,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
2009int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, 2072int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2010 struct btrfs_root *root, 2073 struct btrfs_root *root,
2011 u64 bytenr, u64 num_bytes, u64 parent, 2074 u64 bytenr, u64 num_bytes, u64 parent,
2012 u64 root_objectid, u64 owner, u64 offset, 2075 u64 root_objectid, u64 owner, u64 offset)
2013 int no_quota)
2014{ 2076{
2015 int ret; 2077 int ret;
2016 struct btrfs_fs_info *fs_info = root->fs_info; 2078 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -2022,12 +2084,12 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2022 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 2084 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
2023 num_bytes, 2085 num_bytes,
2024 parent, root_objectid, (int)owner, 2086 parent, root_objectid, (int)owner,
2025 BTRFS_ADD_DELAYED_REF, NULL, no_quota); 2087 BTRFS_ADD_DELAYED_REF, NULL);
2026 } else { 2088 } else {
2027 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 2089 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
2028 num_bytes, 2090 num_bytes, parent, root_objectid,
2029 parent, root_objectid, owner, offset, 2091 owner, offset, 0,
2030 BTRFS_ADD_DELAYED_REF, NULL, no_quota); 2092 BTRFS_ADD_DELAYED_REF, NULL);
2031 } 2093 }
2032 return ret; 2094 return ret;
2033} 2095}
@@ -2048,15 +2110,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
2048 u64 num_bytes = node->num_bytes; 2110 u64 num_bytes = node->num_bytes;
2049 u64 refs; 2111 u64 refs;
2050 int ret; 2112 int ret;
2051 int no_quota = node->no_quota;
2052 2113
2053 path = btrfs_alloc_path(); 2114 path = btrfs_alloc_path();
2054 if (!path) 2115 if (!path)
2055 return -ENOMEM; 2116 return -ENOMEM;
2056 2117
2057 if (!is_fstree(root_objectid) || !root->fs_info->quota_enabled)
2058 no_quota = 1;
2059
2060 path->reada = 1; 2118 path->reada = 1;
2061 path->leave_spinning = 1; 2119 path->leave_spinning = 1;
2062 /* this will setup the path even if it fails to insert the back ref */ 2120 /* this will setup the path even if it fails to insert the back ref */
@@ -2291,8 +2349,7 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2291 parent, ref_root, 2349 parent, ref_root,
2292 extent_op->flags_to_set, 2350 extent_op->flags_to_set,
2293 &extent_op->key, 2351 &extent_op->key,
2294 ref->level, &ins, 2352 ref->level, &ins);
2295 node->no_quota);
2296 } else if (node->action == BTRFS_ADD_DELAYED_REF) { 2353 } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2297 ret = __btrfs_inc_extent_ref(trans, root, node, 2354 ret = __btrfs_inc_extent_ref(trans, root, node,
2298 parent, ref_root, 2355 parent, ref_root,
@@ -2345,6 +2402,11 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2345 node->num_bytes); 2402 node->num_bytes);
2346 } 2403 }
2347 } 2404 }
2405
2406 /* Also free its reserved qgroup space */
2407 btrfs_qgroup_free_delayed_ref(root->fs_info,
2408 head->qgroup_ref_root,
2409 head->qgroup_reserved);
2348 return ret; 2410 return ret;
2349 } 2411 }
2350 2412
@@ -2433,7 +2495,21 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2433 } 2495 }
2434 } 2496 }
2435 2497
2498 /*
2499 * We need to try and merge add/drops of the same ref since we
2500 * can run into issues with relocate dropping the implicit ref
2501 * and then it being added back again before the drop can
2502 * finish. If we merged anything we need to re-loop so we can
2503 * get a good ref.
2504 * Or we can get node references of the same type that weren't
2505 * merged when created due to bumps in the tree mod seq, and
2506 * we need to merge them to prevent adding an inline extent
2507 * backref before dropping it (triggering a BUG_ON at
2508 * insert_inline_extent_backref()).
2509 */
2436 spin_lock(&locked_ref->lock); 2510 spin_lock(&locked_ref->lock);
2511 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2512 locked_ref);
2437 2513
2438 /* 2514 /*
2439 * locked_ref is the head node, so we have to go one 2515 * locked_ref is the head node, so we have to go one
@@ -3109,7 +3185,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3109 int level; 3185 int level;
3110 int ret = 0; 3186 int ret = 0;
3111 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, 3187 int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
3112 u64, u64, u64, u64, u64, u64, int); 3188 u64, u64, u64, u64, u64, u64);
3113 3189
3114 3190
3115 if (btrfs_test_is_dummy_root(root)) 3191 if (btrfs_test_is_dummy_root(root))
@@ -3150,15 +3226,14 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
3150 key.offset -= btrfs_file_extent_offset(buf, fi); 3226 key.offset -= btrfs_file_extent_offset(buf, fi);
3151 ret = process_func(trans, root, bytenr, num_bytes, 3227 ret = process_func(trans, root, bytenr, num_bytes,
3152 parent, ref_root, key.objectid, 3228 parent, ref_root, key.objectid,
3153 key.offset, 1); 3229 key.offset);
3154 if (ret) 3230 if (ret)
3155 goto fail; 3231 goto fail;
3156 } else { 3232 } else {
3157 bytenr = btrfs_node_blockptr(buf, i); 3233 bytenr = btrfs_node_blockptr(buf, i);
3158 num_bytes = root->nodesize; 3234 num_bytes = root->nodesize;
3159 ret = process_func(trans, root, bytenr, num_bytes, 3235 ret = process_func(trans, root, bytenr, num_bytes,
3160 parent, ref_root, level - 1, 0, 3236 parent, ref_root, level - 1, 0);
3161 1);
3162 if (ret) 3237 if (ret)
3163 goto fail; 3238 goto fail;
3164 } 3239 }
@@ -3339,6 +3414,15 @@ again:
3339 spin_unlock(&block_group->lock); 3414 spin_unlock(&block_group->lock);
3340 3415
3341 /* 3416 /*
3417 * We hit an ENOSPC when setting up the cache in this transaction, just
3418 * skip doing the setup, we've already cleared the cache so we're safe.
3419 */
3420 if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
3421 ret = -ENOSPC;
3422 goto out_put;
3423 }
3424
3425 /*
3342 * Try to preallocate enough space based on how big the block group is. 3426 * Try to preallocate enough space based on how big the block group is.
3343 * Keep in mind this has to include any pinned space which could end up 3427 * Keep in mind this has to include any pinned space which could end up
3344 * taking up quite a bit since it's not folded into the other space 3428 * taking up quite a bit since it's not folded into the other space
@@ -3351,16 +3435,26 @@ again:
3351 num_pages *= 16; 3435 num_pages *= 16;
3352 num_pages *= PAGE_CACHE_SIZE; 3436 num_pages *= PAGE_CACHE_SIZE;
3353 3437
3354 ret = btrfs_check_data_free_space(inode, num_pages, num_pages); 3438 ret = btrfs_check_data_free_space(inode, 0, num_pages);
3355 if (ret) 3439 if (ret)
3356 goto out_put; 3440 goto out_put;
3357 3441
3358 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages, 3442 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3359 num_pages, num_pages, 3443 num_pages, num_pages,
3360 &alloc_hint); 3444 &alloc_hint);
3445 /*
3446 * Our cache requires contiguous chunks so that we don't modify a bunch
3447 * of metadata or split extents when writing the cache out, which means
 3448 * we can hit ENOSPC if we are heavily fragmented in addition to just
 3449 * normal out of space conditions. So if we hit this, just skip setting
 3450 * up any other block groups for this transaction; maybe we'll unpin enough
3451 * space the next time around.
3452 */
3361 if (!ret) 3453 if (!ret)
3362 dcs = BTRFS_DC_SETUP; 3454 dcs = BTRFS_DC_SETUP;
3363 btrfs_free_reserved_data_space(inode, num_pages); 3455 else if (ret == -ENOSPC)
3456 set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);
3457 btrfs_free_reserved_data_space(inode, 0, num_pages);
3364 3458
3365out_put: 3459out_put:
3366 iput(inode); 3460 iput(inode);
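
The two hunks above add a transaction-wide latch: the first cache setup that hits ENOSPC sets BTRFS_TRANS_CACHE_ENOSPC, and every later block group in the same transaction skips the expensive preallocation attempt. A sketch of the pattern with stand-in names — only the test_bit/set_bit shape comes from the patch:

#include <stdbool.h>
#include <stdio.h>

struct transaction { bool cache_enospc; };

static int prealloc_cache(struct transaction *trans, int bg, bool will_fail)
{
	if (trans->cache_enospc)
		return -28;			/* -ENOSPC: skip setup entirely */

	if (will_fail) {			/* stand-in for the real prealloc */
		trans->cache_enospc = true;	/* latch for the rest of trans */
		return -28;
	}
	printf("block group %d: cache set up\n", bg);
	return 0;
}

int main(void)
{
	struct transaction trans = { false };

	prealloc_cache(&trans, 0, false);	/* succeeds */
	prealloc_cache(&trans, 1, true);	/* hits ENOSPC, sets the latch */
	prealloc_cache(&trans, 2, false);	/* skipped because of the latch */
	return 0;
}
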
@@ -3746,6 +3840,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3746 found->bytes_readonly = 0; 3840 found->bytes_readonly = 0;
3747 found->bytes_may_use = 0; 3841 found->bytes_may_use = 0;
3748 found->full = 0; 3842 found->full = 0;
3843 found->max_extent_size = 0;
3749 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 3844 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3750 found->chunk_alloc = 0; 3845 found->chunk_alloc = 0;
3751 found->flush = 0; 3846 found->flush = 0;
@@ -3822,7 +3917,8 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3822{ 3917{
3823 u64 num_devices = root->fs_info->fs_devices->rw_devices; 3918 u64 num_devices = root->fs_info->fs_devices->rw_devices;
3824 u64 target; 3919 u64 target;
3825 u64 tmp; 3920 u64 raid_type;
3921 u64 allowed = 0;
3826 3922
3827 /* 3923 /*
3828 * see if restripe for this chunk_type is in progress, if so 3924 * see if restripe for this chunk_type is in progress, if so
@@ -3840,31 +3936,26 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3840 spin_unlock(&root->fs_info->balance_lock); 3936 spin_unlock(&root->fs_info->balance_lock);
3841 3937
3842 /* First, mask out the RAID levels which aren't possible */ 3938 /* First, mask out the RAID levels which aren't possible */
3843 if (num_devices == 1) 3939 for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
3844 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | 3940 if (num_devices >= btrfs_raid_array[raid_type].devs_min)
3845 BTRFS_BLOCK_GROUP_RAID5); 3941 allowed |= btrfs_raid_group[raid_type];
3846 if (num_devices < 3) 3942 }
3847 flags &= ~BTRFS_BLOCK_GROUP_RAID6; 3943 allowed &= flags;
3848 if (num_devices < 4) 3944
3849 flags &= ~BTRFS_BLOCK_GROUP_RAID10; 3945 if (allowed & BTRFS_BLOCK_GROUP_RAID6)
3850 3946 allowed = BTRFS_BLOCK_GROUP_RAID6;
3851 tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | 3947 else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
3852 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | 3948 allowed = BTRFS_BLOCK_GROUP_RAID5;
3853 BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); 3949 else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
3854 flags &= ~tmp; 3950 allowed = BTRFS_BLOCK_GROUP_RAID10;
3855 3951 else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
3856 if (tmp & BTRFS_BLOCK_GROUP_RAID6) 3952 allowed = BTRFS_BLOCK_GROUP_RAID1;
3857 tmp = BTRFS_BLOCK_GROUP_RAID6; 3953 else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
3858 else if (tmp & BTRFS_BLOCK_GROUP_RAID5) 3954 allowed = BTRFS_BLOCK_GROUP_RAID0;
3859 tmp = BTRFS_BLOCK_GROUP_RAID5; 3955
3860 else if (tmp & BTRFS_BLOCK_GROUP_RAID10) 3956 flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
3861 tmp = BTRFS_BLOCK_GROUP_RAID10; 3957
3862 else if (tmp & BTRFS_BLOCK_GROUP_RAID1) 3958 return extended_to_chunk(flags | allowed);
3863 tmp = BTRFS_BLOCK_GROUP_RAID1;
3864 else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3865 tmp = BTRFS_BLOCK_GROUP_RAID0;
3866
3867 return extended_to_chunk(flags | tmp);
3868} 3959}
3869 3960
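btrfs_reduce_alloc_profile() now builds an "allowed" mask from per-profile minimum device counts, intersects it with the requested flags, and keeps the most redundant survivor. A compact model — the devs_min values and the preference order are assumptions patterned on btrfs_raid_array:

#include <stdio.h>

enum { P_RAID10, P_RAID1, P_RAID0, P_RAID5, P_RAID6, P_NR };

static const struct { unsigned bit; unsigned devs_min; } profiles[P_NR] = {
	[P_RAID10] = { 1u << 0, 4 },
	[P_RAID1]  = { 1u << 1, 2 },
	[P_RAID0]  = { 1u << 2, 2 },
	[P_RAID5]  = { 1u << 3, 2 },
	[P_RAID6]  = { 1u << 4, 3 },
};

/* Order of preference when several profiles remain possible. */
static const int preference[] = { P_RAID6, P_RAID5, P_RAID10, P_RAID1, P_RAID0 };

static unsigned reduce_profile(unsigned flags, unsigned num_devices)
{
	unsigned allowed = 0;
	int i;

	for (i = 0; i < P_NR; i++)
		if (num_devices >= profiles[i].devs_min)
			allowed |= profiles[i].bit;
	allowed &= flags;

	for (i = 0; i < (int)(sizeof(preference) / sizeof(preference[0])); i++)
		if (allowed & profiles[preference[i]].bit)
			return profiles[preference[i]].bit;
	return 0;	/* fall back to SINGLE (no profile bit) */
}

int main(void)
{
	/* RAID6 requested but only 2 devices: RAID6 masked out, RAID5 wins. */
	unsigned flags = profiles[P_RAID6].bit | profiles[P_RAID5].bit;

	printf("0x%x\n", reduce_profile(flags, 2));
	return 0;
}
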
3870static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags) 3961static u64 get_alloc_profile(struct btrfs_root *root, u64 orig_flags)
@@ -3903,11 +3994,7 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3903 return ret; 3994 return ret;
3904} 3995}
3905 3996
3906/* 3997int btrfs_alloc_data_chunk_ondemand(struct inode *inode, u64 bytes)
3907 * This will check the space that the inode allocates from to make sure we have
3908 * enough space for bytes.
3909 */
3910int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes)
3911{ 3998{
3912 struct btrfs_space_info *data_sinfo; 3999 struct btrfs_space_info *data_sinfo;
3913 struct btrfs_root *root = BTRFS_I(inode)->root; 4000 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4006,7 +4093,8 @@ commit_trans:
4006 if (IS_ERR(trans)) 4093 if (IS_ERR(trans))
4007 return PTR_ERR(trans); 4094 return PTR_ERR(trans);
4008 if (have_pinned_space >= 0 || 4095 if (have_pinned_space >= 0 ||
4009 trans->transaction->have_free_bgs || 4096 test_bit(BTRFS_TRANS_HAVE_FREE_BGS,
4097 &trans->transaction->flags) ||
4010 need_commit > 0) { 4098 need_commit > 0) {
4011 ret = btrfs_commit_transaction(trans, root); 4099 ret = btrfs_commit_transaction(trans, root);
4012 if (ret) 4100 if (ret)
@@ -4028,38 +4116,86 @@ commit_trans:
4028 data_sinfo->flags, bytes, 1); 4116 data_sinfo->flags, bytes, 1);
4029 return -ENOSPC; 4117 return -ENOSPC;
4030 } 4118 }
4031 ret = btrfs_qgroup_reserve(root, write_bytes);
4032 if (ret)
4033 goto out;
4034 data_sinfo->bytes_may_use += bytes; 4119 data_sinfo->bytes_may_use += bytes;
4035 trace_btrfs_space_reservation(root->fs_info, "space_info", 4120 trace_btrfs_space_reservation(root->fs_info, "space_info",
4036 data_sinfo->flags, bytes, 1); 4121 data_sinfo->flags, bytes, 1);
4037out:
4038 spin_unlock(&data_sinfo->lock); 4122 spin_unlock(&data_sinfo->lock);
4039 4123
4040 return ret; 4124 return ret;
4041} 4125}
4042 4126
4043/* 4127/*
 4044 * Called if we need to clear a data reservation for this inode. 4128 * New check_data_free_space() with support for precise data reservation
 4129 * Will replace the old btrfs_check_data_free_space(), but to keep the
 4130 * patch split manageable, add the new function first and swap it in later.
4131 */
4132int btrfs_check_data_free_space(struct inode *inode, u64 start, u64 len)
4133{
4134 struct btrfs_root *root = BTRFS_I(inode)->root;
4135 int ret;
4136
4137 /* align the range */
4138 len = round_up(start + len, root->sectorsize) -
4139 round_down(start, root->sectorsize);
4140 start = round_down(start, root->sectorsize);
4141
4142 ret = btrfs_alloc_data_chunk_ondemand(inode, len);
4143 if (ret < 0)
4144 return ret;
4145
4146 /*
 4147 * Use the new btrfs_qgroup_reserve_data to reserve precise data space
 4148 *
 4149 * TODO: Find a good method to avoid reserving data space for NOCOW
 4150 * ranges without hurting performance when quotas are disabled.
4151 */
4152 ret = btrfs_qgroup_reserve_data(inode, start, len);
4153 return ret;
4154}
4155
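The alignment step above widens [start, start + len) to sectorsize boundaries before reserving, so a sub-sector write still reserves whole sectors. A quick standalone check of that arithmetic, assuming 4096-byte sectors:

#include <stdint.h>
#include <stdio.h>

static uint64_t round_down_u64(uint64_t x, uint64_t a) { return x - (x % a); }
static uint64_t round_up_u64(uint64_t x, uint64_t a)
{
	return round_down_u64(x + a - 1, a);
}

int main(void)
{
	uint64_t start = 5000, len = 100, sectorsize = 4096;
	uint64_t alen = round_up_u64(start + len, sectorsize) -
			round_down_u64(start, sectorsize);

	start = round_down_u64(start, sectorsize);
	/* [5000, 5100) widens to [4096, 8192): 4096 bytes reserved. */
	printf("start=%llu len=%llu\n",
	       (unsigned long long)start, (unsigned long long)alen);
	return 0;
}
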
4156/*
4157 * Called if we need to clear a data reservation for this inode
 4158 * Normally in an error case.
 4159 *
 4160 * This one will *NOT* use the accurate qgroup reserved space API, just for
 4161 * cases where we can't sleep and are sure it won't affect qgroup reserved
 4162 * space, like clear_bit_hook().
4045 */ 4163 */
4046void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 4164void btrfs_free_reserved_data_space_noquota(struct inode *inode, u64 start,
4165 u64 len)
4047{ 4166{
4048 struct btrfs_root *root = BTRFS_I(inode)->root; 4167 struct btrfs_root *root = BTRFS_I(inode)->root;
4049 struct btrfs_space_info *data_sinfo; 4168 struct btrfs_space_info *data_sinfo;
4050 4169
4051 /* make sure bytes are sectorsize aligned */ 4170 /* Make sure the range is aligned to sectorsize */
4052 bytes = ALIGN(bytes, root->sectorsize); 4171 len = round_up(start + len, root->sectorsize) -
4172 round_down(start, root->sectorsize);
4173 start = round_down(start, root->sectorsize);
4053 4174
4054 data_sinfo = root->fs_info->data_sinfo; 4175 data_sinfo = root->fs_info->data_sinfo;
4055 spin_lock(&data_sinfo->lock); 4176 spin_lock(&data_sinfo->lock);
4056 WARN_ON(data_sinfo->bytes_may_use < bytes); 4177 if (WARN_ON(data_sinfo->bytes_may_use < len))
4057 data_sinfo->bytes_may_use -= bytes; 4178 data_sinfo->bytes_may_use = 0;
4179 else
4180 data_sinfo->bytes_may_use -= len;
4058 trace_btrfs_space_reservation(root->fs_info, "space_info", 4181 trace_btrfs_space_reservation(root->fs_info, "space_info",
4059 data_sinfo->flags, bytes, 0); 4182 data_sinfo->flags, len, 0);
4060 spin_unlock(&data_sinfo->lock); 4183 spin_unlock(&data_sinfo->lock);
4061} 4184}
4062 4185
4186/*
4187 * Called if we need to clear a data reservation for this inode
 4188 * Normally in an error case.
 4189 *
 4190 * This one will handle the per-inode data rsv map for the accurate reserved
 4191 * space framework.
4192 */
4193void btrfs_free_reserved_data_space(struct inode *inode, u64 start, u64 len)
4194{
4195 btrfs_free_reserved_data_space_noquota(inode, start, len);
4196 btrfs_qgroup_free_data(inode, start, len);
4197}
4198
4063static void force_metadata_allocation(struct btrfs_fs_info *info) 4199static void force_metadata_allocation(struct btrfs_fs_info *info)
4064{ 4200{
4065 struct list_head *head = &info->space_info; 4201 struct list_head *head = &info->space_info;
@@ -4891,13 +5027,9 @@ static struct btrfs_block_rsv *get_block_rsv(
4891{ 5027{
4892 struct btrfs_block_rsv *block_rsv = NULL; 5028 struct btrfs_block_rsv *block_rsv = NULL;
4893 5029
4894 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) 5030 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4895 block_rsv = trans->block_rsv; 5031 (root == root->fs_info->csum_root && trans->adding_csums) ||
4896 5032 (root == root->fs_info->uuid_root))
4897 if (root == root->fs_info->csum_root && trans->adding_csums)
4898 block_rsv = trans->block_rsv;
4899
4900 if (root == root->fs_info->uuid_root)
4901 block_rsv = trans->block_rsv; 5033 block_rsv = trans->block_rsv;
4902 5034
4903 if (!block_rsv) 5035 if (!block_rsv)
@@ -5340,7 +5472,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5340 if (root->fs_info->quota_enabled) { 5472 if (root->fs_info->quota_enabled) {
5341 /* One for parent inode, two for dir entries */ 5473 /* One for parent inode, two for dir entries */
5342 num_bytes = 3 * root->nodesize; 5474 num_bytes = 3 * root->nodesize;
5343 ret = btrfs_qgroup_reserve(root, num_bytes); 5475 ret = btrfs_qgroup_reserve_meta(root, num_bytes);
5344 if (ret) 5476 if (ret)
5345 return ret; 5477 return ret;
5346 } else { 5478 } else {
@@ -5358,10 +5490,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
5358 if (ret == -ENOSPC && use_global_rsv) 5490 if (ret == -ENOSPC && use_global_rsv)
5359 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes); 5491 ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
5360 5492
5361 if (ret) { 5493 if (ret && *qgroup_reserved)
5362 if (*qgroup_reserved) 5494 btrfs_qgroup_free_meta(root, *qgroup_reserved);
5363 btrfs_qgroup_free(root, *qgroup_reserved);
5364 }
5365 5495
5366 return ret; 5496 return ret;
5367} 5497}
@@ -5522,15 +5652,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5522 spin_unlock(&BTRFS_I(inode)->lock); 5652 spin_unlock(&BTRFS_I(inode)->lock);
5523 5653
5524 if (root->fs_info->quota_enabled) { 5654 if (root->fs_info->quota_enabled) {
5525 ret = btrfs_qgroup_reserve(root, nr_extents * root->nodesize); 5655 ret = btrfs_qgroup_reserve_meta(root,
5656 nr_extents * root->nodesize);
5526 if (ret) 5657 if (ret)
5527 goto out_fail; 5658 goto out_fail;
5528 } 5659 }
5529 5660
5530 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); 5661 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
5531 if (unlikely(ret)) { 5662 if (unlikely(ret)) {
5532 if (root->fs_info->quota_enabled) 5663 btrfs_qgroup_free_meta(root, nr_extents * root->nodesize);
5533 btrfs_qgroup_free(root, nr_extents * root->nodesize);
5534 goto out_fail; 5664 goto out_fail;
5535 } 5665 }
5536 5666
@@ -5653,41 +5783,48 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5653} 5783}
5654 5784
5655/** 5785/**
5656 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc 5786 * btrfs_delalloc_reserve_space - reserve data and metadata space for
5787 * delalloc
5657 * @inode: inode we're writing to 5788 * @inode: inode we're writing to
5658 * @num_bytes: the number of bytes we want to allocate 5789 * @start: start range we are writing to
 5790 * @len: length of the range we are writing to
5791 *
 5792 * TODO: This function will eventually replace the old btrfs_delalloc_reserve_space()
5659 * 5793 *
5660 * This will do the following things 5794 * This will do the following things
5661 * 5795 *
5662 * o reserve space in the data space info for num_bytes 5796 * o reserve space in data space info for num bytes
 5663 * o reserve space in the metadata space info based on number of outstanding 5797 * and reserve the corresponding precise qgroup space
5798 * (Done in check_data_free_space)
5799 *
5800 * o reserve space for metadata space, based on the number of outstanding
5664 * extents and how much csums will be needed 5801 * extents and how much csums will be needed
 5665 * o add to the inodes ->delalloc_bytes 5802 * also reserve metadata space in a per-root over-reserve manner.
 5803 * o add to the inode's ->delalloc_bytes
 5666 * o add it to the fs_info's delalloc inodes list. 5804 * o add it to the fs_info's delalloc inodes list.
 5805 * (All three of the above are done in delalloc_reserve_metadata)
5667 * 5806 *
5668 * This will return 0 for success and -ENOSPC if there is no space left. 5807 * Return 0 for success
5808 * Return <0 for error(-ENOSPC or -EQUOT)
5669 */ 5809 */
5670int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 5810int btrfs_delalloc_reserve_space(struct inode *inode, u64 start, u64 len)
5671{ 5811{
5672 int ret; 5812 int ret;
5673 5813
5674 ret = btrfs_check_data_free_space(inode, num_bytes, num_bytes); 5814 ret = btrfs_check_data_free_space(inode, start, len);
5675 if (ret) 5815 if (ret < 0)
5676 return ret;
5677
5678 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
5679 if (ret) {
5680 btrfs_free_reserved_data_space(inode, num_bytes);
5681 return ret; 5816 return ret;
5682 } 5817 ret = btrfs_delalloc_reserve_metadata(inode, len);
5683 5818 if (ret < 0)
5684 return 0; 5819 btrfs_free_reserved_data_space(inode, start, len);
5820 return ret;
5685} 5821}
5686 5822
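The rewritten reserve path keeps one invariant: data (plus qgroup) space is taken first, and released again if the metadata reservation fails, so a caller never holds a half reservation. Its skeleton, with stand-in functions in place of the real reservation calls:

#include <stdio.h>

static int reserve_data(unsigned long long start, unsigned long long len)
{
	printf("data+qgroup reserved [%llu, %llu)\n", start, start + len);
	return 0;
}

static void free_data(unsigned long long start, unsigned long long len)
{
	printf("data+qgroup released [%llu, %llu)\n", start, start + len);
}

static int reserve_metadata(unsigned long long len)
{
	(void)len;
	return -28;	/* pretend metadata reservation hit ENOSPC */
}

static int delalloc_reserve(unsigned long long start, unsigned long long len)
{
	int ret = reserve_data(start, len);

	if (ret < 0)
		return ret;
	ret = reserve_metadata(len);
	if (ret < 0)
		free_data(start, len);	/* unwind on partial failure */
	return ret;
}

int main(void)
{
	return delalloc_reserve(0, 4096) < 0;	/* exits 1: metadata failed */
}
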
5687/** 5823/**
5688 * btrfs_delalloc_release_space - release data and metadata space for delalloc 5824 * btrfs_delalloc_release_space - release data and metadata space for delalloc
5689 * @inode: inode we're releasing space for 5825 * @inode: inode we're releasing space for
5690 * @num_bytes: the number of bytes we want to free up 5826 * @start: start position of the space already reserved
 5827 * @len: the length of the space already reserved
5691 * 5828 *
5692 * This must be matched with a call to btrfs_delalloc_reserve_space. This is 5829 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
5693 * called in the case that we don't need the metadata AND data reservations 5830 * called in the case that we don't need the metadata AND data reservations
@@ -5696,11 +5833,12 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
5696 * This function will release the metadata space that was not used and will 5833 * This function will release the metadata space that was not used and will
5697 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes 5834 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5698 * list if there are no delalloc bytes left. 5835 * list if there are no delalloc bytes left.
5836 * Also it will handle the qgroup reserved space.
5699 */ 5837 */
5700void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 5838void btrfs_delalloc_release_space(struct inode *inode, u64 start, u64 len)
5701{ 5839{
5702 btrfs_delalloc_release_metadata(inode, num_bytes); 5840 btrfs_delalloc_release_metadata(inode, len);
5703 btrfs_free_reserved_data_space(inode, num_bytes); 5841 btrfs_free_reserved_data_space(inode, start, len);
5704} 5842}
5705 5843
5706static int update_block_group(struct btrfs_trans_handle *trans, 5844static int update_block_group(struct btrfs_trans_handle *trans,
@@ -6065,6 +6203,34 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
6065 update_global_block_rsv(fs_info); 6203 update_global_block_rsv(fs_info);
6066} 6204}
6067 6205
6206/*
6207 * Returns the free cluster for the given space info and sets empty_cluster to
6208 * what it should be based on the mount options.
6209 */
6210static struct btrfs_free_cluster *
6211fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
6212 u64 *empty_cluster)
6213{
6214 struct btrfs_free_cluster *ret = NULL;
6215 bool ssd = btrfs_test_opt(root, SSD);
6216
6217 *empty_cluster = 0;
6218 if (btrfs_mixed_space_info(space_info))
6219 return ret;
6220
6221 if (ssd)
6222 *empty_cluster = 2 * 1024 * 1024;
6223 if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
6224 ret = &root->fs_info->meta_alloc_cluster;
6225 if (!ssd)
6226 *empty_cluster = 64 * 1024;
6227 } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
6228 ret = &root->fs_info->data_alloc_cluster;
6229 }
6230
6231 return ret;
6232}
6233
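fetch_cluster_info() centralizes the previously scattered policy: mixed block groups never cluster, metadata always does (64 KiB window, or 2 MiB with -o ssd), and data clusters only on ssd. The same decision table as a standalone function; the returned names are just labels for which kernel cluster would be used:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum bg_kind { BG_MIXED, BG_METADATA, BG_DATA };

static const char *pick_cluster(enum bg_kind kind, bool ssd, uint64_t *empty)
{
	*empty = 0;
	if (kind == BG_MIXED)
		return NULL;		/* small fs: no clustering at all */
	if (ssd)
		*empty = 2 * 1024 * 1024;
	if (kind == BG_METADATA) {
		if (!ssd)
			*empty = 64 * 1024;
		return "meta_alloc_cluster";
	}
	if (kind == BG_DATA && ssd)
		return "data_alloc_cluster";
	return NULL;
}

int main(void)
{
	uint64_t empty;
	const char *c = pick_cluster(BG_METADATA, false, &empty);

	printf("%s, empty_cluster=%llu\n", c ? c : "none",
	       (unsigned long long)empty);	/* meta_alloc_cluster, 65536 */
	return 0;
}
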
6068static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, 6234static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6069 const bool return_free_space) 6235 const bool return_free_space)
6070{ 6236{
@@ -6072,7 +6238,10 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6072 struct btrfs_block_group_cache *cache = NULL; 6238 struct btrfs_block_group_cache *cache = NULL;
6073 struct btrfs_space_info *space_info; 6239 struct btrfs_space_info *space_info;
6074 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; 6240 struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
6241 struct btrfs_free_cluster *cluster = NULL;
6075 u64 len; 6242 u64 len;
6243 u64 total_unpinned = 0;
6244 u64 empty_cluster = 0;
6076 bool readonly; 6245 bool readonly;
6077 6246
6078 while (start <= end) { 6247 while (start <= end) {
@@ -6081,8 +6250,14 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6081 start >= cache->key.objectid + cache->key.offset) { 6250 start >= cache->key.objectid + cache->key.offset) {
6082 if (cache) 6251 if (cache)
6083 btrfs_put_block_group(cache); 6252 btrfs_put_block_group(cache);
6253 total_unpinned = 0;
6084 cache = btrfs_lookup_block_group(fs_info, start); 6254 cache = btrfs_lookup_block_group(fs_info, start);
6085 BUG_ON(!cache); /* Logic error */ 6255 BUG_ON(!cache); /* Logic error */
6256
6257 cluster = fetch_cluster_info(root,
6258 cache->space_info,
6259 &empty_cluster);
6260 empty_cluster <<= 1;
6086 } 6261 }
6087 6262
6088 len = cache->key.objectid + cache->key.offset - start; 6263 len = cache->key.objectid + cache->key.offset - start;
@@ -6095,12 +6270,27 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end,
6095 } 6270 }
6096 6271
6097 start += len; 6272 start += len;
6273 total_unpinned += len;
6098 space_info = cache->space_info; 6274 space_info = cache->space_info;
6099 6275
6276 /*
6277 * If this space cluster has been marked as fragmented and we've
6278 * unpinned enough in this block group to potentially allow a
 6279 * cluster to be created inside of it, go ahead and clear the
 6280 * fragmented flag.
6281 */
6282 if (cluster && cluster->fragmented &&
6283 total_unpinned > empty_cluster) {
6284 spin_lock(&cluster->lock);
6285 cluster->fragmented = 0;
6286 spin_unlock(&cluster->lock);
6287 }
6288
6100 spin_lock(&space_info->lock); 6289 spin_lock(&space_info->lock);
6101 spin_lock(&cache->lock); 6290 spin_lock(&cache->lock);
6102 cache->pinned -= len; 6291 cache->pinned -= len;
6103 space_info->bytes_pinned -= len; 6292 space_info->bytes_pinned -= len;
6293 space_info->max_extent_size = 0;
6104 percpu_counter_add(&space_info->total_bytes_pinned, -len); 6294 percpu_counter_add(&space_info->total_bytes_pinned, -len);
6105 if (cache->ro) { 6295 if (cache->ro) {
6106 space_info->bytes_readonly += len; 6296 space_info->bytes_readonly += len;
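
total_unpinned accumulates per block group, and once it exceeds the doubled cluster window (the empty_cluster <<= 1 above) the cluster's fragmented marker is cleared, signalling that clustering is worth retrying. A compressed model of that bookkeeping:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct cluster { bool fragmented; };

static void unpin_range(struct cluster *c, uint64_t *total_unpinned,
			uint64_t len, uint64_t empty_cluster)
{
	*total_unpinned += len;
	if (c->fragmented && *total_unpinned > empty_cluster) {
		c->fragmented = false;	/* clustering is worth retrying */
		printf("fragmented flag cleared after %llu bytes\n",
		       (unsigned long long)*total_unpinned);
	}
}

int main(void)
{
	struct cluster c = { true };
	uint64_t total = 0, window = 2 * (64 * 1024);	/* empty_cluster << 1 */

	unpin_range(&c, &total, 64 * 1024, window);	/* not yet */
	unpin_range(&c, &total, 128 * 1024, window);	/* crosses threshold */
	return 0;
}
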
@@ -6233,7 +6423,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6233 int extent_slot = 0; 6423 int extent_slot = 0;
6234 int found_extent = 0; 6424 int found_extent = 0;
6235 int num_to_del = 1; 6425 int num_to_del = 1;
6236 int no_quota = node->no_quota;
6237 u32 item_size; 6426 u32 item_size;
6238 u64 refs; 6427 u64 refs;
6239 u64 bytenr = node->bytenr; 6428 u64 bytenr = node->bytenr;
@@ -6242,9 +6431,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6242 bool skinny_metadata = btrfs_fs_incompat(root->fs_info, 6431 bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6243 SKINNY_METADATA); 6432 SKINNY_METADATA);
6244 6433
6245 if (!info->quota_enabled || !is_fstree(root_objectid))
6246 no_quota = 1;
6247
6248 path = btrfs_alloc_path(); 6434 path = btrfs_alloc_path();
6249 if (!path) 6435 if (!path)
6250 return -ENOMEM; 6436 return -ENOMEM;
@@ -6570,7 +6756,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6570 buf->start, buf->len, 6756 buf->start, buf->len,
6571 parent, root->root_key.objectid, 6757 parent, root->root_key.objectid,
6572 btrfs_header_level(buf), 6758 btrfs_header_level(buf),
6573 BTRFS_DROP_DELAYED_REF, NULL, 0); 6759 BTRFS_DROP_DELAYED_REF, NULL);
6574 BUG_ON(ret); /* -ENOMEM */ 6760 BUG_ON(ret); /* -ENOMEM */
6575 } 6761 }
6576 6762
@@ -6618,7 +6804,7 @@ out:
6618/* Can return -ENOMEM */ 6804/* Can return -ENOMEM */
6619int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, 6805int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6620 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid, 6806 u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
6621 u64 owner, u64 offset, int no_quota) 6807 u64 owner, u64 offset)
6622{ 6808{
6623 int ret; 6809 int ret;
6624 struct btrfs_fs_info *fs_info = root->fs_info; 6810 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -6641,13 +6827,13 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
6641 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr, 6827 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
6642 num_bytes, 6828 num_bytes,
6643 parent, root_objectid, (int)owner, 6829 parent, root_objectid, (int)owner,
6644 BTRFS_DROP_DELAYED_REF, NULL, no_quota); 6830 BTRFS_DROP_DELAYED_REF, NULL);
6645 } else { 6831 } else {
6646 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr, 6832 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
6647 num_bytes, 6833 num_bytes,
6648 parent, root_objectid, owner, 6834 parent, root_objectid, owner,
6649 offset, BTRFS_DROP_DELAYED_REF, 6835 offset, 0,
6650 NULL, no_quota); 6836 BTRFS_DROP_DELAYED_REF, NULL);
6651 } 6837 }
6652 return ret; 6838 return ret;
6653} 6839}
@@ -6833,7 +7019,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
6833 struct btrfs_block_group_cache *block_group = NULL; 7019 struct btrfs_block_group_cache *block_group = NULL;
6834 u64 search_start = 0; 7020 u64 search_start = 0;
6835 u64 max_extent_size = 0; 7021 u64 max_extent_size = 0;
6836 int empty_cluster = 2 * 1024 * 1024; 7022 u64 empty_cluster = 0;
6837 struct btrfs_space_info *space_info; 7023 struct btrfs_space_info *space_info;
6838 int loop = 0; 7024 int loop = 0;
6839 int index = __get_raid_index(flags); 7025 int index = __get_raid_index(flags);
@@ -6843,6 +7029,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
6843 bool failed_alloc = false; 7029 bool failed_alloc = false;
6844 bool use_cluster = true; 7030 bool use_cluster = true;
6845 bool have_caching_bg = false; 7031 bool have_caching_bg = false;
7032 bool orig_have_caching_bg = false;
7033 bool full_search = false;
6846 7034
6847 WARN_ON(num_bytes < root->sectorsize); 7035 WARN_ON(num_bytes < root->sectorsize);
6848 ins->type = BTRFS_EXTENT_ITEM_KEY; 7036 ins->type = BTRFS_EXTENT_ITEM_KEY;
@@ -6858,36 +7046,47 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
6858 } 7046 }
6859 7047
6860 /* 7048 /*
6861 * If the space info is for both data and metadata it means we have a 7049 * If our free space is heavily fragmented we may not be able to make
6862 * small filesystem and we can't use the clustering stuff. 7050 * big contiguous allocations, so instead of doing the expensive search
7051 * for free space, simply return ENOSPC with our max_extent_size so we
7052 * can go ahead and search for a more manageable chunk.
7053 *
 7054 * If our max_extent_size is large enough for our allocation, simply
7055 * disable clustering since we will likely not be able to find enough
7056 * space to create a cluster and induce latency trying.
6863 */ 7057 */
6864 if (btrfs_mixed_space_info(space_info)) 7058 if (unlikely(space_info->max_extent_size)) {
6865 use_cluster = false; 7059 spin_lock(&space_info->lock);
6866 7060 if (space_info->max_extent_size &&
6867 if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) { 7061 num_bytes > space_info->max_extent_size) {
6868 last_ptr = &root->fs_info->meta_alloc_cluster; 7062 ins->offset = space_info->max_extent_size;
6869 if (!btrfs_test_opt(root, SSD)) 7063 spin_unlock(&space_info->lock);
6870 empty_cluster = 64 * 1024; 7064 return -ENOSPC;
6871 } 7065 } else if (space_info->max_extent_size) {
6872 7066 use_cluster = false;
6873 if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster && 7067 }
6874 btrfs_test_opt(root, SSD)) { 7068 spin_unlock(&space_info->lock);
6875 last_ptr = &root->fs_info->data_alloc_cluster;
6876 } 7069 }
6877 7070
7071 last_ptr = fetch_cluster_info(orig_root, space_info, &empty_cluster);
6878 if (last_ptr) { 7072 if (last_ptr) {
6879 spin_lock(&last_ptr->lock); 7073 spin_lock(&last_ptr->lock);
6880 if (last_ptr->block_group) 7074 if (last_ptr->block_group)
6881 hint_byte = last_ptr->window_start; 7075 hint_byte = last_ptr->window_start;
7076 if (last_ptr->fragmented) {
7077 /*
7078 * We still set window_start so we can keep track of the
7079 * last place we found an allocation to try and save
7080 * some time.
7081 */
7082 hint_byte = last_ptr->window_start;
7083 use_cluster = false;
7084 }
6882 spin_unlock(&last_ptr->lock); 7085 spin_unlock(&last_ptr->lock);
6883 } 7086 }
6884 7087
6885 search_start = max(search_start, first_logical_byte(root, 0)); 7088 search_start = max(search_start, first_logical_byte(root, 0));
6886 search_start = max(search_start, hint_byte); 7089 search_start = max(search_start, hint_byte);
6887
6888 if (!last_ptr)
6889 empty_cluster = 0;
6890
6891 if (search_start == hint_byte) { 7090 if (search_start == hint_byte) {
6892 block_group = btrfs_lookup_block_group(root->fs_info, 7091 block_group = btrfs_lookup_block_group(root->fs_info,
6893 search_start); 7092 search_start);
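
The hunk above adds a fast path: if a previous search recorded a max_extent_size smaller than this request, fail with ENOSPC before touching any block group; if the cached value merely indicates fragmentation, keep searching but skip clustering. A sketch of that gate — locking elided, struct and names are stand-ins:

#include <stdint.h>

struct space_info_model { uint64_t max_extent_size; };

/* Returns -28 (-ENOSPC) without searching when the request can't fit. */
static int check_request(struct space_info_model *si, uint64_t num_bytes,
			 uint64_t *ins_offset, int *use_cluster)
{
	if (si->max_extent_size && num_bytes > si->max_extent_size) {
		*ins_offset = si->max_extent_size;	/* hint for the caller */
		return -28;
	}
	if (si->max_extent_size)
		*use_cluster = 0;	/* fragmented: clustering won't pay off */
	return 0;
}

int main(void)
{
	struct space_info_model si = { 1024 * 1024 };	/* largest seen: 1 MiB */
	uint64_t hint = 0;
	int use_cluster = 1;

	/* A 4 MiB request fails fast; exit code 0 means the gate fired. */
	return check_request(&si, 4 * 1024 * 1024, &hint, &use_cluster) ? 0 : 1;
}
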
@@ -6922,6 +7121,8 @@ static noinline int find_free_extent(struct btrfs_root *orig_root,
6922 } 7121 }
6923search: 7122search:
6924 have_caching_bg = false; 7123 have_caching_bg = false;
7124 if (index == 0 || index == __get_raid_index(flags))
7125 full_search = true;
6925 down_read(&space_info->groups_sem); 7126 down_read(&space_info->groups_sem);
6926 list_for_each_entry(block_group, &space_info->block_groups[index], 7127 list_for_each_entry(block_group, &space_info->block_groups[index],
6927 list) { 7128 list) {
@@ -6955,6 +7156,7 @@ search:
6955have_block_group: 7156have_block_group:
6956 cached = block_group_cache_done(block_group); 7157 cached = block_group_cache_done(block_group);
6957 if (unlikely(!cached)) { 7158 if (unlikely(!cached)) {
7159 have_caching_bg = true;
6958 ret = cache_block_group(block_group, 0); 7160 ret = cache_block_group(block_group, 0);
6959 BUG_ON(ret < 0); 7161 BUG_ON(ret < 0);
6960 ret = 0; 7162 ret = 0;
@@ -6969,7 +7171,7 @@ have_block_group:
6969 * Ok we want to try and use the cluster allocator, so 7171 * Ok we want to try and use the cluster allocator, so
6970 * lets look there 7172 * lets look there
6971 */ 7173 */
6972 if (last_ptr) { 7174 if (last_ptr && use_cluster) {
6973 struct btrfs_block_group_cache *used_block_group; 7175 struct btrfs_block_group_cache *used_block_group;
6974 unsigned long aligned_cluster; 7176 unsigned long aligned_cluster;
6975 /* 7177 /*
@@ -7095,6 +7297,16 @@ refill_cluster:
7095 } 7297 }
7096 7298
7097unclustered_alloc: 7299unclustered_alloc:
7300 /*
 7301 * We are doing an unclustered alloc, so set the fragmented flag so
 7302 * we don't bother trying to set up a cluster again until we get
7303 * more space.
7304 */
7305 if (unlikely(last_ptr)) {
7306 spin_lock(&last_ptr->lock);
7307 last_ptr->fragmented = 1;
7308 spin_unlock(&last_ptr->lock);
7309 }
7098 spin_lock(&block_group->free_space_ctl->tree_lock); 7310 spin_lock(&block_group->free_space_ctl->tree_lock);
7099 if (cached && 7311 if (cached &&
7100 block_group->free_space_ctl->free_space < 7312 block_group->free_space_ctl->free_space <
@@ -7127,8 +7339,6 @@ unclustered_alloc:
7127 failed_alloc = true; 7339 failed_alloc = true;
7128 goto have_block_group; 7340 goto have_block_group;
7129 } else if (!offset) { 7341 } else if (!offset) {
7130 if (!cached)
7131 have_caching_bg = true;
7132 goto loop; 7342 goto loop;
7133 } 7343 }
7134checks: 7344checks:
@@ -7169,6 +7379,10 @@ loop:
7169 } 7379 }
7170 up_read(&space_info->groups_sem); 7380 up_read(&space_info->groups_sem);
7171 7381
7382 if ((loop == LOOP_CACHING_NOWAIT) && have_caching_bg
7383 && !orig_have_caching_bg)
7384 orig_have_caching_bg = true;
7385
7172 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg) 7386 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
7173 goto search; 7387 goto search;
7174 7388
@@ -7185,7 +7399,20 @@ loop:
7185 */ 7399 */
7186 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) { 7400 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
7187 index = 0; 7401 index = 0;
7188 loop++; 7402 if (loop == LOOP_CACHING_NOWAIT) {
7403 /*
7404 * We want to skip the LOOP_CACHING_WAIT step if we
 7405 * don't have any uncached bgs and we've already done a
7406 * full search through.
7407 */
7408 if (orig_have_caching_bg || !full_search)
7409 loop = LOOP_CACHING_WAIT;
7410 else
7411 loop = LOOP_ALLOC_CHUNK;
7412 } else {
7413 loop++;
7414 }
7415
7189 if (loop == LOOP_ALLOC_CHUNK) { 7416 if (loop == LOOP_ALLOC_CHUNK) {
7190 struct btrfs_trans_handle *trans; 7417 struct btrfs_trans_handle *trans;
7191 int exist = 0; 7418 int exist = 0;
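
For reference, a minimal standalone model of the loop progression introduced above, using simplified stand-in types rather than the kernel's (only the LOOP_* names mirror the real enum): a first pass that was a full search with no uncached block groups skips LOOP_CACHING_WAIT and goes straight to chunk allocation.

#include <stdbool.h>
#include <stdio.h>

/*
 * Simplified model of the allocator loop states; everything beyond the
 * enum names is a stand-in for illustration.
 */
enum alloc_loop {
	LOOP_CACHING_NOWAIT,
	LOOP_CACHING_WAIT,
	LOOP_ALLOC_CHUNK,
	LOOP_NO_EMPTY_SIZE,
};

static enum alloc_loop next_loop(enum alloc_loop loop,
				 bool orig_have_caching_bg, bool full_search)
{
	if (loop == LOOP_CACHING_NOWAIT) {
		/*
		 * Waiting on caching only pays off if some block group was
		 * still caching, or if the first pass was not a full search.
		 */
		if (orig_have_caching_bg || !full_search)
			return LOOP_CACHING_WAIT;
		return LOOP_ALLOC_CHUNK;
	}
	return loop + 1;
}

int main(void)
{
	/* Full search, nothing left to cache: skip LOOP_CACHING_WAIT. */
	printf("%d\n", next_loop(LOOP_CACHING_NOWAIT, false, true)); /* 2 */
	/* A block group was still caching: wait for it. */
	printf("%d\n", next_loop(LOOP_CACHING_NOWAIT, true, true));  /* 1 */
	return 0;
}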
@@ -7203,6 +7430,15 @@ loop:
7203 7430
7204 ret = do_chunk_alloc(trans, root, flags, 7431 ret = do_chunk_alloc(trans, root, flags,
7205 CHUNK_ALLOC_FORCE); 7432 CHUNK_ALLOC_FORCE);
7433
7434 /*
7435 * If we can't allocate a new chunk we've already looped
7436 * through at least once, move on to the NO_EMPTY_SIZE
7437 * case.
7438 */
7439 if (ret == -ENOSPC)
7440 loop = LOOP_NO_EMPTY_SIZE;
7441
7206 /* 7442 /*
7207 * Do not bail out on ENOSPC since we 7443 * Do not bail out on ENOSPC since we
7208 * can do more things. 7444 * can do more things.
@@ -7219,6 +7455,15 @@ loop:
7219 } 7455 }
7220 7456
7221 if (loop == LOOP_NO_EMPTY_SIZE) { 7457 if (loop == LOOP_NO_EMPTY_SIZE) {
7458 /*
7459 * Don't loop again if we already have no empty_size and
7460 * no empty_cluster.
7461 */
7462 if (empty_size == 0 &&
7463 empty_cluster == 0) {
7464 ret = -ENOSPC;
7465 goto out;
7466 }
7222 empty_size = 0; 7467 empty_size = 0;
7223 empty_cluster = 0; 7468 empty_cluster = 0;
7224 } 7469 }
@@ -7227,11 +7472,20 @@ loop:
7227 } else if (!ins->objectid) { 7472 } else if (!ins->objectid) {
7228 ret = -ENOSPC; 7473 ret = -ENOSPC;
7229 } else if (ins->objectid) { 7474 } else if (ins->objectid) {
7475 if (!use_cluster && last_ptr) {
7476 spin_lock(&last_ptr->lock);
7477 last_ptr->window_start = ins->objectid;
7478 spin_unlock(&last_ptr->lock);
7479 }
7230 ret = 0; 7480 ret = 0;
7231 } 7481 }
7232out: 7482out:
7233 if (ret == -ENOSPC) 7483 if (ret == -ENOSPC) {
7484 spin_lock(&space_info->lock);
7485 space_info->max_extent_size = max_extent_size;
7486 spin_unlock(&space_info->lock);
7234 ins->offset = max_extent_size; 7487 ins->offset = max_extent_size;
7488 }
7235 return ret; 7489 return ret;
7236} 7490}
7237 7491
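
The out: path now publishes the largest extent the failed search saw into space_info->max_extent_size, which the check at the top of find_free_extent (first hunk of this section) uses to fail oversized requests without another full scan. A self-contained sketch of that fail-fast caching, with hypothetical stand-in types:

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the per-space-info cache of the largest free extent seen. */
struct space_info_model {
	uint64_t max_extent_size; /* 0 means "unknown" */
};

/* Returns 0 on success, -1 (ENOSPC) when the cached bound already rules
 * the request out, so no search is attempted at all. */
static int reserve(struct space_info_model *si, uint64_t num_bytes,
		   uint64_t largest_free)
{
	if (si->max_extent_size && num_bytes > si->max_extent_size)
		return -1; /* fail fast from the cached bound */
	if (num_bytes > largest_free) {
		si->max_extent_size = largest_free; /* remember for next time */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct space_info_model si = { 0 };

	/* The first oversized request pays for a full search... */
	printf("%d\n", reserve(&si, 1 << 20, 64 << 10));  /* -1 */
	/* ...later ones are rejected from the cache immediately. */
	printf("%d\n", reserve(&si, 2 << 20, 64 << 10));  /* -1 */
	printf("%d\n", reserve(&si, 32 << 10, 64 << 10)); /* 0 */
	return 0;
}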
@@ -7280,7 +7534,7 @@ int btrfs_reserve_extent(struct btrfs_root *root,
7280 u64 empty_size, u64 hint_byte, 7534 u64 empty_size, u64 hint_byte,
7281 struct btrfs_key *ins, int is_data, int delalloc) 7535 struct btrfs_key *ins, int is_data, int delalloc)
7282{ 7536{
7283 bool final_tried = false; 7537 bool final_tried = num_bytes == min_alloc_size;
7284 u64 flags; 7538 u64 flags;
7285 int ret; 7539 int ret;
7286 7540
@@ -7429,8 +7683,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7429 struct btrfs_root *root, 7683 struct btrfs_root *root,
7430 u64 parent, u64 root_objectid, 7684 u64 parent, u64 root_objectid,
7431 u64 flags, struct btrfs_disk_key *key, 7685 u64 flags, struct btrfs_disk_key *key,
7432 int level, struct btrfs_key *ins, 7686 int level, struct btrfs_key *ins)
7433 int no_quota)
7434{ 7687{
7435 int ret; 7688 int ret;
7436 struct btrfs_fs_info *fs_info = root->fs_info; 7689 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -7511,7 +7764,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7511int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 7764int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7512 struct btrfs_root *root, 7765 struct btrfs_root *root,
7513 u64 root_objectid, u64 owner, 7766 u64 root_objectid, u64 owner,
7514 u64 offset, struct btrfs_key *ins) 7767 u64 offset, u64 ram_bytes,
7768 struct btrfs_key *ins)
7515{ 7769{
7516 int ret; 7770 int ret;
7517 7771
@@ -7520,7 +7774,8 @@ int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7520 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid, 7774 ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
7521 ins->offset, 0, 7775 ins->offset, 0,
7522 root_objectid, owner, offset, 7776 root_objectid, owner, offset,
7523 BTRFS_ADD_DELAYED_EXTENT, NULL, 0); 7777 ram_bytes, BTRFS_ADD_DELAYED_EXTENT,
7778 NULL);
7524 return ret; 7779 return ret;
7525} 7780}
7526 7781
@@ -7734,7 +7989,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7734 ins.objectid, ins.offset, 7989 ins.objectid, ins.offset,
7735 parent, root_objectid, level, 7990 parent, root_objectid, level,
7736 BTRFS_ADD_DELAYED_EXTENT, 7991 BTRFS_ADD_DELAYED_EXTENT,
7737 extent_op, 0); 7992 extent_op);
7738 if (ret) 7993 if (ret)
7739 goto out_free_delayed; 7994 goto out_free_delayed;
7740 } 7995 }
@@ -8275,14 +8530,15 @@ skip:
8275 ret = account_shared_subtree(trans, root, next, 8530 ret = account_shared_subtree(trans, root, next,
8276 generation, level - 1); 8531 generation, level - 1);
8277 if (ret) { 8532 if (ret) {
8278 printk_ratelimited(KERN_ERR "BTRFS: %s Error " 8533 btrfs_err_rl(root->fs_info,
8534 "Error "
8279 "%d accounting shared subtree. Quota " 8535 "%d accounting shared subtree. Quota "
8280 "is out of sync, rescan required.\n", 8536 "is out of sync, rescan required.",
8281 root->fs_info->sb->s_id, ret); 8537 ret);
8282 } 8538 }
8283 } 8539 }
8284 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent, 8540 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
8285 root->root_key.objectid, level - 1, 0, 0); 8541 root->root_key.objectid, level - 1, 0);
8286 BUG_ON(ret); /* -ENOMEM */ 8542 BUG_ON(ret); /* -ENOMEM */
8287 } 8543 }
8288 btrfs_tree_unlock(next); 8544 btrfs_tree_unlock(next);
@@ -8367,10 +8623,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
8367 BUG_ON(ret); /* -ENOMEM */ 8623 BUG_ON(ret); /* -ENOMEM */
8368 ret = account_leaf_items(trans, root, eb); 8624 ret = account_leaf_items(trans, root, eb);
8369 if (ret) { 8625 if (ret) {
8370 printk_ratelimited(KERN_ERR "BTRFS: %s Error " 8626 btrfs_err_rl(root->fs_info,
8627 "error "
8371 "%d accounting leaf items. Quota " 8628 "%d accounting leaf items. Quota "
8372 "is out of sync, rescan required.\n", 8629 "is out of sync, rescan required.",
8373 root->fs_info->sb->s_id, ret); 8630 ret);
8374 } 8631 }
8375 } 8632 }
8376 /* make block locked assertion in clean_tree_block happy */ 8633 /* make block locked assertion in clean_tree_block happy */
@@ -8692,7 +8949,7 @@ out:
8692 if (!for_reloc && root_dropped == false) 8949 if (!for_reloc && root_dropped == false)
8693 btrfs_add_dead_root(root); 8950 btrfs_add_dead_root(root);
8694 if (err && err != -EAGAIN) 8951 if (err && err != -EAGAIN)
8695 btrfs_std_error(root->fs_info, err); 8952 btrfs_std_error(root->fs_info, err, NULL);
8696 return err; 8953 return err;
8697} 8954}
8698 8955
@@ -8880,7 +9137,7 @@ again:
8880 * back off and let this transaction commit 9137 * back off and let this transaction commit
8881 */ 9138 */
8882 mutex_lock(&root->fs_info->ro_block_group_mutex); 9139 mutex_lock(&root->fs_info->ro_block_group_mutex);
8883 if (trans->transaction->dirty_bg_run) { 9140 if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
8884 u64 transid = trans->transid; 9141 u64 transid = trans->transid;
8885 9142
8886 mutex_unlock(&root->fs_info->ro_block_group_mutex); 9143 mutex_unlock(&root->fs_info->ro_block_group_mutex);
@@ -9630,6 +9887,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
9630 9887
9631 free_excluded_extents(root, cache); 9888 free_excluded_extents(root, cache);
9632 9889
9890#ifdef CONFIG_BTRFS_DEBUG
9891 if (btrfs_should_fragment_free_space(root, cache)) {
9892 u64 new_bytes_used = size - bytes_used;
9893
9894 bytes_used += new_bytes_used >> 1;
9895 fragment_free_space(root, cache);
9896 }
9897#endif
9633 /* 9898 /*
9634 * Call to ensure the corresponding space_info object is created and 9899 * Call to ensure the corresponding space_info object is created and
9635 * assigned to our block group, but don't update its counters just yet. 9900 * assigned to our block group, but don't update its counters just yet.
@@ -10370,8 +10635,7 @@ void btrfs_end_write_no_snapshoting(struct btrfs_root *root)
10370{ 10635{
10371 percpu_counter_dec(&root->subv_writers->counter); 10636 percpu_counter_dec(&root->subv_writers->counter);
10372 /* 10637 /*
10373 * Make sure counter is updated before we wake up 10638 * Make sure counter is updated before we wake up waiters.
10374 * waiters.
10375 */ 10639 */
10376 smp_mb(); 10640 smp_mb();
10377 if (waitqueue_active(&root->subv_writers->wait)) 10641 if (waitqueue_active(&root->subv_writers->wait))
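
The comment trimmed above guards a classic ordering pattern: the counter update must be globally visible before the waitqueue_active() check, or a concurrent waiter can miss its wakeup. A rough userspace analogue with C11 atomics standing in for smp_mb() and the waitqueue (all names here are illustrative):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int counter = 1;
static atomic_bool waiter_queued = false;

/* Writer side: update the counter, then make it visible before checking
 * for waiters, mirroring percpu_counter_dec(); smp_mb();
 * waitqueue_active(). */
static void end_write(void)
{
	atomic_fetch_sub_explicit(&counter, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst); /* the smp_mb() analogue */
	if (atomic_load_explicit(&waiter_queued, memory_order_relaxed))
		printf("wake up waiter\n");
}

int main(void)
{
	atomic_store(&waiter_queued, true);
	end_write();
	return 0;
}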
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3915c9473e94..33a01ea41465 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -96,8 +96,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
96 inode = tree->mapping->host; 96 inode = tree->mapping->host;
97 isize = i_size_read(inode); 97 isize = i_size_read(inode);
98 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { 98 if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
99 printk_ratelimited(KERN_DEBUG 99 btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
100 "BTRFS: %s: ino %llu isize %llu odd range [%llu,%llu]\n", 100 "%s: ino %llu isize %llu odd range [%llu,%llu]",
101 caller, btrfs_ino(inode), isize, start, end); 101 caller, btrfs_ino(inode), isize, start, end);
102 } 102 }
103} 103}
@@ -131,6 +131,25 @@ struct extent_page_data {
131 unsigned int sync_io:1; 131 unsigned int sync_io:1;
132}; 132};
133 133
134static void add_extent_changeset(struct extent_state *state, unsigned bits,
135 struct extent_changeset *changeset,
136 int set)
137{
138 int ret;
139
140 if (!changeset)
141 return;
142 if (set && (state->state & bits) == bits)
143 return;
144 if (!set && (state->state & bits) == 0)
145 return;
146 changeset->bytes_changed += state->end - state->start + 1;
147 ret = ulist_add(changeset->range_changed, state->start, state->end,
148 GFP_ATOMIC);
149 /* ENOMEM */
150 BUG_ON(ret < 0);
151}
152
134static noinline void flush_write_bio(void *data); 153static noinline void flush_write_bio(void *data);
135static inline struct btrfs_fs_info * 154static inline struct btrfs_fs_info *
136tree_fs_info(struct extent_io_tree *tree) 155tree_fs_info(struct extent_io_tree *tree)
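
Note that add_extent_changeset() above skips no-op transitions, so bytes_changed counts only bytes whose bits actually flip. A self-contained model of that guard (a plain struct instead of extent_state, and no ulist):

#include <stdint.h>
#include <stdio.h>

struct state_model {
	uint64_t start, end;
	unsigned state;
};

static uint64_t bytes_changed;

static void record_change(struct state_model *s, unsigned bits, int set)
{
	if (set && (s->state & bits) == bits)
		return; /* already fully set: no-op */
	if (!set && (s->state & bits) == 0)
		return; /* already clear: no-op */
	bytes_changed += s->end - s->start + 1;
}

int main(void)
{
	struct state_model s = { .start = 0, .end = 4095, .state = 0x1 };

	record_change(&s, 0x1, 1); /* no-op, bit already set */
	record_change(&s, 0x2, 1); /* counted */
	record_change(&s, 0x4, 0); /* no-op, bit already clear */
	printf("%llu\n", (unsigned long long)bytes_changed); /* 4096 */
	return 0;
}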
@@ -410,7 +429,8 @@ static void clear_state_cb(struct extent_io_tree *tree,
410} 429}
411 430
412static void set_state_bits(struct extent_io_tree *tree, 431static void set_state_bits(struct extent_io_tree *tree,
413 struct extent_state *state, unsigned *bits); 432 struct extent_state *state, unsigned *bits,
433 struct extent_changeset *changeset);
414 434
415/* 435/*
416 * insert an extent_state struct into the tree. 'bits' are set on the 436 * insert an extent_state struct into the tree. 'bits' are set on the
@@ -426,7 +446,7 @@ static int insert_state(struct extent_io_tree *tree,
426 struct extent_state *state, u64 start, u64 end, 446 struct extent_state *state, u64 start, u64 end,
427 struct rb_node ***p, 447 struct rb_node ***p,
428 struct rb_node **parent, 448 struct rb_node **parent,
429 unsigned *bits) 449 unsigned *bits, struct extent_changeset *changeset)
430{ 450{
431 struct rb_node *node; 451 struct rb_node *node;
432 452
@@ -436,7 +456,7 @@ static int insert_state(struct extent_io_tree *tree,
436 state->start = start; 456 state->start = start;
437 state->end = end; 457 state->end = end;
438 458
439 set_state_bits(tree, state, bits); 459 set_state_bits(tree, state, bits, changeset);
440 460
441 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); 461 node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent);
442 if (node) { 462 if (node) {
@@ -511,7 +531,8 @@ static struct extent_state *next_state(struct extent_state *state)
511 */ 531 */
512static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 532static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
513 struct extent_state *state, 533 struct extent_state *state,
514 unsigned *bits, int wake) 534 unsigned *bits, int wake,
535 struct extent_changeset *changeset)
515{ 536{
516 struct extent_state *next; 537 struct extent_state *next;
517 unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS; 538 unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
@@ -522,6 +543,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
522 tree->dirty_bytes -= range; 543 tree->dirty_bytes -= range;
523 } 544 }
524 clear_state_cb(tree, state, bits); 545 clear_state_cb(tree, state, bits);
546 add_extent_changeset(state, bits_to_clear, changeset, 0);
525 state->state &= ~bits_to_clear; 547 state->state &= ~bits_to_clear;
526 if (wake) 548 if (wake)
527 wake_up(&state->wq); 549 wake_up(&state->wq);
@@ -569,10 +591,10 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
569 * 591 *
570 * This takes the tree lock, and returns 0 on success and < 0 on error. 592 * This takes the tree lock, and returns 0 on success and < 0 on error.
571 */ 593 */
572int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 594static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
573 unsigned bits, int wake, int delete, 595 unsigned bits, int wake, int delete,
574 struct extent_state **cached_state, 596 struct extent_state **cached_state,
575 gfp_t mask) 597 gfp_t mask, struct extent_changeset *changeset)
576{ 598{
577 struct extent_state *state; 599 struct extent_state *state;
578 struct extent_state *cached; 600 struct extent_state *cached;
@@ -671,7 +693,8 @@ hit_next:
671 if (err) 693 if (err)
672 goto out; 694 goto out;
673 if (state->end <= end) { 695 if (state->end <= end) {
674 state = clear_state_bit(tree, state, &bits, wake); 696 state = clear_state_bit(tree, state, &bits, wake,
697 changeset);
675 goto next; 698 goto next;
676 } 699 }
677 goto search_again; 700 goto search_again;
@@ -692,13 +715,13 @@ hit_next:
692 if (wake) 715 if (wake)
693 wake_up(&state->wq); 716 wake_up(&state->wq);
694 717
695 clear_state_bit(tree, prealloc, &bits, wake); 718 clear_state_bit(tree, prealloc, &bits, wake, changeset);
696 719
697 prealloc = NULL; 720 prealloc = NULL;
698 goto out; 721 goto out;
699 } 722 }
700 723
701 state = clear_state_bit(tree, state, &bits, wake); 724 state = clear_state_bit(tree, state, &bits, wake, changeset);
702next: 725next:
703 if (last_end == (u64)-1) 726 if (last_end == (u64)-1)
704 goto out; 727 goto out;
@@ -789,7 +812,7 @@ out:
789 812
790static void set_state_bits(struct extent_io_tree *tree, 813static void set_state_bits(struct extent_io_tree *tree,
791 struct extent_state *state, 814 struct extent_state *state,
792 unsigned *bits) 815 unsigned *bits, struct extent_changeset *changeset)
793{ 816{
794 unsigned bits_to_set = *bits & ~EXTENT_CTLBITS; 817 unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
795 818
@@ -798,6 +821,7 @@ static void set_state_bits(struct extent_io_tree *tree,
798 u64 range = state->end - state->start + 1; 821 u64 range = state->end - state->start + 1;
799 tree->dirty_bytes += range; 822 tree->dirty_bytes += range;
800 } 823 }
824 add_extent_changeset(state, bits_to_set, changeset, 1);
801 state->state |= bits_to_set; 825 state->state |= bits_to_set;
802} 826}
803 827
@@ -835,7 +859,7 @@ static int __must_check
835__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 859__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
836 unsigned bits, unsigned exclusive_bits, 860 unsigned bits, unsigned exclusive_bits,
837 u64 *failed_start, struct extent_state **cached_state, 861 u64 *failed_start, struct extent_state **cached_state,
838 gfp_t mask) 862 gfp_t mask, struct extent_changeset *changeset)
839{ 863{
840 struct extent_state *state; 864 struct extent_state *state;
841 struct extent_state *prealloc = NULL; 865 struct extent_state *prealloc = NULL;
@@ -873,7 +897,7 @@ again:
873 prealloc = alloc_extent_state_atomic(prealloc); 897 prealloc = alloc_extent_state_atomic(prealloc);
874 BUG_ON(!prealloc); 898 BUG_ON(!prealloc);
875 err = insert_state(tree, prealloc, start, end, 899 err = insert_state(tree, prealloc, start, end,
876 &p, &parent, &bits); 900 &p, &parent, &bits, changeset);
877 if (err) 901 if (err)
878 extent_io_tree_panic(tree, err); 902 extent_io_tree_panic(tree, err);
879 903
@@ -899,7 +923,7 @@ hit_next:
899 goto out; 923 goto out;
900 } 924 }
901 925
902 set_state_bits(tree, state, &bits); 926 set_state_bits(tree, state, &bits, changeset);
903 cache_state(state, cached_state); 927 cache_state(state, cached_state);
904 merge_state(tree, state); 928 merge_state(tree, state);
905 if (last_end == (u64)-1) 929 if (last_end == (u64)-1)
@@ -945,7 +969,7 @@ hit_next:
945 if (err) 969 if (err)
946 goto out; 970 goto out;
947 if (state->end <= end) { 971 if (state->end <= end) {
948 set_state_bits(tree, state, &bits); 972 set_state_bits(tree, state, &bits, changeset);
949 cache_state(state, cached_state); 973 cache_state(state, cached_state);
950 merge_state(tree, state); 974 merge_state(tree, state);
951 if (last_end == (u64)-1) 975 if (last_end == (u64)-1)
@@ -980,7 +1004,7 @@ hit_next:
980 * the later extent. 1004 * the later extent.
981 */ 1005 */
982 err = insert_state(tree, prealloc, start, this_end, 1006 err = insert_state(tree, prealloc, start, this_end,
983 NULL, NULL, &bits); 1007 NULL, NULL, &bits, changeset);
984 if (err) 1008 if (err)
985 extent_io_tree_panic(tree, err); 1009 extent_io_tree_panic(tree, err);
986 1010
@@ -1008,7 +1032,7 @@ hit_next:
1008 if (err) 1032 if (err)
1009 extent_io_tree_panic(tree, err); 1033 extent_io_tree_panic(tree, err);
1010 1034
1011 set_state_bits(tree, prealloc, &bits); 1035 set_state_bits(tree, prealloc, &bits, changeset);
1012 cache_state(prealloc, cached_state); 1036 cache_state(prealloc, cached_state);
1013 merge_state(tree, prealloc); 1037 merge_state(tree, prealloc);
1014 prealloc = NULL; 1038 prealloc = NULL;
@@ -1038,7 +1062,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1038 struct extent_state **cached_state, gfp_t mask) 1062 struct extent_state **cached_state, gfp_t mask)
1039{ 1063{
1040 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 1064 return __set_extent_bit(tree, start, end, bits, 0, failed_start,
1041 cached_state, mask); 1065 cached_state, mask, NULL);
1042} 1066}
1043 1067
1044 1068
@@ -1111,7 +1135,7 @@ again:
1111 goto out; 1135 goto out;
1112 } 1136 }
1113 err = insert_state(tree, prealloc, start, end, 1137 err = insert_state(tree, prealloc, start, end,
1114 &p, &parent, &bits); 1138 &p, &parent, &bits, NULL);
1115 if (err) 1139 if (err)
1116 extent_io_tree_panic(tree, err); 1140 extent_io_tree_panic(tree, err);
1117 cache_state(prealloc, cached_state); 1141 cache_state(prealloc, cached_state);
@@ -1130,9 +1154,9 @@ hit_next:
1130 * Just lock what we found and keep going 1154 * Just lock what we found and keep going
1131 */ 1155 */
1132 if (state->start == start && state->end <= end) { 1156 if (state->start == start && state->end <= end) {
1133 set_state_bits(tree, state, &bits); 1157 set_state_bits(tree, state, &bits, NULL);
1134 cache_state(state, cached_state); 1158 cache_state(state, cached_state);
1135 state = clear_state_bit(tree, state, &clear_bits, 0); 1159 state = clear_state_bit(tree, state, &clear_bits, 0, NULL);
1136 if (last_end == (u64)-1) 1160 if (last_end == (u64)-1)
1137 goto out; 1161 goto out;
1138 start = last_end + 1; 1162 start = last_end + 1;
@@ -1171,9 +1195,10 @@ hit_next:
1171 if (err) 1195 if (err)
1172 goto out; 1196 goto out;
1173 if (state->end <= end) { 1197 if (state->end <= end) {
1174 set_state_bits(tree, state, &bits); 1198 set_state_bits(tree, state, &bits, NULL);
1175 cache_state(state, cached_state); 1199 cache_state(state, cached_state);
1176 state = clear_state_bit(tree, state, &clear_bits, 0); 1200 state = clear_state_bit(tree, state, &clear_bits, 0,
1201 NULL);
1177 if (last_end == (u64)-1) 1202 if (last_end == (u64)-1)
1178 goto out; 1203 goto out;
1179 start = last_end + 1; 1204 start = last_end + 1;
@@ -1208,7 +1233,7 @@ hit_next:
1208 * the later extent. 1233 * the later extent.
1209 */ 1234 */
1210 err = insert_state(tree, prealloc, start, this_end, 1235 err = insert_state(tree, prealloc, start, this_end,
1211 NULL, NULL, &bits); 1236 NULL, NULL, &bits, NULL);
1212 if (err) 1237 if (err)
1213 extent_io_tree_panic(tree, err); 1238 extent_io_tree_panic(tree, err);
1214 cache_state(prealloc, cached_state); 1239 cache_state(prealloc, cached_state);
@@ -1233,9 +1258,9 @@ hit_next:
1233 if (err) 1258 if (err)
1234 extent_io_tree_panic(tree, err); 1259 extent_io_tree_panic(tree, err);
1235 1260
1236 set_state_bits(tree, prealloc, &bits); 1261 set_state_bits(tree, prealloc, &bits, NULL);
1237 cache_state(prealloc, cached_state); 1262 cache_state(prealloc, cached_state);
1238 clear_state_bit(tree, prealloc, &clear_bits, 0); 1263 clear_state_bit(tree, prealloc, &clear_bits, 0, NULL);
1239 prealloc = NULL; 1264 prealloc = NULL;
1240 goto out; 1265 goto out;
1241 } 1266 }
@@ -1274,6 +1299,30 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1274 NULL, mask); 1299 NULL, mask);
1275} 1300}
1276 1301
1302int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1303 unsigned bits, gfp_t mask,
1304 struct extent_changeset *changeset)
1305{
1306 /*
 1307 * We don't support EXTENT_LOCKED yet, as the current changeset will
 1308 * record any bits changed, so for the EXTENT_LOCKED case it would
 1309 * either fail with -EEXIST or the changeset would record the whole
 1310 * range.
1311 */
1312 BUG_ON(bits & EXTENT_LOCKED);
1313
1314 return __set_extent_bit(tree, start, end, bits, 0, NULL, NULL, mask,
1315 changeset);
1316}
1317
1318int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1319 unsigned bits, int wake, int delete,
1320 struct extent_state **cached, gfp_t mask)
1321{
1322 return __clear_extent_bit(tree, start, end, bits, wake, delete,
1323 cached, mask, NULL);
1324}
1325
1277int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1326int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1278 unsigned bits, gfp_t mask) 1327 unsigned bits, gfp_t mask)
1279{ 1328{
@@ -1285,6 +1334,20 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1285 return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); 1334 return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
1286} 1335}
1287 1336
1337int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1338 unsigned bits, gfp_t mask,
1339 struct extent_changeset *changeset)
1340{
1341 /*
1342 * Don't support EXTENT_LOCKED case, same reason as
1343 * set_record_extent_bits().
1344 */
1345 BUG_ON(bits & EXTENT_LOCKED);
1346
1347 return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask,
1348 changeset);
1349}
1350
1288int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 1351int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1289 struct extent_state **cached_state, gfp_t mask) 1352 struct extent_state **cached_state, gfp_t mask)
1290{ 1353{
@@ -1343,7 +1406,7 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1343 while (1) { 1406 while (1) {
1344 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 1407 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1345 EXTENT_LOCKED, &failed_start, 1408 EXTENT_LOCKED, &failed_start,
1346 cached_state, GFP_NOFS); 1409 cached_state, GFP_NOFS, NULL);
1347 if (err == -EEXIST) { 1410 if (err == -EEXIST) {
1348 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); 1411 wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1349 start = failed_start; 1412 start = failed_start;
@@ -1365,7 +1428,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1365 u64 failed_start; 1428 u64 failed_start;
1366 1429
1367 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, 1430 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1368 &failed_start, NULL, GFP_NOFS); 1431 &failed_start, NULL, GFP_NOFS, NULL);
1369 if (err == -EEXIST) { 1432 if (err == -EEXIST) {
1370 if (failed_start > start) 1433 if (failed_start > start)
1371 clear_extent_bit(tree, start, failed_start - 1, 1434 clear_extent_bit(tree, start, failed_start - 1,
@@ -2078,8 +2141,8 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
2078 return -EIO; 2141 return -EIO;
2079 } 2142 }
2080 2143
2081 printk_ratelimited_in_rcu(KERN_INFO 2144 btrfs_info_rl_in_rcu(fs_info,
2082 "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n", 2145 "read error corrected: ino %llu off %llu (dev %s sector %llu)",
2083 btrfs_ino(inode), start, 2146 btrfs_ino(inode), start,
2084 rcu_str_deref(dev->name), sector); 2147 rcu_str_deref(dev->name), sector);
2085 bio_put(bio); 2148 bio_put(bio);
@@ -3070,8 +3133,12 @@ static int __do_readpage(struct extent_io_tree *tree,
3070 3133
3071 set_extent_uptodate(tree, cur, cur + iosize - 1, 3134 set_extent_uptodate(tree, cur, cur + iosize - 1,
3072 &cached, GFP_NOFS); 3135 &cached, GFP_NOFS);
3073 unlock_extent_cached(tree, cur, cur + iosize - 1,
3074 &cached, GFP_NOFS);
 3136 if (parent_locked)
 3137 free_extent_state(cached);
3138 else
3139 unlock_extent_cached(tree, cur,
3140 cur + iosize - 1,
3141 &cached, GFP_NOFS);
3075 cur = cur + iosize; 3142 cur = cur + iosize;
3076 pg_offset += iosize; 3143 pg_offset += iosize;
3077 continue; 3144 continue;
@@ -5566,13 +5633,15 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5566 unsigned long src_i; 5633 unsigned long src_i;
5567 5634
5568 if (src_offset + len > dst->len) { 5635 if (src_offset + len > dst->len) {
5569 printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " 5636 btrfs_err(dst->fs_info,
5570 "len %lu dst len %lu\n", src_offset, len, dst->len); 5637 "memmove bogus src_offset %lu move "
5638 "len %lu dst len %lu", src_offset, len, dst->len);
5571 BUG_ON(1); 5639 BUG_ON(1);
5572 } 5640 }
5573 if (dst_offset + len > dst->len) { 5641 if (dst_offset + len > dst->len) {
5574 printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " 5642 btrfs_err(dst->fs_info,
5575 "len %lu dst len %lu\n", dst_offset, len, dst->len); 5643 "memmove bogus dst_offset %lu move "
5644 "len %lu dst len %lu", dst_offset, len, dst->len);
5576 BUG_ON(1); 5645 BUG_ON(1);
5577 } 5646 }
5578 5647
@@ -5612,13 +5681,13 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
5612 unsigned long src_i; 5681 unsigned long src_i;
5613 5682
5614 if (src_offset + len > dst->len) { 5683 if (src_offset + len > dst->len) {
5615 printk(KERN_ERR "BTRFS: memmove bogus src_offset %lu move " 5684 btrfs_err(dst->fs_info, "memmove bogus src_offset %lu move "
5616 "len %lu len %lu\n", src_offset, len, dst->len); 5685 "len %lu len %lu", src_offset, len, dst->len);
5617 BUG_ON(1); 5686 BUG_ON(1);
5618 } 5687 }
5619 if (dst_offset + len > dst->len) { 5688 if (dst_offset + len > dst->len) {
5620 printk(KERN_ERR "BTRFS: memmove bogus dst_offset %lu move " 5689 btrfs_err(dst->fs_info, "memmove bogus dst_offset %lu move "
5621 "len %lu len %lu\n", dst_offset, len, dst->len); 5690 "len %lu len %lu", dst_offset, len, dst->len);
5622 BUG_ON(1); 5691 BUG_ON(1);
5623 } 5692 }
5624 if (dst_offset < src_offset) { 5693 if (dst_offset < src_offset) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index c668f36898d3..f4c1ae11855f 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -2,6 +2,7 @@
2#define __EXTENTIO__ 2#define __EXTENTIO__
3 3
4#include <linux/rbtree.h> 4#include <linux/rbtree.h>
5#include "ulist.h"
5 6
6/* bits for the extent state */ 7/* bits for the extent state */
7#define EXTENT_DIRTY (1U << 0) 8#define EXTENT_DIRTY (1U << 0)
@@ -18,6 +19,7 @@
18#define EXTENT_NEED_WAIT (1U << 13) 19#define EXTENT_NEED_WAIT (1U << 13)
19#define EXTENT_DAMAGED (1U << 14) 20#define EXTENT_DAMAGED (1U << 14)
20#define EXTENT_NORESERVE (1U << 15) 21#define EXTENT_NORESERVE (1U << 15)
22#define EXTENT_QGROUP_RESERVED (1U << 16)
21#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 23#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
22#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 24#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
23 25
@@ -161,6 +163,17 @@ struct extent_buffer {
161#endif 163#endif
162}; 164};
163 165
166/*
167 * Structure to record how many bytes and which ranges are set/cleared
168 */
169struct extent_changeset {
170 /* How many bytes are set/cleared in this operation */
171 u64 bytes_changed;
172
173 /* Changed ranges */
174 struct ulist *range_changed;
175};
176
164static inline void extent_set_compress_type(unsigned long *bio_flags, 177static inline void extent_set_compress_type(unsigned long *bio_flags,
165 int compress_type) 178 int compress_type)
166{ 179{
@@ -210,11 +223,17 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
210 struct extent_state *cached_state); 223 struct extent_state *cached_state);
211int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 224int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
212 unsigned bits, gfp_t mask); 225 unsigned bits, gfp_t mask);
226int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
227 unsigned bits, gfp_t mask,
228 struct extent_changeset *changeset);
213int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 229int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
214 unsigned bits, int wake, int delete, 230 unsigned bits, int wake, int delete,
215 struct extent_state **cached, gfp_t mask); 231 struct extent_state **cached, gfp_t mask);
216int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 232int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
217 unsigned bits, gfp_t mask); 233 unsigned bits, gfp_t mask);
234int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
235 unsigned bits, gfp_t mask,
236 struct extent_changeset *changeset);
218int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 237int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
219 unsigned bits, u64 *failed_start, 238 unsigned bits, u64 *failed_start,
220 struct extent_state **cached_state, gfp_t mask); 239 struct extent_state **cached_state, gfp_t mask);
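
Seen from a caller, the changeset API pairs a ulist of ranges with a byte counter. A sketch of the intended usage, modeled on the new qgroup data-reservation path; it will not compile on its own, and qgroup_charge() is a hypothetical stand-in for the real accounting call:

/* sketch: find out exactly which bytes a set operation newly dirtied */
struct extent_changeset changeset;
int ret;

changeset.bytes_changed = 0;
changeset.range_changed = ulist_alloc(GFP_NOFS);
if (!changeset.range_changed)
	return -ENOMEM;

ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
			     start + len - 1, EXTENT_QGROUP_RESERVED,
			     GFP_NOFS, &changeset);
if (!ret)
	qgroup_charge(changeset.bytes_changed); /* hypothetical helper */

ulist_free(changeset.range_changed);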
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8c6f247ba81d..6bd5ce9d75f0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -847,7 +847,7 @@ next_slot:
847 disk_bytenr, num_bytes, 0, 847 disk_bytenr, num_bytes, 0,
848 root->root_key.objectid, 848 root->root_key.objectid,
849 new_key.objectid, 849 new_key.objectid,
850 start - extent_offset, 1); 850 start - extent_offset);
851 BUG_ON(ret); /* -ENOMEM */ 851 BUG_ON(ret); /* -ENOMEM */
852 } 852 }
853 key.offset = start; 853 key.offset = start;
@@ -925,7 +925,7 @@ delete_extent_item:
925 disk_bytenr, num_bytes, 0, 925 disk_bytenr, num_bytes, 0,
926 root->root_key.objectid, 926 root->root_key.objectid,
927 key.objectid, key.offset - 927 key.objectid, key.offset -
928 extent_offset, 0); 928 extent_offset);
929 BUG_ON(ret); /* -ENOMEM */ 929 BUG_ON(ret); /* -ENOMEM */
930 inode_sub_bytes(inode, 930 inode_sub_bytes(inode,
931 extent_end - key.offset); 931 extent_end - key.offset);
@@ -1204,7 +1204,7 @@ again:
1204 1204
1205 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 1205 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
1206 root->root_key.objectid, 1206 root->root_key.objectid,
1207 ino, orig_offset, 1); 1207 ino, orig_offset);
1208 BUG_ON(ret); /* -ENOMEM */ 1208 BUG_ON(ret); /* -ENOMEM */
1209 1209
1210 if (split == start) { 1210 if (split == start) {
@@ -1231,7 +1231,7 @@ again:
1231 del_nr++; 1231 del_nr++;
1232 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1232 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1233 0, root->root_key.objectid, 1233 0, root->root_key.objectid,
1234 ino, orig_offset, 0); 1234 ino, orig_offset);
1235 BUG_ON(ret); /* -ENOMEM */ 1235 BUG_ON(ret); /* -ENOMEM */
1236 } 1236 }
1237 other_start = 0; 1237 other_start = 0;
@@ -1248,7 +1248,7 @@ again:
1248 del_nr++; 1248 del_nr++;
1249 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1249 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1250 0, root->root_key.objectid, 1250 0, root->root_key.objectid,
1251 ino, orig_offset, 0); 1251 ino, orig_offset);
1252 BUG_ON(ret); /* -ENOMEM */ 1252 BUG_ON(ret); /* -ENOMEM */
1253 } 1253 }
1254 if (del_nr == 0) { 1254 if (del_nr == 0) {
@@ -1469,7 +1469,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1469 u64 release_bytes = 0; 1469 u64 release_bytes = 0;
1470 u64 lockstart; 1470 u64 lockstart;
1471 u64 lockend; 1471 u64 lockend;
1472 unsigned long first_index;
1473 size_t num_written = 0; 1472 size_t num_written = 0;
1474 int nrptrs; 1473 int nrptrs;
1475 int ret = 0; 1474 int ret = 0;
@@ -1485,8 +1484,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1485 if (!pages) 1484 if (!pages)
1486 return -ENOMEM; 1485 return -ENOMEM;
1487 1486
1488 first_index = pos >> PAGE_CACHE_SHIFT;
1489
1490 while (iov_iter_count(i) > 0) { 1487 while (iov_iter_count(i) > 0) {
1491 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1488 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1492 size_t write_bytes = min(iov_iter_count(i), 1489 size_t write_bytes = min(iov_iter_count(i),
@@ -1510,12 +1507,17 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1510 } 1507 }
1511 1508
1512 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1509 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1513 ret = btrfs_check_data_free_space(inode, reserve_bytes, write_bytes);
1514 if (ret == -ENOSPC &&
1515 (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
1516 BTRFS_INODE_PREALLOC))) {
 1510
 1511 if (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
 1512 BTRFS_INODE_PREALLOC)) {
1517 ret = check_can_nocow(inode, pos, &write_bytes); 1513 ret = check_can_nocow(inode, pos, &write_bytes);
1514 if (ret < 0)
1515 break;
1518 if (ret > 0) { 1516 if (ret > 0) {
1517 /*
1518 * For nodata cow case, no need to reserve
1519 * data space.
1520 */
1519 only_release_metadata = true; 1521 only_release_metadata = true;
1520 /* 1522 /*
1521 * our prealloc extent may be smaller than 1523 * our prealloc extent may be smaller than
@@ -1524,20 +1526,19 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1524 num_pages = DIV_ROUND_UP(write_bytes + offset, 1526 num_pages = DIV_ROUND_UP(write_bytes + offset,
1525 PAGE_CACHE_SIZE); 1527 PAGE_CACHE_SIZE);
1526 reserve_bytes = num_pages << PAGE_CACHE_SHIFT; 1528 reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
1527 ret = 0;
1528 } else {
1529 ret = -ENOSPC;
 1529 goto reserve_metadata;
1530 } 1530 }
1531 } 1531 }
1532 1532 ret = btrfs_check_data_free_space(inode, pos, write_bytes);
1533 if (ret) 1533 if (ret < 0)
1534 break; 1534 break;
1535 1535
1536reserve_metadata:
1536 ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes); 1537 ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
1537 if (ret) { 1538 if (ret) {
1538 if (!only_release_metadata) 1539 if (!only_release_metadata)
1539 btrfs_free_reserved_data_space(inode, 1540 btrfs_free_reserved_data_space(inode, pos,
1540 reserve_bytes); 1541 write_bytes);
1541 else 1542 else
1542 btrfs_end_write_no_snapshoting(root); 1543 btrfs_end_write_no_snapshoting(root);
1543 break; 1544 break;
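
The reordered logic above asks the nocow question before reserving data space: a write that can go nocow reserves only metadata. A standalone model of that decision order (all names illustrative; a return of 1 marks the branch where the kernel would call btrfs_check_data_free_space()):

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins: in the kernel these are inode flags and helper results. */
static int decide_reservation(bool nodatacow_or_prealloc, int can_nocow,
			      bool *only_release_metadata)
{
	*only_release_metadata = false;

	if (nodatacow_or_prealloc) {
		if (can_nocow < 0)
			return can_nocow;	/* propagate the error */
		if (can_nocow > 0) {
			/* Overwriting in place: no data space needed,
			 * only metadata will be reserved. */
			*only_release_metadata = true;
			return 0;
		}
	}
	/* COW write: data space must be reserved up front. */
	return 1;
}

int main(void)
{
	bool meta_only;

	printf("%d\n", decide_reservation(true, 1, &meta_only));  /* 0 */
	printf("%d\n", decide_reservation(true, 0, &meta_only));  /* 1 */
	printf("%d\n", decide_reservation(false, 0, &meta_only)); /* 1 */
	return 0;
}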
@@ -1603,12 +1604,17 @@ again:
1603 BTRFS_I(inode)->outstanding_extents++; 1604 BTRFS_I(inode)->outstanding_extents++;
1604 spin_unlock(&BTRFS_I(inode)->lock); 1605 spin_unlock(&BTRFS_I(inode)->lock);
1605 } 1606 }
1606 if (only_release_metadata) 1607 if (only_release_metadata) {
1607 btrfs_delalloc_release_metadata(inode, 1608 btrfs_delalloc_release_metadata(inode,
1608 release_bytes); 1609 release_bytes);
1609 else 1610 } else {
1610 btrfs_delalloc_release_space(inode, 1611 u64 __pos;
1612
1613 __pos = round_down(pos, root->sectorsize) +
1614 (dirty_pages << PAGE_CACHE_SHIFT);
1615 btrfs_delalloc_release_space(inode, __pos,
1611 release_bytes); 1616 release_bytes);
1617 }
1612 } 1618 }
1613 1619
1614 release_bytes = dirty_pages << PAGE_CACHE_SHIFT; 1620 release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
@@ -1660,7 +1666,7 @@ again:
1660 btrfs_end_write_no_snapshoting(root); 1666 btrfs_end_write_no_snapshoting(root);
1661 btrfs_delalloc_release_metadata(inode, release_bytes); 1667 btrfs_delalloc_release_metadata(inode, release_bytes);
1662 } else { 1668 } else {
1663 btrfs_delalloc_release_space(inode, release_bytes); 1669 btrfs_delalloc_release_space(inode, pos, release_bytes);
1664 } 1670 }
1665 } 1671 }
1666 1672
@@ -2266,7 +2272,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2266 u64 drop_end; 2272 u64 drop_end;
2267 int ret = 0; 2273 int ret = 0;
2268 int err = 0; 2274 int err = 0;
2269 int rsv_count; 2275 unsigned int rsv_count;
2270 bool same_page; 2276 bool same_page;
2271 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES); 2277 bool no_holes = btrfs_fs_incompat(root->fs_info, NO_HOLES);
2272 u64 ino_size; 2278 u64 ino_size;
@@ -2488,6 +2494,19 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2488 2494
2489 trans->block_rsv = &root->fs_info->trans_block_rsv; 2495 trans->block_rsv = &root->fs_info->trans_block_rsv;
2490 /* 2496 /*
 2497 * If we are using the NO_HOLES feature we might have already had a
2498 * hole that overlaps a part of the region [lockstart, lockend] and
2499 * ends at (or beyond) lockend. Since we have no file extent items to
2500 * represent holes, drop_end can be less than lockend and so we must
2501 * make sure we have an extent map representing the existing hole (the
2502 * call to __btrfs_drop_extents() might have dropped the existing extent
2503 * map representing the existing hole), otherwise the fast fsync path
2504 * will not record the existence of the hole region
2505 * [existing_hole_start, lockend].
2506 */
2507 if (drop_end <= lockend)
2508 drop_end = lockend + 1;
2509 /*
2491 * Don't insert file hole extent item if it's for a range beyond eof 2510 * Don't insert file hole extent item if it's for a range beyond eof
2492 * (because it's useless) or if it represents a 0 bytes range (when 2511 * (because it's useless) or if it represents a 0 bytes range (when
2493 * cur_offset == drop_end). 2512 * cur_offset == drop_end).
@@ -2541,17 +2560,61 @@ out_only_mutex:
2541 return err; 2560 return err;
2542} 2561}
2543 2562
2563/* Helper structure to record which range is already reserved */
2564struct falloc_range {
2565 struct list_head list;
2566 u64 start;
2567 u64 len;
2568};
2569
2570/*
2571 * Helper function to add falloc range
2572 *
 2574 * Caller should have locked the larger extent range containing
2574 * [start, len)
2575 */
2576static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2577{
2578 struct falloc_range *prev = NULL;
2579 struct falloc_range *range = NULL;
2580
2581 if (list_empty(head))
2582 goto insert;
2583
2584 /*
 2585 * As fallocate iterates in bytenr order, we only need to check
2586 * the last range.
2587 */
2588 prev = list_entry(head->prev, struct falloc_range, list);
2589 if (prev->start + prev->len == start) {
2590 prev->len += len;
2591 return 0;
2592 }
2593insert:
2594 range = kmalloc(sizeof(*range), GFP_NOFS);
2595 if (!range)
2596 return -ENOMEM;
2597 range->start = start;
2598 range->len = len;
2599 list_add_tail(&range->list, head);
2600 return 0;
2601}
2602
2544static long btrfs_fallocate(struct file *file, int mode, 2603static long btrfs_fallocate(struct file *file, int mode,
2545 loff_t offset, loff_t len) 2604 loff_t offset, loff_t len)
2546{ 2605{
2547 struct inode *inode = file_inode(file); 2606 struct inode *inode = file_inode(file);
2548 struct extent_state *cached_state = NULL; 2607 struct extent_state *cached_state = NULL;
2608 struct falloc_range *range;
2609 struct falloc_range *tmp;
2610 struct list_head reserve_list;
2549 u64 cur_offset; 2611 u64 cur_offset;
2550 u64 last_byte; 2612 u64 last_byte;
2551 u64 alloc_start; 2613 u64 alloc_start;
2552 u64 alloc_end; 2614 u64 alloc_end;
2553 u64 alloc_hint = 0; 2615 u64 alloc_hint = 0;
2554 u64 locked_end; 2616 u64 locked_end;
2617 u64 actual_end = 0;
2555 struct extent_map *em; 2618 struct extent_map *em;
2556 int blocksize = BTRFS_I(inode)->root->sectorsize; 2619 int blocksize = BTRFS_I(inode)->root->sectorsize;
2557 int ret; 2620 int ret;
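
Because btrfs_fallocate() walks the file in ascending offset order, add_falloc_range() above only ever has to inspect the tail of the list to merge contiguous ranges. A self-contained model of the coalescing, using a plain array in place of the list_head:

#include <stdint.h>
#include <stdio.h>

struct range { uint64_t start, len; };

static struct range ranges[16];
static int nranges;

/* Merge into the last range when contiguous, append otherwise; this is
 * valid only because callers add ranges in ascending order. */
static void add_range(uint64_t start, uint64_t len)
{
	if (nranges &&
	    ranges[nranges - 1].start + ranges[nranges - 1].len == start) {
		ranges[nranges - 1].len += len;
		return;
	}
	ranges[nranges].start = start;
	ranges[nranges].len = len;
	nranges++;
}

int main(void)
{
	add_range(0, 4096);
	add_range(4096, 4096);  /* contiguous: merges with the first */
	add_range(16384, 4096); /* gap: new entry */
	printf("%d ranges, first len %llu\n", nranges,
	       (unsigned long long)ranges[0].len); /* 2 ranges, 8192 */
	return 0;
}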
@@ -2567,11 +2630,12 @@ static long btrfs_fallocate(struct file *file, int mode,
2567 return btrfs_punch_hole(inode, offset, len); 2630 return btrfs_punch_hole(inode, offset, len);
2568 2631
2569 /* 2632 /*
2570 * Make sure we have enough space before we do the 2633 * Only trigger disk allocation, don't trigger qgroup reserve
2571 * allocation. 2634 *
2635 * For qgroup space, it will be checked later.
2572 */ 2636 */
2573 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start, alloc_end - alloc_start); 2637 ret = btrfs_alloc_data_chunk_ondemand(inode, alloc_end - alloc_start);
2574 if (ret) 2638 if (ret < 0)
2575 return ret; 2639 return ret;
2576 2640
2577 mutex_lock(&inode->i_mutex); 2641 mutex_lock(&inode->i_mutex);
@@ -2579,6 +2643,13 @@ static long btrfs_fallocate(struct file *file, int mode,
2579 if (ret) 2643 if (ret)
2580 goto out; 2644 goto out;
2581 2645
2646 /*
2647 * TODO: Move these two operations after we have checked
2648 * accurate reserved space, or fallocate can still fail but
2649 * with page truncated or size expanded.
2650 *
2651 * But that's a minor problem and won't do much harm BTW.
2652 */
2582 if (alloc_start > inode->i_size) { 2653 if (alloc_start > inode->i_size) {
2583 ret = btrfs_cont_expand(inode, i_size_read(inode), 2654 ret = btrfs_cont_expand(inode, i_size_read(inode),
2584 alloc_start); 2655 alloc_start);
@@ -2637,10 +2708,10 @@ static long btrfs_fallocate(struct file *file, int mode,
2637 } 2708 }
2638 } 2709 }
2639 2710
2711 /* First, check if we exceed the qgroup limit */
2712 INIT_LIST_HEAD(&reserve_list);
2640 cur_offset = alloc_start; 2713 cur_offset = alloc_start;
2641 while (1) { 2714 while (1) {
2642 u64 actual_end;
2643
2644 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 2715 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
2645 alloc_end - cur_offset, 0); 2716 alloc_end - cur_offset, 0);
2646 if (IS_ERR_OR_NULL(em)) { 2717 if (IS_ERR_OR_NULL(em)) {
@@ -2653,57 +2724,82 @@ static long btrfs_fallocate(struct file *file, int mode,
2653 last_byte = min(extent_map_end(em), alloc_end); 2724 last_byte = min(extent_map_end(em), alloc_end);
2654 actual_end = min_t(u64, extent_map_end(em), offset + len); 2725 actual_end = min_t(u64, extent_map_end(em), offset + len);
2655 last_byte = ALIGN(last_byte, blocksize); 2726 last_byte = ALIGN(last_byte, blocksize);
2656
2657 if (em->block_start == EXTENT_MAP_HOLE || 2727 if (em->block_start == EXTENT_MAP_HOLE ||
2658 (cur_offset >= inode->i_size && 2728 (cur_offset >= inode->i_size &&
2659 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 2729 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
2660 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
2661 last_byte - cur_offset,
2662 1 << inode->i_blkbits,
2663 offset + len,
2664 &alloc_hint);
 2730 ret = add_falloc_range(&reserve_list, cur_offset,
 2731 last_byte - cur_offset);
 2732 if (ret < 0) {
 2733 free_extent_map(em);
 2734 break;
2665 } else if (actual_end > inode->i_size &&
2666 !(mode & FALLOC_FL_KEEP_SIZE)) {
2667 struct btrfs_trans_handle *trans;
2668 struct btrfs_root *root = BTRFS_I(inode)->root;
2669
2670 /*
2671 * We didn't need to allocate any more space, but we
2672 * still extended the size of the file so we need to
2673 * update i_size and the inode item.
2674 */
2675 trans = btrfs_start_transaction(root, 1);
2676 if (IS_ERR(trans)) {
2677 ret = PTR_ERR(trans);
2678 } else {
2679 inode->i_ctime = CURRENT_TIME;
2680 i_size_write(inode, actual_end);
2681 btrfs_ordered_update_i_size(inode, actual_end,
2682 NULL);
2683 ret = btrfs_update_inode(trans, root, inode);
2684 if (ret)
2685 btrfs_end_transaction(trans, root);
2686 else
2687 ret = btrfs_end_transaction(trans,
2688 root);
2689 } 2735 }
2736 ret = btrfs_qgroup_reserve_data(inode, cur_offset,
2737 last_byte - cur_offset);
2738 if (ret < 0)
2739 break;
2690 } 2740 }
2691 free_extent_map(em); 2741 free_extent_map(em);
2692 if (ret < 0)
2693 break;
2694
2695 cur_offset = last_byte; 2742 cur_offset = last_byte;
2696 if (cur_offset >= alloc_end) { 2743 if (cur_offset >= alloc_end)
2697 ret = 0;
2698 break; 2744 break;
2745 }
2746
2747 /*
 2748 * If ret is still 0, it means we're OK to fallocate.
 2749 * Otherwise, just clean up the list and exit.
2750 */
2751 list_for_each_entry_safe(range, tmp, &reserve_list, list) {
2752 if (!ret)
2753 ret = btrfs_prealloc_file_range(inode, mode,
2754 range->start,
2755 range->len, 1 << inode->i_blkbits,
2756 offset + len, &alloc_hint);
2757 list_del(&range->list);
2758 kfree(range);
2759 }
2760 if (ret < 0)
2761 goto out_unlock;
2762
2763 if (actual_end > inode->i_size &&
2764 !(mode & FALLOC_FL_KEEP_SIZE)) {
2765 struct btrfs_trans_handle *trans;
2766 struct btrfs_root *root = BTRFS_I(inode)->root;
2767
2768 /*
2769 * We didn't need to allocate any more space, but we
2770 * still extended the size of the file so we need to
2771 * update i_size and the inode item.
2772 */
2773 trans = btrfs_start_transaction(root, 1);
2774 if (IS_ERR(trans)) {
2775 ret = PTR_ERR(trans);
2776 } else {
2777 inode->i_ctime = CURRENT_TIME;
2778 i_size_write(inode, actual_end);
2779 btrfs_ordered_update_i_size(inode, actual_end, NULL);
2780 ret = btrfs_update_inode(trans, root, inode);
2781 if (ret)
2782 btrfs_end_transaction(trans, root);
2783 else
2784 ret = btrfs_end_transaction(trans, root);
2699 } 2785 }
2700 } 2786 }
2787out_unlock:
2701 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 2788 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
2702 &cached_state, GFP_NOFS); 2789 &cached_state, GFP_NOFS);
2703out: 2790out:
2791 /*
 2792 * As we have waited on the extent range, the data_rsv_map must be
 2793 * empty in the range: any written data range will have been released
 2794 * from it, and a preallocated extent is also released when its
 2795 * metadata is written.
 2796 * So this call is purely cleanup.
2797 */
2798 btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
2704 mutex_unlock(&inode->i_mutex); 2799 mutex_unlock(&inode->i_mutex);
2705 /* Let go of our reservation. */ 2800 /* Let go of our reservation. */
2706 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); 2801 btrfs_free_reserved_data_space(inode, alloc_start,
2802 alloc_end - alloc_start);
2707 return ret; 2803 return ret;
2708} 2804}
2709 2805
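
The rewritten btrfs_fallocate() is a two-phase shape: reserve qgroup space for every hole first, commit the preallocations only if every reservation succeeded, and always run a final cleanup (btrfs_qgroup_free_data() above) for anything left over. A compact standalone model of that shape, with illustrative names:

#include <stdio.h>

static int two_phase(int nholes, int fail_at)
{
	int reserved = 0, committed = 0, ret = 0, i;

	/* Phase 1: per-hole quota reservation, abort early on failure. */
	for (i = 0; i < nholes; i++) {
		if (i == fail_at) {
			ret = -1;
			break;
		}
		reserved++;
	}

	/* Phase 2: commit each range, but only if phase 1 fully succeeded. */
	for (i = 0; i < reserved && !ret; i++)
		committed++;

	/* Unconditional cleanup of reservations that were never committed. */
	printf("ret=%d reserved=%d committed=%d released=%d\n",
	       ret, reserved, committed, reserved - committed);
	return ret;
}

int main(void)
{
	two_phase(3, -1); /* ret=0 reserved=3 committed=3 released=0 */
	two_phase(3, 1);  /* ret=-1 reserved=1 committed=0 released=1 */
	return 0;
}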
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index abe3a66bd3ba..0948d34cb84a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -450,9 +450,9 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation)
450 450
451 gen = io_ctl->cur; 451 gen = io_ctl->cur;
452 if (le64_to_cpu(*gen) != generation) { 452 if (le64_to_cpu(*gen) != generation) {
453 printk_ratelimited(KERN_ERR "BTRFS: space cache generation " 453 btrfs_err_rl(io_ctl->root->fs_info,
454 "(%Lu) does not match inode (%Lu)\n", *gen, 454 "space cache generation (%llu) does not match inode (%llu)",
455 generation); 455 *gen, generation);
456 io_ctl_unmap_page(io_ctl); 456 io_ctl_unmap_page(io_ctl);
457 return -EIO; 457 return -EIO;
458 } 458 }
@@ -506,8 +506,8 @@ static int io_ctl_check_crc(struct btrfs_io_ctl *io_ctl, int index)
506 PAGE_CACHE_SIZE - offset); 506 PAGE_CACHE_SIZE - offset);
507 btrfs_csum_final(crc, (char *)&crc); 507 btrfs_csum_final(crc, (char *)&crc);
508 if (val != crc) { 508 if (val != crc) {
509 printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free " 509 btrfs_err_rl(io_ctl->root->fs_info,
510 "space cache\n"); 510 "csum mismatch on free space cache");
511 io_ctl_unmap_page(io_ctl); 511 io_ctl_unmap_page(io_ctl);
512 return -EIO; 512 return -EIO;
513 } 513 }
@@ -1215,7 +1215,7 @@ out:
1215 * @offset - the offset for the key we'll insert 1215 * @offset - the offset for the key we'll insert
1216 * 1216 *
1217 * This function writes out a free space cache struct to disk for quick recovery 1217 * This function writes out a free space cache struct to disk for quick recovery
1218 * on mount. This will return 0 if it was successfull in writing the cache out, 1218 * on mount. This will return 0 if it was successful in writing the cache out,
1219 * or an errno if it was not. 1219 * or an errno if it was not.
1220 */ 1220 */
1221static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 1221static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
@@ -1730,7 +1730,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1730 */ 1730 */
1731static int search_bitmap(struct btrfs_free_space_ctl *ctl, 1731static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1732 struct btrfs_free_space *bitmap_info, u64 *offset, 1732 struct btrfs_free_space *bitmap_info, u64 *offset,
1733 u64 *bytes) 1733 u64 *bytes, bool for_alloc)
1734{ 1734{
1735 unsigned long found_bits = 0; 1735 unsigned long found_bits = 0;
1736 unsigned long max_bits = 0; 1736 unsigned long max_bits = 0;
@@ -1738,11 +1738,26 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1738 unsigned long next_zero; 1738 unsigned long next_zero;
1739 unsigned long extent_bits; 1739 unsigned long extent_bits;
1740 1740
1741 /*
1742 * Skip searching the bitmap if we don't have a contiguous section that
1743 * is large enough for this allocation.
1744 */
1745 if (for_alloc &&
1746 bitmap_info->max_extent_size &&
1747 bitmap_info->max_extent_size < *bytes) {
1748 *bytes = bitmap_info->max_extent_size;
1749 return -1;
1750 }
1751
1741 i = offset_to_bit(bitmap_info->offset, ctl->unit, 1752 i = offset_to_bit(bitmap_info->offset, ctl->unit,
1742 max_t(u64, *offset, bitmap_info->offset)); 1753 max_t(u64, *offset, bitmap_info->offset));
1743 bits = bytes_to_bits(*bytes, ctl->unit); 1754 bits = bytes_to_bits(*bytes, ctl->unit);
1744 1755
1745 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) { 1756 for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1757 if (for_alloc && bits == 1) {
1758 found_bits = 1;
1759 break;
1760 }
1746 next_zero = find_next_zero_bit(bitmap_info->bitmap, 1761 next_zero = find_next_zero_bit(bitmap_info->bitmap,
1747 BITS_PER_BITMAP, i); 1762 BITS_PER_BITMAP, i);
1748 extent_bits = next_zero - i; 1763 extent_bits = next_zero - i;
@@ -1762,6 +1777,7 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1762 } 1777 }
1763 1778
1764 *bytes = (u64)(max_bits) * ctl->unit; 1779 *bytes = (u64)(max_bits) * ctl->unit;
1780 bitmap_info->max_extent_size = *bytes;
1765 return -1; 1781 return -1;
1766} 1782}
1767 1783
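
search_bitmap() now caches the largest contiguous free run it found in bitmap_info->max_extent_size, so later allocations can skip bitmaps that cannot possibly fit; setting bits invalidates the cache (see add_bytes_to_bitmap() further down). A self-contained model over a 64-bit bitmap where set bits represent free space:

#include <stdint.h>
#include <stdio.h>

#define NBITS 64

struct bitmap_model {
	uint64_t bits;
	unsigned max_run;	/* cached largest run of set bits, 0 = unknown */
};

/* Returns the largest contiguous run of set (free) bits, caching it. */
static unsigned max_free_run(struct bitmap_model *b)
{
	unsigned run = 0, best = 0, i;

	for (i = 0; i < NBITS; i++) {
		if (b->bits & (1ULL << i))
			run++;
		else
			run = 0;
		if (run > best)
			best = run;
	}
	b->max_run = best;
	return best;
}

/* Fail fast when the cached bound already rules the request out. */
static int can_satisfy(struct bitmap_model *b, unsigned want)
{
	if (b->max_run && b->max_run < want)
		return 0;		/* skip the scan entirely */
	return max_free_run(b) >= want;
}

int main(void)
{
	struct bitmap_model b = { .bits = 0xF0F0, .max_run = 0 };

	printf("%d\n", can_satisfy(&b, 4)); /* 1: a 4-bit run exists */
	printf("%d\n", can_satisfy(&b, 8)); /* 0: cached bound says no */
	b.bits |= 0xFF00000;		/* setting bits... */
	b.max_run = 0;			/* ...invalidates the cache */
	printf("%d\n", can_satisfy(&b, 8)); /* 1 after a rescan */
	return 0;
}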
@@ -1813,7 +1829,7 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1813 if (entry->bitmap) { 1829 if (entry->bitmap) {
1814 u64 size = *bytes; 1830 u64 size = *bytes;
1815 1831
1816 ret = search_bitmap(ctl, entry, &tmp, &size); 1832 ret = search_bitmap(ctl, entry, &tmp, &size, true);
1817 if (!ret) { 1833 if (!ret) {
1818 *offset = tmp; 1834 *offset = tmp;
1819 *bytes = size; 1835 *bytes = size;
@@ -1874,7 +1890,8 @@ again:
1874 search_start = *offset; 1890 search_start = *offset;
1875 search_bytes = ctl->unit; 1891 search_bytes = ctl->unit;
1876 search_bytes = min(search_bytes, end - search_start + 1); 1892 search_bytes = min(search_bytes, end - search_start + 1);
1877 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes); 1893 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes,
1894 false);
1878 if (ret < 0 || search_start != *offset) 1895 if (ret < 0 || search_start != *offset)
1879 return -EINVAL; 1896 return -EINVAL;
1880 1897
@@ -1919,7 +1936,7 @@ again:
1919 search_start = *offset; 1936 search_start = *offset;
1920 search_bytes = ctl->unit; 1937 search_bytes = ctl->unit;
1921 ret = search_bitmap(ctl, bitmap_info, &search_start, 1938 ret = search_bitmap(ctl, bitmap_info, &search_start,
1922 &search_bytes); 1939 &search_bytes, false);
1923 if (ret < 0 || search_start != *offset) 1940 if (ret < 0 || search_start != *offset)
1924 return -EAGAIN; 1941 return -EAGAIN;
1925 1942
@@ -1943,6 +1960,12 @@ static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1943 1960
1944 bitmap_set_bits(ctl, info, offset, bytes_to_set); 1961 bitmap_set_bits(ctl, info, offset, bytes_to_set);
1945 1962
1963 /*
1964 * We set some bytes, we have no idea what the max extent size is
1965 * anymore.
1966 */
1967 info->max_extent_size = 0;
1968
1946 return bytes_to_set; 1969 return bytes_to_set;
1947 1970
1948} 1971}
@@ -1951,12 +1974,19 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1951 struct btrfs_free_space *info) 1974 struct btrfs_free_space *info)
1952{ 1975{
1953 struct btrfs_block_group_cache *block_group = ctl->private; 1976 struct btrfs_block_group_cache *block_group = ctl->private;
1977 bool forced = false;
1978
1979#ifdef CONFIG_BTRFS_DEBUG
1980 if (btrfs_should_fragment_free_space(block_group->fs_info->extent_root,
1981 block_group))
1982 forced = true;
1983#endif
1954 1984
1955 /* 1985 /*
1956 * If we are below the extents threshold then we can add this as an 1986 * If we are below the extents threshold then we can add this as an
1957 * extent, and don't have to deal with the bitmap 1987 * extent, and don't have to deal with the bitmap
1958 */ 1988 */
1959 if (ctl->free_extents < ctl->extents_thresh) { 1989 if (!forced && ctl->free_extents < ctl->extents_thresh) {
1960 /* 1990 /*
1961 * If this block group has some small extents we don't want to 1991 * If this block group has some small extents we don't want to
1962 * use up all of our free slots in the cache with them, we want 1992 * use up all of our free slots in the cache with them, we want
@@ -2661,7 +2691,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2661 search_start = min_start; 2691 search_start = min_start;
2662 search_bytes = bytes; 2692 search_bytes = bytes;
2663 2693
2664 err = search_bitmap(ctl, entry, &search_start, &search_bytes); 2694 err = search_bitmap(ctl, entry, &search_start, &search_bytes, true);
2665 if (err) { 2695 if (err) {
2666 if (search_bytes > *max_extent_size) 2696 if (search_bytes > *max_extent_size)
2667 *max_extent_size = search_bytes; 2697 *max_extent_size = search_bytes;
@@ -2775,6 +2805,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2775 unsigned long want_bits; 2805 unsigned long want_bits;
2776 unsigned long min_bits; 2806 unsigned long min_bits;
2777 unsigned long found_bits; 2807 unsigned long found_bits;
2808 unsigned long max_bits = 0;
2778 unsigned long start = 0; 2809 unsigned long start = 0;
2779 unsigned long total_found = 0; 2810 unsigned long total_found = 0;
2780 int ret; 2811 int ret;
@@ -2784,6 +2815,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2784 want_bits = bytes_to_bits(bytes, ctl->unit); 2815 want_bits = bytes_to_bits(bytes, ctl->unit);
2785 min_bits = bytes_to_bits(min_bytes, ctl->unit); 2816 min_bits = bytes_to_bits(min_bytes, ctl->unit);
2786 2817
2818 /*
2819 * Don't bother looking for a cluster in this bitmap if it's heavily
2820 * fragmented.
2821 */
2822 if (entry->max_extent_size &&
2823 entry->max_extent_size < cont1_bytes)
2824 return -ENOSPC;
2787again: 2825again:
2788 found_bits = 0; 2826 found_bits = 0;
2789 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) { 2827 for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
@@ -2791,13 +2829,19 @@ again:
2791 BITS_PER_BITMAP, i); 2829 BITS_PER_BITMAP, i);
2792 if (next_zero - i >= min_bits) { 2830 if (next_zero - i >= min_bits) {
2793 found_bits = next_zero - i; 2831 found_bits = next_zero - i;
2832 if (found_bits > max_bits)
2833 max_bits = found_bits;
2794 break; 2834 break;
2795 } 2835 }
2836 if (next_zero - i > max_bits)
2837 max_bits = next_zero - i;
2796 i = next_zero; 2838 i = next_zero;
2797 } 2839 }
2798 2840
2799 if (!found_bits) 2841 if (!found_bits) {
2842 entry->max_extent_size = (u64)max_bits * ctl->unit;
2800 return -ENOSPC; 2843 return -ENOSPC;
2844 }
2801 2845
2802 if (!total_found) { 2846 if (!total_found) {
2803 start = i; 2847 start = i;
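
On a failed pass, btrfs_bitmap_cluster now remembers the longest run it did see (max_bits) in entry->max_extent_size, which is what the -ENOSPC short circuit added before the again: label consults. The scan itself is an ordinary longest-run computation; a hedged standalone sketch, using plain loops instead of the kernel's for_each_set_bit_from:

    #include <stdint.h>

    /*
     * Longest run of set bits -- the value the failed search caches,
     * scaled by ctl->unit, into entry->max_extent_size.
     */
    static unsigned int longest_set_run(const uint8_t *bitmap,
                                        unsigned int nbits)
    {
        unsigned int best = 0, cur = 0;

        for (unsigned int i = 0; i < nbits; i++) {
            if (bitmap[i / 8] & (1u << (i % 8))) {
                cur++;
                if (cur > best)
                    best = cur;
            } else {
                cur = 0;
            }
        }
        return best;
    }

    /* The cheap pre-check: 0 means unknown, anything else is exact. */
    static int worth_searching(uint64_t cached_max_bytes,
                               uint64_t cont1_bytes)
    {
        return cached_max_bytes == 0 || cached_max_bytes >= cont1_bytes;
    }
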
@@ -3056,6 +3100,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
3056 spin_lock_init(&cluster->refill_lock); 3100 spin_lock_init(&cluster->refill_lock);
3057 cluster->root = RB_ROOT; 3101 cluster->root = RB_ROOT;
3058 cluster->max_size = 0; 3102 cluster->max_size = 0;
3103 cluster->fragmented = false;
3059 INIT_LIST_HEAD(&cluster->block_group_list); 3104 INIT_LIST_HEAD(&cluster->block_group_list);
3060 cluster->block_group = NULL; 3105 cluster->block_group = NULL;
3061} 3106}
@@ -3223,7 +3268,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
3223 } 3268 }
3224 3269
3225 bytes = minlen; 3270 bytes = minlen;
3226 ret2 = search_bitmap(ctl, entry, &start, &bytes); 3271 ret2 = search_bitmap(ctl, entry, &start, &bytes, false);
3227 if (ret2 || start >= end) { 3272 if (ret2 || start >= end) {
3228 spin_unlock(&ctl->tree_lock); 3273 spin_unlock(&ctl->tree_lock);
3229 mutex_unlock(&ctl->cache_writeout_mutex); 3274 mutex_unlock(&ctl->cache_writeout_mutex);
@@ -3376,7 +3421,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
3376 u64 count = 1; 3421 u64 count = 1;
3377 int ret; 3422 int ret;
3378 3423
3379 ret = search_bitmap(ctl, entry, &offset, &count); 3424 ret = search_bitmap(ctl, entry, &offset, &count, true);
3380 /* Logic error; Should be empty if it can't find anything */ 3425 /* Logic error; Should be empty if it can't find anything */
3381 ASSERT(!ret); 3426 ASSERT(!ret);
3382 3427
@@ -3532,6 +3577,7 @@ again:
3532 spin_lock(&ctl->tree_lock); 3577 spin_lock(&ctl->tree_lock);
3533 info->offset = offset; 3578 info->offset = offset;
3534 info->bytes = bytes; 3579 info->bytes = bytes;
3580 info->max_extent_size = 0;
3535 ret = link_free_space(ctl, info); 3581 ret = link_free_space(ctl, info);
3536 spin_unlock(&ctl->tree_lock); 3582 spin_unlock(&ctl->tree_lock);
3537 if (ret) 3583 if (ret)
@@ -3559,6 +3605,7 @@ again:
3559 } 3605 }
3560 3606
3561 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); 3607 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
3608
3562 bytes -= bytes_added; 3609 bytes -= bytes_added;
3563 offset += bytes_added; 3610 offset += bytes_added;
3564 spin_unlock(&ctl->tree_lock); 3611 spin_unlock(&ctl->tree_lock);
@@ -3602,7 +3649,7 @@ have_info:
3602 3649
3603 bit_off = offset; 3650 bit_off = offset;
3604 bit_bytes = ctl->unit; 3651 bit_bytes = ctl->unit;
3605 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes); 3652 ret = search_bitmap(ctl, info, &bit_off, &bit_bytes, false);
3606 if (!ret) { 3653 if (!ret) {
3607 if (bit_off == offset) { 3654 if (bit_off == offset) {
3608 ret = 1; 3655 ret = 1;
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index a16a029ad3b1..f251865eb6f3 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -23,6 +23,7 @@ struct btrfs_free_space {
23 struct rb_node offset_index; 23 struct rb_node offset_index;
24 u64 offset; 24 u64 offset;
25 u64 bytes; 25 u64 bytes;
26 u64 max_extent_size;
26 unsigned long *bitmap; 27 unsigned long *bitmap;
27 struct list_head list; 28 struct list_head list;
28}; 29};
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 265e03c73f4d..be4d22a5022f 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
157 */ 157 */
158 if (!btrfs_find_name_in_ext_backref(path, ref_objectid, 158 if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
159 name, name_len, &extref)) { 159 name, name_len, &extref)) {
160 btrfs_std_error(root->fs_info, -ENOENT); 160 btrfs_std_error(root->fs_info, -ENOENT, NULL);
161 ret = -EROFS; 161 ret = -EROFS;
162 goto out; 162 goto out;
163 } 163 }
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index d4a582ac3f73..767a6056ac45 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -488,17 +488,17 @@ again:
488 /* Just to make sure we have enough space */ 488 /* Just to make sure we have enough space */
489 prealloc += 8 * PAGE_CACHE_SIZE; 489 prealloc += 8 * PAGE_CACHE_SIZE;
490 490
491 ret = btrfs_delalloc_reserve_space(inode, prealloc); 491 ret = btrfs_delalloc_reserve_space(inode, 0, prealloc);
492 if (ret) 492 if (ret)
493 goto out_put; 493 goto out_put;
494 494
495 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 495 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
496 prealloc, prealloc, &alloc_hint); 496 prealloc, prealloc, &alloc_hint);
497 if (ret) { 497 if (ret) {
498 btrfs_delalloc_release_space(inode, prealloc); 498 btrfs_delalloc_release_space(inode, 0, prealloc);
499 goto out_put; 499 goto out_put;
500 } 500 }
501 btrfs_free_reserved_data_space(inode, prealloc); 501 btrfs_free_reserved_data_space(inode, 0, prealloc);
502 502
503 ret = btrfs_write_out_ino_cache(root, trans, path, inode); 503 ret = btrfs_write_out_ino_cache(root, trans, path, inode);
504out_put: 504out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 611b66d73e80..4439fbb4ff45 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -310,6 +310,13 @@ static noinline int cow_file_range_inline(struct btrfs_root *root,
310 btrfs_delalloc_release_metadata(inode, end + 1 - start); 310 btrfs_delalloc_release_metadata(inode, end + 1 - start);
311 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 311 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
312out: 312out:
313 /*
 314	 * Don't forget to free the reserved space: an inlined extent
 315	 * won't count as a data extent, so free the space directly here.
316 * And at reserve time, it's always aligned to page size, so
317 * just free one page here.
318 */
319 btrfs_qgroup_free_data(inode, 0, PAGE_CACHE_SIZE);
313 btrfs_free_path(path); 320 btrfs_free_path(path);
314 btrfs_end_transaction(trans, root); 321 btrfs_end_transaction(trans, root);
315 return ret; 322 return ret;
@@ -1096,6 +1103,9 @@ static noinline void async_cow_submit(struct btrfs_work *work)
1096 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> 1103 nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
1097 PAGE_CACHE_SHIFT; 1104 PAGE_CACHE_SHIFT;
1098 1105
1106 /*
1107 * atomic_sub_return implies a barrier for waitqueue_active
1108 */
1099 if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < 1109 if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
1100 5 * 1024 * 1024 && 1110 5 * 1024 * 1024 &&
1101 waitqueue_active(&root->fs_info->async_submit_wait)) 1111 waitqueue_active(&root->fs_info->async_submit_wait))
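
Several hunks in this series (here, and in locking.c and ordered-data.c below) add only comments recording why the unlocked waitqueue_active() test is safe: the atomic operation just before it implies a full memory barrier, so the counter update is visible before the queue is inspected and a wakeup cannot be lost. The same discipline in a hedged pthreads sketch (the kernel primitives are analogous, not these calls):

    #include <stdatomic.h>
    #include <pthread.h>

    static atomic_int pending;  /* in-flight work, cf. async_delalloc_pages */
    static atomic_int waiters;  /* stands in for waitqueue_active() */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;

    void producer_done(int nr)
    {
        /*
         * atomic_fetch_sub is seq_cst by default, so the counter
         * update is ordered before the waiters check -- the role the
         * comment assigns to atomic_sub_return in the hunk above.
         */
        if (atomic_fetch_sub(&pending, nr) - nr == 0 &&
            atomic_load(&waiters) > 0) {
            pthread_mutex_lock(&lock);
            pthread_cond_broadcast(&cond);
            pthread_mutex_unlock(&lock);
        }
    }

    void wait_for_idle(void)
    {
        pthread_mutex_lock(&lock);
        atomic_fetch_add(&waiters, 1);
        while (atomic_load(&pending) != 0)
            pthread_cond_wait(&cond, &lock);
        atomic_fetch_sub(&waiters, 1);
        pthread_mutex_unlock(&lock);
    }
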
@@ -1766,7 +1776,8 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1766 1776
1767 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID 1777 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1768 && do_list && !(state->state & EXTENT_NORESERVE)) 1778 && do_list && !(state->state & EXTENT_NORESERVE))
1769 btrfs_free_reserved_data_space(inode, len); 1779 btrfs_free_reserved_data_space_noquota(inode,
1780 state->start, len);
1770 1781
1771 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, 1782 __percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1772 root->fs_info->delalloc_batch); 1783 root->fs_info->delalloc_batch);
@@ -1861,15 +1872,15 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1861 u64 bio_offset) 1872 u64 bio_offset)
1862{ 1873{
1863 struct btrfs_root *root = BTRFS_I(inode)->root; 1874 struct btrfs_root *root = BTRFS_I(inode)->root;
1875 enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1864 int ret = 0; 1876 int ret = 0;
1865 int skip_sum; 1877 int skip_sum;
1866 int metadata = 0;
1867 int async = !atomic_read(&BTRFS_I(inode)->sync_writers); 1878 int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1868 1879
1869 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1880 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1870 1881
1871 if (btrfs_is_free_space_inode(inode)) 1882 if (btrfs_is_free_space_inode(inode))
1872 metadata = 2; 1883 metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1873 1884
1874 if (!(rw & REQ_WRITE)) { 1885 if (!(rw & REQ_WRITE)) {
1875 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata); 1886 ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
@@ -1989,7 +2000,8 @@ again:
1989 goto again; 2000 goto again;
1990 } 2001 }
1991 2002
1992 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 2003 ret = btrfs_delalloc_reserve_space(inode, page_start,
2004 PAGE_CACHE_SIZE);
1993 if (ret) { 2005 if (ret) {
1994 mapping_set_error(page->mapping, ret); 2006 mapping_set_error(page->mapping, ret);
1995 end_extent_writepage(page, ret, page_start, page_end); 2007 end_extent_writepage(page, ret, page_start, page_end);
@@ -2115,7 +2127,13 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2115 ins.type = BTRFS_EXTENT_ITEM_KEY; 2127 ins.type = BTRFS_EXTENT_ITEM_KEY;
2116 ret = btrfs_alloc_reserved_file_extent(trans, root, 2128 ret = btrfs_alloc_reserved_file_extent(trans, root,
2117 root->root_key.objectid, 2129 root->root_key.objectid,
2118 btrfs_ino(inode), file_pos, &ins); 2130 btrfs_ino(inode), file_pos,
2131 ram_bytes, &ins);
2132 /*
2133 * Release the reserved range from inode dirty range map, as it is
2134 * already moved into delayed_ref_head
2135 */
2136 btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2119out: 2137out:
2120 btrfs_free_path(path); 2138 btrfs_free_path(path);
2121 2139
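
This release and the qgroup_free_data() calls added throughout inode.c drive one accounting state machine: data space is reserved when delalloc starts, then either released (ownership moves to the delayed-ref head, as in this hunk) or freed outright for ranges that never become real extents (inline, NOCOW, invalidated pages). A hedged sketch of the three transitions, with invented names:

    #include <assert.h>
    #include <stdint.h>

    struct qgroup_rsv {
        uint64_t reserved;  /* promised to in-flight writes */
        uint64_t used;      /* accounted to on-disk extents */
    };

    /* Write path: claim quota before dirtying pages. */
    static void rsv_reserve(struct qgroup_rsv *q, uint64_t bytes)
    {
        q->reserved += bytes;
    }

    /* Extent handed to the delayed-ref head: reserved -> used. */
    static void rsv_release(struct qgroup_rsv *q, uint64_t bytes)
    {
        assert(q->reserved >= bytes);
        q->reserved -= bytes;
        q->used += bytes;
    }

    /* Range never became an extent: give the quota straight back. */
    static void rsv_free(struct qgroup_rsv *q, uint64_t bytes)
    {
        assert(q->reserved >= bytes);
        q->reserved -= bytes;
    }
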
@@ -2573,7 +2591,7 @@ again:
2573 ret = btrfs_inc_extent_ref(trans, root, new->bytenr, 2591 ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2574 new->disk_len, 0, 2592 new->disk_len, 0,
2575 backref->root_id, backref->inum, 2593 backref->root_id, backref->inum,
2576 new->file_pos, 0); /* start - extent_offset */ 2594 new->file_pos); /* start - extent_offset */
2577 if (ret) { 2595 if (ret) {
2578 btrfs_abort_transaction(trans, root, ret); 2596 btrfs_abort_transaction(trans, root, ret);
2579 goto out_free_path; 2597 goto out_free_path;
@@ -2599,7 +2617,6 @@ static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2599 return; 2617 return;
2600 2618
2601 list_for_each_entry_safe(old, tmp, &new->head, list) { 2619 list_for_each_entry_safe(old, tmp, &new->head, list) {
2602 list_del(&old->list);
2603 kfree(old); 2620 kfree(old);
2604 } 2621 }
2605 kfree(new); 2622 kfree(new);
@@ -2824,6 +2841,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2824 2841
2825 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 2842 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2826 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */ 2843 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2844
2845 /*
 2846	 * For the mwrite (mmap + memset to write) case, we still reserve
 2847	 * space for the NOCOW range.
 2848	 * As NOCOW won't cause a new delayed ref, just free the space here.
2849 */
2850 btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
2851 ordered_extent->len);
2827 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 2852 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2828 if (nolock) 2853 if (nolock)
2829 trans = btrfs_join_transaction_nolock(root); 2854 trans = btrfs_join_transaction_nolock(root);
@@ -3018,8 +3043,6 @@ static int __readpage_endio_check(struct inode *inode,
3018 char *kaddr; 3043 char *kaddr;
3019 u32 csum_expected; 3044 u32 csum_expected;
3020 u32 csum = ~(u32)0; 3045 u32 csum = ~(u32)0;
3021 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
3022 DEFAULT_RATELIMIT_BURST);
3023 3046
3024 csum_expected = *(((u32 *)io_bio->csum) + icsum); 3047 csum_expected = *(((u32 *)io_bio->csum) + icsum);
3025 3048
@@ -3032,9 +3055,8 @@ static int __readpage_endio_check(struct inode *inode,
3032 kunmap_atomic(kaddr); 3055 kunmap_atomic(kaddr);
3033 return 0; 3056 return 0;
3034zeroit: 3057zeroit:
3035 if (__ratelimit(&_rs)) 3058 btrfs_warn_rl(BTRFS_I(inode)->root->fs_info,
3036 btrfs_warn(BTRFS_I(inode)->root->fs_info, 3059 "csum failed ino %llu off %llu csum %u expected csum %u",
3037 "csum failed ino %llu off %llu csum %u expected csum %u",
3038 btrfs_ino(inode), start, csum, csum_expected); 3060 btrfs_ino(inode), start, csum, csum_expected);
3039 memset(kaddr + pgoff, 1, len); 3061 memset(kaddr + pgoff, 1, len);
3040 flush_dcache_page(page); 3062 flush_dcache_page(page);
@@ -4217,6 +4239,47 @@ static int truncate_space_check(struct btrfs_trans_handle *trans,
4217 4239
4218} 4240}
4219 4241
4242static int truncate_inline_extent(struct inode *inode,
4243 struct btrfs_path *path,
4244 struct btrfs_key *found_key,
4245 const u64 item_end,
4246 const u64 new_size)
4247{
4248 struct extent_buffer *leaf = path->nodes[0];
4249 int slot = path->slots[0];
4250 struct btrfs_file_extent_item *fi;
4251 u32 size = (u32)(new_size - found_key->offset);
4252 struct btrfs_root *root = BTRFS_I(inode)->root;
4253
4254 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4255
4256 if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
4257 loff_t offset = new_size;
4258 loff_t page_end = ALIGN(offset, PAGE_CACHE_SIZE);
4259
4260 /*
 4261	 * Zero out the remainder of the last page of our inline extent,
4262 * instead of directly truncating our inline extent here - that
4263 * would be much more complex (decompressing all the data, then
4264 * compressing the truncated data, which might be bigger than
 4265	 * the size of the inline extent, resizing the extent, etc.).
4266 * We release the path because to get the page we might need to
4267 * read the extent item from disk (data not in the page cache).
4268 */
4269 btrfs_release_path(path);
4270 return btrfs_truncate_page(inode, offset, page_end - offset, 0);
4271 }
4272
4273 btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4274 size = btrfs_file_extent_calc_inline_size(size);
4275 btrfs_truncate_item(root, path, size, 1);
4276
4277 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4278 inode_sub_bytes(inode, item_end + 1 - new_size);
4279
4280 return 0;
4281}
4282
4220/* 4283/*
4221 * this can truncate away extent items, csum items and directory items. 4284 * this can truncate away extent items, csum items and directory items.
4222 * It starts at a high offset and removes keys until it can't find 4285 * It starts at a high offset and removes keys until it can't find
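
For a compressed inline extent, the new helper above does no item surgery at all: it releases the path and zeroes from the new size to the end of that page via btrfs_truncate_page. The arithmetic is a plain page-align; a hedged sketch with PAGE_SIZE and ALIGN standing in for the kernel macros:

    #include <stdint.h>
    #include <string.h>

    #define PAGE_SIZE 4096ULL
    #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    /*
     * Zero the tail of the page containing new_size, mirroring what
     * btrfs_truncate_page is asked to do for the compressed case.
     */
    static void zero_page_tail(uint8_t *page, uint64_t new_size)
    {
        uint64_t page_end = ALIGN(new_size, PAGE_SIZE);
        uint64_t off = new_size & (PAGE_SIZE - 1);

        memset(page + off, 0, page_end - new_size);
    }
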
@@ -4411,27 +4474,40 @@ search_again:
4411 * special encodings 4474 * special encodings
4412 */ 4475 */
4413 if (!del_item && 4476 if (!del_item &&
4414 btrfs_file_extent_compression(leaf, fi) == 0 &&
4415 btrfs_file_extent_encryption(leaf, fi) == 0 && 4477 btrfs_file_extent_encryption(leaf, fi) == 0 &&
4416 btrfs_file_extent_other_encoding(leaf, fi) == 0) { 4478 btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4417 u32 size = new_size - found_key.offset;
4418
4419 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4420 inode_sub_bytes(inode, item_end + 1 -
4421 new_size);
4422 4479
4423 /* 4480 /*
4424 * update the ram bytes to properly reflect 4481 * Need to release path in order to truncate a
4425 * the new size of our item 4482 * compressed extent. So delete any accumulated
4483 * extent items so far.
4426 */ 4484 */
4427 btrfs_set_file_extent_ram_bytes(leaf, fi, size); 4485 if (btrfs_file_extent_compression(leaf, fi) !=
4428 size = 4486 BTRFS_COMPRESS_NONE && pending_del_nr) {
4429 btrfs_file_extent_calc_inline_size(size); 4487 err = btrfs_del_items(trans, root, path,
4430 btrfs_truncate_item(root, path, size, 1); 4488 pending_del_slot,
4489 pending_del_nr);
4490 if (err) {
4491 btrfs_abort_transaction(trans,
4492 root,
4493 err);
4494 goto error;
4495 }
4496 pending_del_nr = 0;
4497 }
4498
4499 err = truncate_inline_extent(inode, path,
4500 &found_key,
4501 item_end,
4502 new_size);
4503 if (err) {
4504 btrfs_abort_transaction(trans,
4505 root, err);
4506 goto error;
4507 }
4431 } else if (test_bit(BTRFS_ROOT_REF_COWS, 4508 } else if (test_bit(BTRFS_ROOT_REF_COWS,
4432 &root->state)) { 4509 &root->state)) {
4433 inode_sub_bytes(inode, item_end + 1 - 4510 inode_sub_bytes(inode, item_end + 1 - new_size);
4434 found_key.offset);
4435 } 4511 }
4436 } 4512 }
4437delete: 4513delete:
@@ -4461,7 +4537,7 @@ delete:
4461 ret = btrfs_free_extent(trans, root, extent_start, 4537 ret = btrfs_free_extent(trans, root, extent_start,
4462 extent_num_bytes, 0, 4538 extent_num_bytes, 0,
4463 btrfs_header_owner(leaf), 4539 btrfs_header_owner(leaf),
4464 ino, extent_offset, 0); 4540 ino, extent_offset);
4465 BUG_ON(ret); 4541 BUG_ON(ret);
4466 if (btrfs_should_throttle_delayed_refs(trans, root)) 4542 if (btrfs_should_throttle_delayed_refs(trans, root))
4467 btrfs_async_run_delayed_refs(root, 4543 btrfs_async_run_delayed_refs(root,
@@ -4575,14 +4651,17 @@ int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4575 if ((offset & (blocksize - 1)) == 0 && 4651 if ((offset & (blocksize - 1)) == 0 &&
4576 (!len || ((len & (blocksize - 1)) == 0))) 4652 (!len || ((len & (blocksize - 1)) == 0)))
4577 goto out; 4653 goto out;
4578 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 4654 ret = btrfs_delalloc_reserve_space(inode,
4655 round_down(from, PAGE_CACHE_SIZE), PAGE_CACHE_SIZE);
4579 if (ret) 4656 if (ret)
4580 goto out; 4657 goto out;
4581 4658
4582again: 4659again:
4583 page = find_or_create_page(mapping, index, mask); 4660 page = find_or_create_page(mapping, index, mask);
4584 if (!page) { 4661 if (!page) {
4585 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 4662 btrfs_delalloc_release_space(inode,
4663 round_down(from, PAGE_CACHE_SIZE),
4664 PAGE_CACHE_SIZE);
4586 ret = -ENOMEM; 4665 ret = -ENOMEM;
4587 goto out; 4666 goto out;
4588 } 4667 }
@@ -4650,7 +4729,8 @@ again:
4650 4729
4651out_unlock: 4730out_unlock:
4652 if (ret) 4731 if (ret)
4653 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 4732 btrfs_delalloc_release_space(inode, page_start,
4733 PAGE_CACHE_SIZE);
4654 unlock_page(page); 4734 unlock_page(page);
4655 page_cache_release(page); 4735 page_cache_release(page);
4656out: 4736out:
@@ -5048,6 +5128,18 @@ static void evict_inode_truncate_pages(struct inode *inode)
5048 spin_unlock(&io_tree->lock); 5128 spin_unlock(&io_tree->lock);
5049 5129
5050 lock_extent_bits(io_tree, start, end, 0, &cached_state); 5130 lock_extent_bits(io_tree, start, end, 0, &cached_state);
5131
5132 /*
 5133	 * If the extent still has the DELALLOC flag, it didn't reach disk,
5134 * and its reserved space won't be freed by delayed_ref.
5135 * So we need to free its reserved space here.
5136 * (Refer to comment in btrfs_invalidatepage, case 2)
5137 *
5138 * Note, end is the bytenr of last byte, so we need + 1 here.
5139 */
5140 if (state->state & EXTENT_DELALLOC)
5141 btrfs_qgroup_free_data(inode, start, end - start + 1);
5142
5051 clear_extent_bit(io_tree, start, end, 5143 clear_extent_bit(io_tree, start, end,
5052 EXTENT_LOCKED | EXTENT_DIRTY | 5144 EXTENT_LOCKED | EXTENT_DIRTY |
5053 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | 5145 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
@@ -7581,7 +7673,7 @@ unlock:
7581 spin_unlock(&BTRFS_I(inode)->lock); 7673 spin_unlock(&BTRFS_I(inode)->lock);
7582 } 7674 }
7583 7675
7584 btrfs_free_reserved_data_space(inode, len); 7676 btrfs_free_reserved_data_space(inode, start, len);
7585 WARN_ON(dio_data->reserve < len); 7677 WARN_ON(dio_data->reserve < len);
7586 dio_data->reserve -= len; 7678 dio_data->reserve -= len;
7587 current->journal_info = dio_data; 7679 current->journal_info = dio_data;
@@ -8371,7 +8463,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
8371 mutex_unlock(&inode->i_mutex); 8463 mutex_unlock(&inode->i_mutex);
8372 relock = true; 8464 relock = true;
8373 } 8465 }
8374 ret = btrfs_delalloc_reserve_space(inode, count); 8466 ret = btrfs_delalloc_reserve_space(inode, offset, count);
8375 if (ret) 8467 if (ret)
8376 goto out; 8468 goto out;
8377 dio_data.outstanding_extents = div64_u64(count + 8469 dio_data.outstanding_extents = div64_u64(count +
@@ -8400,10 +8492,10 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
8400 current->journal_info = NULL; 8492 current->journal_info = NULL;
8401 if (ret < 0 && ret != -EIOCBQUEUED) { 8493 if (ret < 0 && ret != -EIOCBQUEUED) {
8402 if (dio_data.reserve) 8494 if (dio_data.reserve)
8403 btrfs_delalloc_release_space(inode, 8495 btrfs_delalloc_release_space(inode, offset,
8404 dio_data.reserve); 8496 dio_data.reserve);
8405 } else if (ret >= 0 && (size_t)ret < count) 8497 } else if (ret >= 0 && (size_t)ret < count)
8406 btrfs_delalloc_release_space(inode, 8498 btrfs_delalloc_release_space(inode, offset,
8407 count - (size_t)ret); 8499 count - (size_t)ret);
8408 } 8500 }
8409out: 8501out:
@@ -8562,6 +8654,18 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8562 } 8654 }
8563 } 8655 }
8564 8656
8657 /*
8658 * Qgroup reserved space handler
8659 * Page here will be either
8660 * 1) Already written to disk
 8661	 * In this case, its reserved space is released from the data rsv map
 8662	 * and will eventually be freed by the delayed_ref handler.
 8663	 * So even if we call qgroup_free_data(), it won't decrease reserved
8664 * space.
8665 * 2) Not written to disk
8666 * This means the reserved space should be freed here.
8667 */
8668 btrfs_qgroup_free_data(inode, page_start, PAGE_CACHE_SIZE);
8565 if (!inode_evicting) { 8669 if (!inode_evicting) {
8566 clear_extent_bit(tree, page_start, page_end, 8670 clear_extent_bit(tree, page_start, page_end,
8567 EXTENT_LOCKED | EXTENT_DIRTY | 8671 EXTENT_LOCKED | EXTENT_DIRTY |
@@ -8612,7 +8716,11 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
8612 u64 page_end; 8716 u64 page_end;
8613 8717
8614 sb_start_pagefault(inode->i_sb); 8718 sb_start_pagefault(inode->i_sb);
8615 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 8719 page_start = page_offset(page);
8720 page_end = page_start + PAGE_CACHE_SIZE - 1;
8721
8722 ret = btrfs_delalloc_reserve_space(inode, page_start,
8723 PAGE_CACHE_SIZE);
8616 if (!ret) { 8724 if (!ret) {
8617 ret = file_update_time(vma->vm_file); 8725 ret = file_update_time(vma->vm_file);
8618 reserved = 1; 8726 reserved = 1;
@@ -8631,8 +8739,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
8631again: 8739again:
8632 lock_page(page); 8740 lock_page(page);
8633 size = i_size_read(inode); 8741 size = i_size_read(inode);
8634 page_start = page_offset(page);
8635 page_end = page_start + PAGE_CACHE_SIZE - 1;
8636 8742
8637 if ((page->mapping != inode->i_mapping) || 8743 if ((page->mapping != inode->i_mapping) ||
8638 (page_start >= size)) { 8744 (page_start >= size)) {
@@ -8709,7 +8815,7 @@ out_unlock:
8709 } 8815 }
8710 unlock_page(page); 8816 unlock_page(page);
8711out: 8817out:
8712 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 8818 btrfs_delalloc_release_space(inode, page_start, PAGE_CACHE_SIZE);
8713out_noreserve: 8819out_noreserve:
8714 sb_end_pagefault(inode->i_sb); 8820 sb_end_pagefault(inode->i_sb);
8715 return ret; 8821 return ret;
@@ -8998,6 +9104,7 @@ void btrfs_destroy_inode(struct inode *inode)
8998 btrfs_put_ordered_extent(ordered); 9104 btrfs_put_ordered_extent(ordered);
8999 } 9105 }
9000 } 9106 }
9107 btrfs_qgroup_check_reserved_leak(inode);
9001 inode_tree_del(inode); 9108 inode_tree_del(inode);
9002 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 9109 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
9003free: 9110free:
@@ -9634,6 +9741,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9634 u64 cur_offset = start; 9741 u64 cur_offset = start;
9635 u64 i_size; 9742 u64 i_size;
9636 u64 cur_bytes; 9743 u64 cur_bytes;
9744 u64 last_alloc = (u64)-1;
9637 int ret = 0; 9745 int ret = 0;
9638 bool own_trans = true; 9746 bool own_trans = true;
9639 9747
@@ -9650,6 +9758,13 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9650 9758
9651 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); 9759 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
9652 cur_bytes = max(cur_bytes, min_size); 9760 cur_bytes = max(cur_bytes, min_size);
9761 /*
9762 * If we are severely fragmented we could end up with really
9763 * small allocations, so if the allocator is returning small
 9764	 * chunks, let's make its job easier by only searching for
 9765	 * chunks of that size.
9766 */
9767 cur_bytes = min(cur_bytes, last_alloc);
9653 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0, 9768 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
9654 *alloc_hint, &ins, 1, 0); 9769 *alloc_hint, &ins, 1, 0);
9655 if (ret) { 9770 if (ret) {
@@ -9658,6 +9773,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9658 break; 9773 break;
9659 } 9774 }
9660 9775
9776 last_alloc = ins.offset;
9661 ret = insert_reserved_file_extent(trans, inode, 9777 ret = insert_reserved_file_extent(trans, inode,
9662 cur_offset, ins.objectid, 9778 cur_offset, ins.objectid,
9663 ins.offset, ins.offset, 9779 ins.offset, ins.offset,
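
The last_alloc clamp makes preallocation adaptive: once the allocator returns a short extent, later iterations ask for at most that much instead of re-requesting 256MB and forcing another full, failing search. A hedged sketch of the control flow, where allocate() is an invented stand-in for btrfs_reserve_extent:

    #include <stdint.h>

    #define MAX_CHUNK (256ULL * 1024 * 1024)

    /* Stand-in: returns how many bytes it carved out, 0 on ENOSPC. */
    uint64_t allocate(uint64_t want, uint64_t min);

    int prealloc_range(uint64_t num_bytes, uint64_t min_size)
    {
        uint64_t last_alloc = UINT64_MAX;

        while (num_bytes > 0) {
            uint64_t cur = num_bytes < MAX_CHUNK ? num_bytes : MAX_CHUNK;

            if (cur < min_size)
                cur = min_size;
            /* Fragmented? Never ask for more than the last success. */
            if (cur > last_alloc)
                cur = last_alloc;

            uint64_t got = allocate(cur, min_size);
            if (got == 0)
                return -1;
            last_alloc = got;
            num_bytes -= got < num_bytes ? got : num_bytes;
        }
        return 0;
    }
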
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8d20f3b1cab0..da94138eb85e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1120,7 +1120,8 @@ static int cluster_pages_for_defrag(struct inode *inode,
1120 page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1); 1120 page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
1121 1121
1122 ret = btrfs_delalloc_reserve_space(inode, 1122 ret = btrfs_delalloc_reserve_space(inode,
1123 page_cnt << PAGE_CACHE_SHIFT); 1123 start_index << PAGE_CACHE_SHIFT,
1124 page_cnt << PAGE_CACHE_SHIFT);
1124 if (ret) 1125 if (ret)
1125 return ret; 1126 return ret;
1126 i_done = 0; 1127 i_done = 0;
@@ -1210,7 +1211,8 @@ again:
1210 BTRFS_I(inode)->outstanding_extents++; 1211 BTRFS_I(inode)->outstanding_extents++;
1211 spin_unlock(&BTRFS_I(inode)->lock); 1212 spin_unlock(&BTRFS_I(inode)->lock);
1212 btrfs_delalloc_release_space(inode, 1213 btrfs_delalloc_release_space(inode,
1213 (page_cnt - i_done) << PAGE_CACHE_SHIFT); 1214 start_index << PAGE_CACHE_SHIFT,
1215 (page_cnt - i_done) << PAGE_CACHE_SHIFT);
1214 } 1216 }
1215 1217
1216 1218
@@ -1235,7 +1237,9 @@ out:
1235 unlock_page(pages[i]); 1237 unlock_page(pages[i]);
1236 page_cache_release(pages[i]); 1238 page_cache_release(pages[i]);
1237 } 1239 }
1238 btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT); 1240 btrfs_delalloc_release_space(inode,
1241 start_index << PAGE_CACHE_SHIFT,
1242 page_cnt << PAGE_CACHE_SHIFT);
1239 return ret; 1243 return ret;
1240 1244
1241} 1245}
@@ -1342,7 +1346,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1342 break; 1346 break;
1343 1347
1344 if (btrfs_defrag_cancelled(root->fs_info)) { 1348 if (btrfs_defrag_cancelled(root->fs_info)) {
1345 printk(KERN_DEBUG "BTRFS: defrag_file cancelled\n"); 1349 btrfs_debug(root->fs_info, "defrag_file cancelled");
1346 ret = -EAGAIN; 1350 ret = -EAGAIN;
1347 break; 1351 break;
1348 } 1352 }
@@ -1579,7 +1583,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
1579 new_size = div_u64(new_size, root->sectorsize); 1583 new_size = div_u64(new_size, root->sectorsize);
1580 new_size *= root->sectorsize; 1584 new_size *= root->sectorsize;
1581 1585
1582 printk_in_rcu(KERN_INFO "BTRFS: new size for %s is %llu\n", 1586 btrfs_info_in_rcu(root->fs_info, "new size for %s is %llu",
1583 rcu_str_deref(device->name), new_size); 1587 rcu_str_deref(device->name), new_size);
1584 1588
1585 if (new_size > old_size) { 1589 if (new_size > old_size) {
@@ -2081,7 +2085,7 @@ static noinline int search_ioctl(struct inode *inode,
2081 key.offset = (u64)-1; 2085 key.offset = (u64)-1;
2082 root = btrfs_read_fs_root_no_name(info, &key); 2086 root = btrfs_read_fs_root_no_name(info, &key);
2083 if (IS_ERR(root)) { 2087 if (IS_ERR(root)) {
2084 printk(KERN_ERR "BTRFS: could not find root %llu\n", 2088 btrfs_err(info, "could not find root %llu",
2085 sk->tree_id); 2089 sk->tree_id);
2086 btrfs_free_path(path); 2090 btrfs_free_path(path);
2087 return -ENOENT; 2091 return -ENOENT;
@@ -2221,7 +2225,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
2221 key.offset = (u64)-1; 2225 key.offset = (u64)-1;
2222 root = btrfs_read_fs_root_no_name(info, &key); 2226 root = btrfs_read_fs_root_no_name(info, &key);
2223 if (IS_ERR(root)) { 2227 if (IS_ERR(root)) {
2224 printk(KERN_ERR "BTRFS: could not find root %llu\n", tree_id); 2228 btrfs_err(info, "could not find root %llu", tree_id);
2225 ret = -ENOENT; 2229 ret = -ENOENT;
2226 goto out; 2230 goto out;
2227 } 2231 }
@@ -2699,7 +2703,6 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
2699{ 2703{
2700 struct btrfs_ioctl_fs_info_args *fi_args; 2704 struct btrfs_ioctl_fs_info_args *fi_args;
2701 struct btrfs_device *device; 2705 struct btrfs_device *device;
2702 struct btrfs_device *next;
2703 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 2706 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2704 int ret = 0; 2707 int ret = 0;
2705 2708
@@ -2711,7 +2714,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
2711 fi_args->num_devices = fs_devices->num_devices; 2714 fi_args->num_devices = fs_devices->num_devices;
2712 memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid)); 2715 memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
2713 2716
2714 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 2717 list_for_each_entry(device, &fs_devices->devices, dev_list) {
2715 if (device->devid > fi_args->max_id) 2718 if (device->devid > fi_args->max_id)
2716 fi_args->max_id = device->devid; 2719 fi_args->max_id = device->devid;
2717 } 2720 }
@@ -3203,41 +3206,6 @@ out:
3203 return ret; 3206 return ret;
3204} 3207}
3205 3208
3206/* Helper to check and see if this root currently has a ref on the given disk
3207 * bytenr. If it does then we need to update the quota for this root. This
3208 * doesn't do anything if quotas aren't enabled.
3209 */
3210static int check_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3211 u64 disko)
3212{
3213 struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
3214 struct ulist *roots;
3215 struct ulist_iterator uiter;
3216 struct ulist_node *root_node = NULL;
3217 int ret;
3218
3219 if (!root->fs_info->quota_enabled)
3220 return 1;
3221
3222 btrfs_get_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
3223 ret = btrfs_find_all_roots(trans, root->fs_info, disko,
3224 tree_mod_seq_elem.seq, &roots);
3225 if (ret < 0)
3226 goto out;
3227 ret = 0;
3228 ULIST_ITER_INIT(&uiter);
3229 while ((root_node = ulist_next(roots, &uiter))) {
3230 if (root_node->val == root->objectid) {
3231 ret = 1;
3232 break;
3233 }
3234 }
3235 ulist_free(roots);
3236out:
3237 btrfs_put_tree_mod_seq(root->fs_info, &tree_mod_seq_elem);
3238 return ret;
3239}
3240
3241static int clone_finish_inode_update(struct btrfs_trans_handle *trans, 3209static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
3242 struct inode *inode, 3210 struct inode *inode,
3243 u64 endoff, 3211 u64 endoff,
@@ -3328,6 +3296,150 @@ static void clone_update_extent_map(struct inode *inode,
3328 &BTRFS_I(inode)->runtime_flags); 3296 &BTRFS_I(inode)->runtime_flags);
3329} 3297}
3330 3298
3299/*
 3300 * Make sure we do not end up inserting an inline extent into a file that
 3301 * already has other (non-inline) extents. If a file has an inline extent it
 3302 * cannot have any other extents and the (single) inline extent must start at
 3303 * file offset 0. Failing to respect these rules will lead to file corruption,
 3304 * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc.
3305 *
 3306 * We can have extents that have already been written to disk, or we can have
3307 * dirty ranges still in delalloc, in which case the extent maps and items are
3308 * created only when we run delalloc, and the delalloc ranges might fall outside
3309 * the range we are currently locking in the inode's io tree. So we check the
3310 * inode's i_size because of that (i_size updates are done while holding the
3311 * i_mutex, which we are holding here).
3312 * We also check to see if the inode has a size not greater than "datal" but has
 3313 * extents beyond it, due to a fallocate with FALLOC_FL_KEEP_SIZE (and we are
3314 * protected against such concurrent fallocate calls by the i_mutex).
3315 *
3316 * If the file has no extents but a size greater than datal, do not allow the
 3317 * copy because we would need to turn the inline extent into a non-inline one (even
3318 * with NO_HOLES enabled). If we find our destination inode only has one inline
3319 * extent, just overwrite it with the source inline extent if its size is less
3320 * than the source extent's size, or we could copy the source inline extent's
 3321 * data into the destination inode's inline extent if the latter is greater than
3322 * the former.
3323 */
3324static int clone_copy_inline_extent(struct inode *src,
3325 struct inode *dst,
3326 struct btrfs_trans_handle *trans,
3327 struct btrfs_path *path,
3328 struct btrfs_key *new_key,
3329 const u64 drop_start,
3330 const u64 datal,
3331 const u64 skip,
3332 const u64 size,
3333 char *inline_data)
3334{
3335 struct btrfs_root *root = BTRFS_I(dst)->root;
3336 const u64 aligned_end = ALIGN(new_key->offset + datal,
3337 root->sectorsize);
3338 int ret;
3339 struct btrfs_key key;
3340
3341 if (new_key->offset > 0)
3342 return -EOPNOTSUPP;
3343
3344 key.objectid = btrfs_ino(dst);
3345 key.type = BTRFS_EXTENT_DATA_KEY;
3346 key.offset = 0;
3347 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3348 if (ret < 0) {
3349 return ret;
3350 } else if (ret > 0) {
3351 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
3352 ret = btrfs_next_leaf(root, path);
3353 if (ret < 0)
3354 return ret;
3355 else if (ret > 0)
3356 goto copy_inline_extent;
3357 }
3358 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
3359 if (key.objectid == btrfs_ino(dst) &&
3360 key.type == BTRFS_EXTENT_DATA_KEY) {
3361 ASSERT(key.offset > 0);
3362 return -EOPNOTSUPP;
3363 }
3364 } else if (i_size_read(dst) <= datal) {
3365 struct btrfs_file_extent_item *ei;
3366 u64 ext_len;
3367
3368 /*
3369 * If the file size is <= datal, make sure there are no other
 3370		 * extents following (can happen due to a fallocate call with
3371 * the flag FALLOC_FL_KEEP_SIZE).
3372 */
3373 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3374 struct btrfs_file_extent_item);
3375 /*
3376 * If it's an inline extent, it can not have other extents
3377 * following it.
3378 */
3379 if (btrfs_file_extent_type(path->nodes[0], ei) ==
3380 BTRFS_FILE_EXTENT_INLINE)
3381 goto copy_inline_extent;
3382
3383 ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
3384 if (ext_len > aligned_end)
3385 return -EOPNOTSUPP;
3386
3387 ret = btrfs_next_item(root, path);
3388 if (ret < 0) {
3389 return ret;
3390 } else if (ret == 0) {
3391 btrfs_item_key_to_cpu(path->nodes[0], &key,
3392 path->slots[0]);
3393 if (key.objectid == btrfs_ino(dst) &&
3394 key.type == BTRFS_EXTENT_DATA_KEY)
3395 return -EOPNOTSUPP;
3396 }
3397 }
3398
3399copy_inline_extent:
3400 /*
3401 * We have no extent items, or we have an extent at offset 0 which may
 3402	 * or may not be inlined. All these cases are dealt with in the same way.
3403 */
3404 if (i_size_read(dst) > datal) {
3405 /*
3406 * If the destination inode has an inline extent...
3407 * This would require copying the data from the source inline
3408 * extent into the beginning of the destination's inline extent.
 3409		 * But this is really complex: both extents, or just one of
 3410		 * them, can be compressed, which would require decompressing and
3411 * re-compressing data (which could increase the new compressed
3412 * size, not allowing the compressed data to fit anymore in an
3413 * inline extent).
3414 * So just don't support this case for now (it should be rare,
3415 * we are not really saving space when cloning inline extents).
3416 */
3417 return -EOPNOTSUPP;
3418 }
3419
3420 btrfs_release_path(path);
3421 ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
3422 if (ret)
3423 return ret;
3424 ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
3425 if (ret)
3426 return ret;
3427
3428 if (skip) {
3429 const u32 start = btrfs_file_extent_calc_inline_size(0);
3430
3431 memmove(inline_data + start, inline_data + start + skip, datal);
3432 }
3433
3434 write_extent_buffer(path->nodes[0], inline_data,
3435 btrfs_item_ptr_offset(path->nodes[0],
3436 path->slots[0]),
3437 size);
3438 inode_add_bytes(dst, datal);
3439
3440 return 0;
3441}
3442
3331/** 3443/**
3332 * btrfs_clone() - clone a range from inode file to another 3444 * btrfs_clone() - clone a range from inode file to another
3333 * 3445 *
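
The early bail-outs in clone_copy_inline_extent all enforce the invariant spelled out in its header comment: an inline extent may only exist as the sole extent, at offset 0, in a file whose size does not exceed the extent's data. A hedged predicate capturing just that rule (the types are invented for the sketch):

    #include <stdbool.h>
    #include <stdint.h>

    struct dst_state {
        uint64_t i_size;
        unsigned int nr_extents;  /* extent items in the fs tree */
        bool first_is_inline;     /* extent at offset 0 is inline */
    };

    /* May we (over)write an inline extent of datal bytes at dst_off? */
    static bool inline_clone_allowed(const struct dst_state *dst,
                                     uint64_t datal, uint64_t dst_off)
    {
        if (dst_off != 0)
            return false;         /* inline must start at offset 0 */
        if (dst->i_size > datal)
            return false;         /* would need a non-inline extent */
        return dst->nr_extents == 0 ||
               (dst->nr_extents == 1 && dst->first_is_inline);
    }
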
@@ -3352,9 +3464,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
3352 u32 nritems; 3464 u32 nritems;
3353 int slot; 3465 int slot;
3354 int ret; 3466 int ret;
3355 int no_quota;
3356 const u64 len = olen_aligned; 3467 const u64 len = olen_aligned;
3357 u64 last_disko = 0;
3358 u64 last_dest_end = destoff; 3468 u64 last_dest_end = destoff;
3359 3469
3360 ret = -ENOMEM; 3470 ret = -ENOMEM;
@@ -3400,7 +3510,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
3400 3510
3401 nritems = btrfs_header_nritems(path->nodes[0]); 3511 nritems = btrfs_header_nritems(path->nodes[0]);
3402process_slot: 3512process_slot:
3403 no_quota = 1;
3404 if (path->slots[0] >= nritems) { 3513 if (path->slots[0] >= nritems) {
3405 ret = btrfs_next_leaf(BTRFS_I(src)->root, path); 3514 ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
3406 if (ret < 0) 3515 if (ret < 0)
@@ -3552,35 +3661,13 @@ process_slot:
3552 btrfs_set_file_extent_num_bytes(leaf, extent, 3661 btrfs_set_file_extent_num_bytes(leaf, extent,
3553 datal); 3662 datal);
3554 3663
3555 /*
3556 * We need to look up the roots that point at
3557 * this bytenr and see if the new root does. If
3558 * it does not we need to make sure we update
3559 * quotas appropriately.
3560 */
3561 if (disko && root != BTRFS_I(src)->root &&
3562 disko != last_disko) {
3563 no_quota = check_ref(trans, root,
3564 disko);
3565 if (no_quota < 0) {
3566 btrfs_abort_transaction(trans,
3567 root,
3568 ret);
3569 btrfs_end_transaction(trans,
3570 root);
3571 ret = no_quota;
3572 goto out;
3573 }
3574 }
3575
3576 if (disko) { 3664 if (disko) {
3577 inode_add_bytes(inode, datal); 3665 inode_add_bytes(inode, datal);
3578 ret = btrfs_inc_extent_ref(trans, root, 3666 ret = btrfs_inc_extent_ref(trans, root,
3579 disko, diskl, 0, 3667 disko, diskl, 0,
3580 root->root_key.objectid, 3668 root->root_key.objectid,
3581 btrfs_ino(inode), 3669 btrfs_ino(inode),
3582 new_key.offset - datao, 3670 new_key.offset - datao);
3583 no_quota);
3584 if (ret) { 3671 if (ret) {
3585 btrfs_abort_transaction(trans, 3672 btrfs_abort_transaction(trans,
3586 root, 3673 root,
@@ -3594,21 +3681,6 @@ process_slot:
3594 } else if (type == BTRFS_FILE_EXTENT_INLINE) { 3681 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
3595 u64 skip = 0; 3682 u64 skip = 0;
3596 u64 trim = 0; 3683 u64 trim = 0;
3597 u64 aligned_end = 0;
3598
3599 /*
3600 * Don't copy an inline extent into an offset
3601 * greater than zero. Having an inline extent
3602 * at such an offset results in chaos as btrfs
3603 * isn't prepared for such cases. Just skip
3604 * this case for the same reasons as commented
3605 * at btrfs_ioctl_clone().
3606 */
3607 if (last_dest_end > 0) {
3608 ret = -EOPNOTSUPP;
3609 btrfs_end_transaction(trans, root);
3610 goto out;
3611 }
3612 3684
3613 if (off > key.offset) { 3685 if (off > key.offset) {
3614 skip = off - key.offset; 3686 skip = off - key.offset;
@@ -3626,42 +3698,22 @@ process_slot:
3626 size -= skip + trim; 3698 size -= skip + trim;
3627 datal -= skip + trim; 3699 datal -= skip + trim;
3628 3700
3629 aligned_end = ALIGN(new_key.offset + datal, 3701 ret = clone_copy_inline_extent(src, inode,
3630 root->sectorsize); 3702 trans, path,
3631 ret = btrfs_drop_extents(trans, root, inode, 3703 &new_key,
3632 drop_start, 3704 drop_start,
3633 aligned_end, 3705 datal,
3634 1); 3706 skip, size, buf);
3635 if (ret) { 3707 if (ret) {
3636 if (ret != -EOPNOTSUPP) 3708 if (ret != -EOPNOTSUPP)
3637 btrfs_abort_transaction(trans, 3709 btrfs_abort_transaction(trans,
3638 root, ret); 3710 root,
3639 btrfs_end_transaction(trans, root); 3711 ret);
3640 goto out;
3641 }
3642
3643 ret = btrfs_insert_empty_item(trans, root, path,
3644 &new_key, size);
3645 if (ret) {
3646 btrfs_abort_transaction(trans, root,
3647 ret);
3648 btrfs_end_transaction(trans, root); 3712 btrfs_end_transaction(trans, root);
3649 goto out; 3713 goto out;
3650 } 3714 }
3651
3652 if (skip) {
3653 u32 start =
3654 btrfs_file_extent_calc_inline_size(0);
3655 memmove(buf+start, buf+start+skip,
3656 datal);
3657 }
3658
3659 leaf = path->nodes[0]; 3715 leaf = path->nodes[0];
3660 slot = path->slots[0]; 3716 slot = path->slots[0];
3661 write_extent_buffer(leaf, buf,
3662 btrfs_item_ptr_offset(leaf, slot),
3663 size);
3664 inode_add_bytes(inode, datal);
3665 } 3717 }
3666 3718
3667 /* If we have an implicit hole (NO_HOLES feature). */ 3719 /* If we have an implicit hole (NO_HOLES feature). */
@@ -4814,7 +4866,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
4814 /* update qgroup status and info */ 4866 /* update qgroup status and info */
4815 err = btrfs_run_qgroups(trans, root->fs_info); 4867 err = btrfs_run_qgroups(trans, root->fs_info);
4816 if (err < 0) 4868 if (err < 0)
4817 btrfs_error(root->fs_info, ret, 4869 btrfs_std_error(root->fs_info, ret,
4818 "failed to update qgroup status and info\n"); 4870 "failed to update qgroup status and info\n");
4819 err = btrfs_end_transaction(trans, root); 4871 err = btrfs_end_transaction(trans, root);
4820 if (err && !ret) 4872 if (err && !ret)
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index d7e6baf1b205..8077461fc56a 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -79,6 +79,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
79 write_lock(&eb->lock); 79 write_lock(&eb->lock);
80 WARN_ON(atomic_read(&eb->spinning_writers)); 80 WARN_ON(atomic_read(&eb->spinning_writers));
81 atomic_inc(&eb->spinning_writers); 81 atomic_inc(&eb->spinning_writers);
82 /*
83 * atomic_dec_and_test implies a barrier for waitqueue_active
84 */
82 if (atomic_dec_and_test(&eb->blocking_writers) && 85 if (atomic_dec_and_test(&eb->blocking_writers) &&
83 waitqueue_active(&eb->write_lock_wq)) 86 waitqueue_active(&eb->write_lock_wq))
84 wake_up(&eb->write_lock_wq); 87 wake_up(&eb->write_lock_wq);
@@ -86,6 +89,9 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
86 BUG_ON(atomic_read(&eb->blocking_readers) == 0); 89 BUG_ON(atomic_read(&eb->blocking_readers) == 0);
87 read_lock(&eb->lock); 90 read_lock(&eb->lock);
88 atomic_inc(&eb->spinning_readers); 91 atomic_inc(&eb->spinning_readers);
92 /*
93 * atomic_dec_and_test implies a barrier for waitqueue_active
94 */
89 if (atomic_dec_and_test(&eb->blocking_readers) && 95 if (atomic_dec_and_test(&eb->blocking_readers) &&
90 waitqueue_active(&eb->read_lock_wq)) 96 waitqueue_active(&eb->read_lock_wq))
91 wake_up(&eb->read_lock_wq); 97 wake_up(&eb->read_lock_wq);
@@ -229,6 +235,9 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
229 } 235 }
230 btrfs_assert_tree_read_locked(eb); 236 btrfs_assert_tree_read_locked(eb);
231 WARN_ON(atomic_read(&eb->blocking_readers) == 0); 237 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
238 /*
239 * atomic_dec_and_test implies a barrier for waitqueue_active
240 */
232 if (atomic_dec_and_test(&eb->blocking_readers) && 241 if (atomic_dec_and_test(&eb->blocking_readers) &&
233 waitqueue_active(&eb->read_lock_wq)) 242 waitqueue_active(&eb->read_lock_wq))
234 wake_up(&eb->read_lock_wq); 243 wake_up(&eb->read_lock_wq);
@@ -280,6 +289,9 @@ void btrfs_tree_unlock(struct extent_buffer *eb)
280 if (blockers) { 289 if (blockers) {
281 WARN_ON(atomic_read(&eb->spinning_writers)); 290 WARN_ON(atomic_read(&eb->spinning_writers));
282 atomic_dec(&eb->blocking_writers); 291 atomic_dec(&eb->blocking_writers);
292 /*
293 * Make sure counter is updated before we wake up waiters.
294 */
283 smp_mb(); 295 smp_mb();
284 if (waitqueue_active(&eb->write_lock_wq)) 296 if (waitqueue_active(&eb->write_lock_wq))
285 wake_up(&eb->write_lock_wq); 297 wake_up(&eb->write_lock_wq);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 52170cf1757e..8c27292ea9ea 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -345,6 +345,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
345 345
346 if (entry->bytes_left == 0) { 346 if (entry->bytes_left == 0) {
347 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 347 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
348 /*
349 * Implicit memory barrier after test_and_set_bit
350 */
348 if (waitqueue_active(&entry->wait)) 351 if (waitqueue_active(&entry->wait))
349 wake_up(&entry->wait); 352 wake_up(&entry->wait);
350 } else { 353 } else {
@@ -409,6 +412,9 @@ have_entry:
409 412
410 if (entry->bytes_left == 0) { 413 if (entry->bytes_left == 0) {
411 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); 414 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
415 /*
416 * Implicit memory barrier after test_and_set_bit
417 */
412 if (waitqueue_active(&entry->wait)) 418 if (waitqueue_active(&entry->wait))
413 wake_up(&entry->wait); 419 wake_up(&entry->wait);
414 } else { 420 } else {
@@ -484,15 +490,16 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
484 490
485 spin_lock_irq(&log->log_extents_lock[index]); 491 spin_lock_irq(&log->log_extents_lock[index]);
486 while (!list_empty(&log->logged_list[index])) { 492 while (!list_empty(&log->logged_list[index])) {
493 struct inode *inode;
487 ordered = list_first_entry(&log->logged_list[index], 494 ordered = list_first_entry(&log->logged_list[index],
488 struct btrfs_ordered_extent, 495 struct btrfs_ordered_extent,
489 log_list); 496 log_list);
490 list_del_init(&ordered->log_list); 497 list_del_init(&ordered->log_list);
498 inode = ordered->inode;
491 spin_unlock_irq(&log->log_extents_lock[index]); 499 spin_unlock_irq(&log->log_extents_lock[index]);
492 500
493 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && 501 if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) &&
494 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { 502 !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) {
495 struct inode *inode = ordered->inode;
496 u64 start = ordered->file_offset; 503 u64 start = ordered->file_offset;
497 u64 end = ordered->file_offset + ordered->len - 1; 504 u64 end = ordered->file_offset + ordered->len - 1;
498 505
@@ -503,20 +510,25 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans,
503 &ordered->flags)); 510 &ordered->flags));
504 511
505 /* 512 /*
506 * If our ordered extent completed it means it updated the 513 * In order to keep us from losing our ordered extent
507 * fs/subvol and csum trees already, so no need to make the 514 * information when committing the transaction we have to make
508 * current transaction's commit wait for it, as we end up 515 * sure that any logged extents are completed when we go to
509 * holding memory unnecessarily and delaying the inode's iput 516 * commit the transaction. To do this we simply increase the
 510	 * until the transaction commit (we schedule an iput for the 517	 * current transaction's pending_ordered counter and decrement it
511 * inode when the ordered extent's refcount drops to 0), which 518 * when the ordered extent completes.
512 * prevents it from being evictable until the transaction
513 * commits.
514 */ 519 */
515 if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) 520 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
516 btrfs_put_ordered_extent(ordered); 521 struct btrfs_ordered_inode_tree *tree;
517 else 522
518 list_add_tail(&ordered->trans_list, &trans->ordered); 523 tree = &BTRFS_I(inode)->ordered_tree;
519 524 spin_lock_irq(&tree->lock);
525 if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
526 set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
527 atomic_inc(&trans->transaction->pending_ordered);
528 }
529 spin_unlock_irq(&tree->lock);
530 }
531 btrfs_put_ordered_extent(ordered);
520 spin_lock_irq(&log->log_extents_lock[index]); 532 spin_lock_irq(&log->log_extents_lock[index]);
521 } 533 }
522 spin_unlock_irq(&log->log_extents_lock[index]); 534 spin_unlock_irq(&log->log_extents_lock[index]);
@@ -578,6 +590,7 @@ void btrfs_remove_ordered_extent(struct inode *inode,
578 struct btrfs_ordered_inode_tree *tree; 590 struct btrfs_ordered_inode_tree *tree;
579 struct btrfs_root *root = BTRFS_I(inode)->root; 591 struct btrfs_root *root = BTRFS_I(inode)->root;
580 struct rb_node *node; 592 struct rb_node *node;
593 bool dec_pending_ordered = false;
581 594
582 tree = &BTRFS_I(inode)->ordered_tree; 595 tree = &BTRFS_I(inode)->ordered_tree;
583 spin_lock_irq(&tree->lock); 596 spin_lock_irq(&tree->lock);
@@ -587,8 +600,37 @@ void btrfs_remove_ordered_extent(struct inode *inode,
587 if (tree->last == node) 600 if (tree->last == node)
588 tree->last = NULL; 601 tree->last = NULL;
589 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 602 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
603 if (test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags))
604 dec_pending_ordered = true;
590 spin_unlock_irq(&tree->lock); 605 spin_unlock_irq(&tree->lock);
591 606
607 /*
 608	 * The currently running transaction is waiting on us; we need to let it
609 * know that we're complete and wake it up.
610 */
611 if (dec_pending_ordered) {
612 struct btrfs_transaction *trans;
613
614 /*
 615		 * The checks for trans are just a formality; it should be set,
 616		 * but if it isn't we don't want to deref/assert under the spin
 617		 * lock. So be nice and check whether trans is set, but ASSERT()
 618		 * so that a developer will notice if it isn't.
619 */
620 spin_lock(&root->fs_info->trans_lock);
621 trans = root->fs_info->running_transaction;
622 if (trans)
623 atomic_inc(&trans->use_count);
624 spin_unlock(&root->fs_info->trans_lock);
625
626 ASSERT(trans);
627 if (trans) {
628 if (atomic_dec_and_test(&trans->pending_ordered))
629 wake_up(&trans->pending_wait);
630 btrfs_put_transaction(trans);
631 }
632 }
633
592 spin_lock(&root->ordered_extent_lock); 634 spin_lock(&root->ordered_extent_lock);
593 list_del_init(&entry->root_extent_list); 635 list_del_init(&entry->root_extent_list);
594 root->nr_ordered_extents--; 636 root->nr_ordered_extents--;
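
The list-based scheme this replaces kept every logged-but-incomplete ordered extent on trans->ordered, pinning inodes until the commit. The new mechanism is a bare completion counter: the log path sets BTRFS_ORDERED_PENDING and bumps transaction->pending_ordered, and the hunk above drops the count and wakes pending_wait when the extent finishes. A hedged userspace sketch of the pairing (names invented):

    #include <pthread.h>

    struct txn {
        pthread_mutex_t lock;
        pthread_cond_t  pending_wait;
        unsigned int    pending_ordered;
    };

    /* Log path: tag an extent the commit must wait for. */
    void txn_track(struct txn *t)
    {
        pthread_mutex_lock(&t->lock);
        t->pending_ordered++;
        pthread_mutex_unlock(&t->lock);
    }

    /* Ordered-extent completion: drop the count, wake the committer. */
    void txn_complete(struct txn *t)
    {
        pthread_mutex_lock(&t->lock);
        if (--t->pending_ordered == 0)
            pthread_cond_broadcast(&t->pending_wait);
        pthread_mutex_unlock(&t->lock);
    }

    /* Commit path: block until every tracked extent has completed. */
    void txn_wait_pending(struct txn *t)
    {
        pthread_mutex_lock(&t->lock);
        while (t->pending_ordered != 0)
            pthread_cond_wait(&t->pending_wait, &t->lock);
        pthread_mutex_unlock(&t->lock);
    }
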
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 7176cc0fe43f..23c96059cef2 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -73,6 +73,8 @@ struct btrfs_ordered_sum {
73 73
74#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent 74#define BTRFS_ORDERED_LOGGED 10 /* Set when we've waited on this ordered extent
75 * in the logging code. */ 75 * in the logging code. */
76#define BTRFS_ORDERED_PENDING 11 /* We are waiting for this ordered extent to
77 * complete in the current transaction. */
76struct btrfs_ordered_extent { 78struct btrfs_ordered_extent {
77 /* logical offset in the file */ 79 /* logical offset in the file */
78 u64 file_offset; 80 u64 file_offset;
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index dca137b04095..f9e60231f685 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -49,18 +49,16 @@ static struct prop_handler prop_handlers[] = {
49 .extract = prop_compression_extract, 49 .extract = prop_compression_extract,
50 .inheritable = 1 50 .inheritable = 1
51 }, 51 },
52 {
53 .xattr_name = NULL
54 }
55}; 52};
56 53
57void __init btrfs_props_init(void) 54void __init btrfs_props_init(void)
58{ 55{
59 struct prop_handler *p; 56 int i;
60 57
61 hash_init(prop_handlers_ht); 58 hash_init(prop_handlers_ht);
62 59
63 for (p = &prop_handlers[0]; p->xattr_name; p++) { 60 for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
61 struct prop_handler *p = &prop_handlers[i];
64 u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name)); 62 u64 h = btrfs_name_hash(p->xattr_name, strlen(p->xattr_name));
65 63
66 hash_add(prop_handlers_ht, &p->node, h); 64 hash_add(prop_handlers_ht, &p->node, h);
@@ -301,15 +299,16 @@ static int inherit_props(struct btrfs_trans_handle *trans,
301 struct inode *inode, 299 struct inode *inode,
302 struct inode *parent) 300 struct inode *parent)
303{ 301{
304 const struct prop_handler *h;
305 struct btrfs_root *root = BTRFS_I(inode)->root; 302 struct btrfs_root *root = BTRFS_I(inode)->root;
306 int ret; 303 int ret;
304 int i;
307 305
308 if (!test_bit(BTRFS_INODE_HAS_PROPS, 306 if (!test_bit(BTRFS_INODE_HAS_PROPS,
309 &BTRFS_I(parent)->runtime_flags)) 307 &BTRFS_I(parent)->runtime_flags))
310 return 0; 308 return 0;
311 309
312 for (h = &prop_handlers[0]; h->xattr_name; h++) { 310 for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) {
311 const struct prop_handler *h = &prop_handlers[i];
313 const char *value; 312 const char *value;
314 u64 num_bytes; 313 u64 num_bytes;
315 314
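
Dropping the NULL .xattr_name sentinel in favour of ARRAY_SIZE() iteration removes a dummy table entry and makes the bound a compile-time constant. The idiom in isolation (a hedged sketch, not the btrfs table itself):

    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    struct handler {
        const char *name;
    };

    static const struct handler handlers[] = {
        { .name = "compression" },
        /* new entries need no terminating { .name = NULL } element */
    };

    int main(void)
    {
        for (size_t i = 0; i < ARRAY_SIZE(handlers); i++)
            printf("%s\n", handlers[i].name);
        return 0;
    }
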
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d904ee1c5349..46476c226395 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1652,10 +1652,6 @@ static int qgroup_update_counters(struct btrfs_fs_info *fs_info,
1652 } 1652 }
1653 } 1653 }
1654 1654
1655 /* For exclusive extent, free its reserved bytes too */
1656 if (nr_old_roots == 0 && nr_new_roots == 1 &&
1657 cur_new_count == nr_new_roots)
1658 qg->reserved -= num_bytes;
1659 if (dirty) 1655 if (dirty)
1660 qgroup_dirty(fs_info, qg); 1656 qgroup_dirty(fs_info, qg);
1661 } 1657 }
@@ -2035,7 +2031,7 @@ out:
2035 return ret; 2031 return ret;
2036} 2032}
2037 2033
2038int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes) 2034static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes)
2039{ 2035{
2040 struct btrfs_root *quota_root; 2036 struct btrfs_root *quota_root;
2041 struct btrfs_qgroup *qgroup; 2037 struct btrfs_qgroup *qgroup;
@@ -2116,14 +2112,13 @@ out:
2116 return ret; 2112 return ret;
2117} 2113}
2118 2114
2119void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes) 2115void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
2116 u64 ref_root, u64 num_bytes)
2120{ 2117{
2121 struct btrfs_root *quota_root; 2118 struct btrfs_root *quota_root;
2122 struct btrfs_qgroup *qgroup; 2119 struct btrfs_qgroup *qgroup;
2123 struct btrfs_fs_info *fs_info = root->fs_info;
2124 struct ulist_node *unode; 2120 struct ulist_node *unode;
2125 struct ulist_iterator uiter; 2121 struct ulist_iterator uiter;
2126 u64 ref_root = root->root_key.objectid;
2127 int ret = 0; 2122 int ret = 0;
2128 2123
2129 if (!is_fstree(ref_root)) 2124 if (!is_fstree(ref_root))
@@ -2169,6 +2164,11 @@ out:
2169 spin_unlock(&fs_info->qgroup_lock); 2164 spin_unlock(&fs_info->qgroup_lock);
2170} 2165}
2171 2166
2167static inline void qgroup_free(struct btrfs_root *root, u64 num_bytes)
2168{
2169 return btrfs_qgroup_free_refroot(root->fs_info, root->objectid,
2170 num_bytes);
2171}
2172void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) 2172void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
2173{ 2173{
2174 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) 2174 if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq)
@@ -2188,10 +2188,10 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans)
2188 */ 2188 */
2189static int 2189static int
2190qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, 2190qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
2191 struct btrfs_trans_handle *trans, 2191 struct btrfs_trans_handle *trans)
2192 struct extent_buffer *scratch_leaf)
2193{ 2192{
2194 struct btrfs_key found; 2193 struct btrfs_key found;
2194 struct extent_buffer *scratch_leaf = NULL;
2195 struct ulist *roots = NULL; 2195 struct ulist *roots = NULL;
2196 struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); 2196 struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
2197 u64 num_bytes; 2197 u64 num_bytes;
@@ -2229,7 +2229,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
2229 fs_info->qgroup_rescan_progress.objectid = found.objectid + 1; 2229 fs_info->qgroup_rescan_progress.objectid = found.objectid + 1;
2230 2230
2231 btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); 2231 btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
2232 memcpy(scratch_leaf, path->nodes[0], sizeof(*scratch_leaf)); 2232 scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
2233 if (!scratch_leaf) {
2234 ret = -ENOMEM;
2235 mutex_unlock(&fs_info->qgroup_rescan_lock);
2236 goto out;
2237 }
2238 extent_buffer_get(scratch_leaf);
2239 btrfs_tree_read_lock(scratch_leaf);
2240 btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
2233 slot = path->slots[0]; 2241 slot = path->slots[0];
2234 btrfs_release_path(path); 2242 btrfs_release_path(path);
2235 mutex_unlock(&fs_info->qgroup_rescan_lock); 2243 mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -2255,6 +2263,10 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
2255 goto out; 2263 goto out;
2256 } 2264 }
2257out: 2265out:
2266 if (scratch_leaf) {
2267 btrfs_tree_read_unlock_blocking(scratch_leaf);
2268 free_extent_buffer(scratch_leaf);
2269 }
2258 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); 2270 btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
2259 2271
2260 return ret; 2272 return ret;
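Background for the scratch_leaf change above: struct extent_buffer embeds reference counts, locks and page pointers, so memcpy()ing one produced an unsafe shallow copy; btrfs_clone_extent_buffer() instead builds a private clone that can be read-locked and must later be dropped with free_extent_buffer(). A condensed sketch of the new pattern, with every call taken from the hunks above and error handling trimmed:

	scratch_leaf = btrfs_clone_extent_buffer(path->nodes[0]);
	if (!scratch_leaf)
		return -ENOMEM;
	extent_buffer_get(scratch_leaf);
	btrfs_tree_read_lock(scratch_leaf);
	btrfs_set_lock_blocking_rw(scratch_leaf, BTRFS_READ_LOCK);
	/* ... walk the clone without holding qgroup_rescan_lock ... */
	btrfs_tree_read_unlock_blocking(scratch_leaf);
	free_extent_buffer(scratch_leaf);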
@@ -2266,16 +2278,12 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2266 qgroup_rescan_work); 2278 qgroup_rescan_work);
2267 struct btrfs_path *path; 2279 struct btrfs_path *path;
2268 struct btrfs_trans_handle *trans = NULL; 2280 struct btrfs_trans_handle *trans = NULL;
2269 struct extent_buffer *scratch_leaf = NULL;
2270 int err = -ENOMEM; 2281 int err = -ENOMEM;
2271 int ret = 0; 2282 int ret = 0;
2272 2283
2273 path = btrfs_alloc_path(); 2284 path = btrfs_alloc_path();
2274 if (!path) 2285 if (!path)
2275 goto out; 2286 goto out;
2276 scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS);
2277 if (!scratch_leaf)
2278 goto out;
2279 2287
2280 err = 0; 2288 err = 0;
2281 while (!err) { 2289 while (!err) {
@@ -2287,8 +2295,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2287 if (!fs_info->quota_enabled) { 2295 if (!fs_info->quota_enabled) {
2288 err = -EINTR; 2296 err = -EINTR;
2289 } else { 2297 } else {
2290 err = qgroup_rescan_leaf(fs_info, path, trans, 2298 err = qgroup_rescan_leaf(fs_info, path, trans);
2291 scratch_leaf);
2292 } 2299 }
2293 if (err > 0) 2300 if (err > 0)
2294 btrfs_commit_transaction(trans, fs_info->fs_root); 2301 btrfs_commit_transaction(trans, fs_info->fs_root);
@@ -2297,7 +2304,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
2297 } 2304 }
2298 2305
2299out: 2306out:
2300 kfree(scratch_leaf);
2301 btrfs_free_path(path); 2307 btrfs_free_path(path);
2302 2308
2303 mutex_lock(&fs_info->qgroup_rescan_lock); 2309 mutex_lock(&fs_info->qgroup_rescan_lock);
@@ -2486,3 +2492,190 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
2486 btrfs_queue_work(fs_info->qgroup_rescan_workers, 2492 btrfs_queue_work(fs_info->qgroup_rescan_workers,
2487 &fs_info->qgroup_rescan_work); 2493 &fs_info->qgroup_rescan_work);
2488} 2494}
2495
2496/*
2497 * Reserve qgroup space for range [start, start + len).
2498 *
2499 * This function will either reserve space from related qgroups or do
2500 * nothing if the range is already reserved.
2501 *
2502 * Return 0 for successful reserve
2503 * Return <0 for error (including -EDQUOT)
2504 *
2505 * NOTE: this function may sleep for memory allocation.
2506 */
2507int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len)
2508{
2509 struct btrfs_root *root = BTRFS_I(inode)->root;
2510 struct extent_changeset changeset;
2511 struct ulist_node *unode;
2512 struct ulist_iterator uiter;
2513 int ret;
2514
2515 if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
2516 len == 0)
2517 return 0;
2518
2519 changeset.bytes_changed = 0;
2520 changeset.range_changed = ulist_alloc(GFP_NOFS);
2521 ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
2522 start + len - 1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
2523 &changeset);
2524 trace_btrfs_qgroup_reserve_data(inode, start, len,
2525 changeset.bytes_changed,
2526 QGROUP_RESERVE);
2527 if (ret < 0)
2528 goto cleanup;
2529 ret = qgroup_reserve(root, changeset.bytes_changed);
2530 if (ret < 0)
2531 goto cleanup;
2532
2533 ulist_free(changeset.range_changed);
2534 return ret;
2535
2536cleanup:
2537 /* cleanup already reserved ranges */
2538 ULIST_ITER_INIT(&uiter);
2539 while ((unode = ulist_next(changeset.range_changed, &uiter)))
2540 clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
2541 unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL,
2542 GFP_NOFS);
2543 ulist_free(changeset.range_changed);
2544 return ret;
2545}
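A hedged sketch of how a caller pairs these entry points, based only on the comments in this patch; example_write() and do_write() are hypothetical names invented for illustration:

static int example_write(struct inode *inode, u64 start, u64 len)
{
	int ret;

	ret = btrfs_qgroup_reserve_data(inode, start, len);
	if (ret < 0)	/* may be -EDQUOT when over quota */
		return ret;

	ret = do_write(inode, start, len);	/* hypothetical write path */
	if (ret < 0)
		/* Error before the data reached disk: give the bytes back. */
		btrfs_qgroup_free_data(inode, start, len);
	else
		/* Data on disk: clear the io_tree bits, keep the accounting. */
		btrfs_qgroup_release_data(inode, start, len);
	return ret;
}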
2546
2547static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len,
2548 int free)
2549{
2550 struct extent_changeset changeset;
2551 int trace_op = QGROUP_RELEASE;
2552 int ret;
2553
2554 changeset.bytes_changed = 0;
2555 changeset.range_changed = ulist_alloc(GFP_NOFS);
2556 if (!changeset.range_changed)
2557 return -ENOMEM;
2558
2559 ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
2560 start + len - 1, EXTENT_QGROUP_RESERVED, GFP_NOFS,
2561 &changeset);
2562 if (ret < 0)
2563 goto out;
2564
2565 if (free) {
2566 qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
2567 trace_op = QGROUP_FREE;
2568 }
2569 trace_btrfs_qgroup_release_data(inode, start, len,
2570 changeset.bytes_changed, trace_op);
2571out:
2572 ulist_free(changeset.range_changed);
2573 return ret;
2574}
2575
2576/*
2577 * Free a reserved space range from io_tree and related qgroups
2578 *
2579 * Should be called when a range of pages gets invalidated before reaching
2580 * disk, or for the error cleanup case.
2581 *
2582 * For data written to disk, use btrfs_qgroup_release_data().
2583 *
2584 * NOTE: This function may sleep for memory allocation.
2585 */
2586int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len)
2587{
2588 return __btrfs_qgroup_release_data(inode, start, len, 1);
2589}
2590
2591/*
2592 * Release a reserved space range from io_tree only.
2593 *
2594 * Should be called when a range of pages gets written to disk and the
2595 * corresponding FILE_EXTENT is inserted into the corresponding root.
2596 *
2597 * Since the new qgroup accounting framework only updates qgroup numbers at
2598 * commit_transaction() time, the reserved space shouldn't be freed from the
2599 * related qgroups.
2600 *
2601 * But we should release the range from the io_tree, to allow further
2602 * writes to be COWed.
2603 *
2604 * NOTE: This function may sleep for memory allocation.
2605 */
2606int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len)
2607{
2608 return __btrfs_qgroup_release_data(inode, start, len, 0);
2609}
2610
2611int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes)
2612{
2613 int ret;
2614
2615 if (!root->fs_info->quota_enabled || !is_fstree(root->objectid) ||
2616 num_bytes == 0)
2617 return 0;
2618
2619 BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
2620 ret = qgroup_reserve(root, num_bytes);
2621 if (ret < 0)
2622 return ret;
2623 atomic_add(num_bytes, &root->qgroup_meta_rsv);
2624 return ret;
2625}
2626
2627void btrfs_qgroup_free_meta_all(struct btrfs_root *root)
2628{
2629 int reserved;
2630
2631 if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
2632 return;
2633
2634 reserved = atomic_xchg(&root->qgroup_meta_rsv, 0);
2635 if (reserved == 0)
2636 return;
2637 qgroup_free(root, reserved);
2638}
2639
2640void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes)
2641{
2642 if (!root->fs_info->quota_enabled || !is_fstree(root->objectid))
2643 return;
2644
2645 BUG_ON(num_bytes != round_down(num_bytes, root->nodesize));
2646 WARN_ON(atomic_read(&root->qgroup_meta_rsv) < num_bytes);
2647 atomic_sub(num_bytes, &root->qgroup_meta_rsv);
2648 qgroup_free(root, num_bytes);
2649}
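The metadata variants must be fed nodesize-aligned sizes (the BUG_ONs above enforce it) and paired on failure paths. A minimal sketch of the expected pairing, mirroring the start_transaction() hunk later in this pull; num_items and root are assumed from the caller's context:

	u64 qgroup_reserved = num_items * root->nodesize;	/* aligned */
	int ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
	if (ret)
		return ERR_PTR(ret);
	/* ... and on any later failure path: */
	btrfs_qgroup_free_meta(root, qgroup_reserved);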
2650
2651/*
2652 * Check for leaked qgroup reserved space, normally at inode destroy
2653 * time.
2654 */
2655void btrfs_qgroup_check_reserved_leak(struct inode *inode)
2656{
2657 struct extent_changeset changeset;
2658 struct ulist_node *unode;
2659 struct ulist_iterator iter;
2660 int ret;
2661
2662 changeset.bytes_changed = 0;
2663 changeset.range_changed = ulist_alloc(GFP_NOFS);
2664 if (WARN_ON(!changeset.range_changed))
2665 return;
2666
2667 ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
2668 EXTENT_QGROUP_RESERVED, GFP_NOFS, &changeset);
2669
2670 WARN_ON(ret < 0);
2671 if (WARN_ON(changeset.bytes_changed)) {
2672 ULIST_ITER_INIT(&iter);
2673 while ((unode = ulist_next(changeset.range_changed, &iter))) {
2674 btrfs_warn(BTRFS_I(inode)->root->fs_info,
2675 "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
2676 inode->i_ino, unode->val, unode->aux);
2677 }
2678 qgroup_free(BTRFS_I(inode)->root, changeset.bytes_changed);
2679 }
2680 ulist_free(changeset.range_changed);
2681}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 6387dcfa354c..ecb2c143ef75 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -33,6 +33,13 @@ struct btrfs_qgroup_extent_record {
33 struct ulist *old_roots; 33 struct ulist *old_roots;
34}; 34};
35 35
36/*
37 * For qgroup event trace points only
38 */
39#define QGROUP_RESERVE (1<<0)
40#define QGROUP_RELEASE (1<<1)
41#define QGROUP_FREE (1<<2)
42
36int btrfs_quota_enable(struct btrfs_trans_handle *trans, 43int btrfs_quota_enable(struct btrfs_trans_handle *trans,
37 struct btrfs_fs_info *fs_info); 44 struct btrfs_fs_info *fs_info);
38int btrfs_quota_disable(struct btrfs_trans_handle *trans, 45int btrfs_quota_disable(struct btrfs_trans_handle *trans,
@@ -71,9 +78,18 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
71int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, 78int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
72 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid, 79 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
73 struct btrfs_qgroup_inherit *inherit); 80 struct btrfs_qgroup_inherit *inherit);
74int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes); 81void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
75void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes); 82 u64 ref_root, u64 num_bytes);
76 83/*
84 * TODO: Add a proper trace point for this; btrfs_qgroup_free() is called
85 * from everywhere, so we can't provide a good trace for the delayed ref case.
86 */
87static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
88 u64 ref_root, u64 num_bytes)
89{
90 btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes);
91 trace_btrfs_qgroup_free_delayed_ref(ref_root, num_bytes);
92}
77void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); 93void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
78 94
79#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 95#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -81,4 +97,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
81 u64 rfer, u64 excl); 97 u64 rfer, u64 excl);
82#endif 98#endif
83 99
100/* New io_tree based accurate qgroup reserve API */
101int btrfs_qgroup_reserve_data(struct inode *inode, u64 start, u64 len);
102int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
103int btrfs_qgroup_free_data(struct inode *inode, u64 start, u64 len);
104
105int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes);
106void btrfs_qgroup_free_meta_all(struct btrfs_root *root);
107void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes);
108void btrfs_qgroup_check_reserved_leak(struct inode *inode);
84#endif /* __BTRFS_QGROUP__ */ 109#endif /* __BTRFS_QGROUP__ */
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index fcf7265ca46f..1a33d3eb36de 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -810,7 +810,11 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
810 } 810 }
811 811
812 goto done_nolock; 812 goto done_nolock;
813 } else if (waitqueue_active(&h->wait)) { 813 /*
814 * The barrier for this waitqueue_active() is not needed;
815 * we're protected by h->lock and can't miss a wakeup.
816 */
817 } else if (waitqueue_active(&h->wait)) {
814 spin_unlock(&rbio->bio_list_lock); 818 spin_unlock(&rbio->bio_list_lock);
815 spin_unlock_irqrestore(&h->lock, flags); 819 spin_unlock_irqrestore(&h->lock, flags);
816 wake_up(&h->wait); 820 wake_up(&h->wait);
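For readers unfamiliar with the idiom the new comment references: waitqueue_active() is a lockless check, so in the general case the waker needs a memory barrier between updating the wake-up condition and testing the queue. A generic sketch of that idiom, not taken from this patch:

/*
 *   waker                            sleeper
 *   -----                            -------
 *   cond = true;                     prepare_to_wait(&wq, &wait, state);
 *   smp_mb();                        if (!cond)
 *   if (waitqueue_active(&wq))               schedule();
 *           wake_up(&wq);            finish_wait(&wq, &wait);
 *
 * In unlock_stripe() both sides take h->lock, which already orders the
 * condition update against the queue check, so the barrier can be
 * omitted - exactly what the added comment documents.
 */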
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 4645cd16d5ba..619f92963e27 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -569,7 +569,7 @@ static int reada_add_block(struct reada_control *rc, u64 logical,
569 rec = kzalloc(sizeof(*rec), GFP_NOFS); 569 rec = kzalloc(sizeof(*rec), GFP_NOFS);
570 if (!rec) { 570 if (!rec) {
571 reada_extent_put(root->fs_info, re); 571 reada_extent_put(root->fs_info, re);
572 return -1; 572 return -ENOMEM;
573 } 573 }
574 574
575 rec->rc = rc; 575 rec->rc = rc;
@@ -918,6 +918,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
918 u64 start; 918 u64 start;
919 u64 generation; 919 u64 generation;
920 int level; 920 int level;
921 int ret;
921 struct extent_buffer *node; 922 struct extent_buffer *node;
922 static struct btrfs_key max_key = { 923 static struct btrfs_key max_key = {
923 .objectid = (u64)-1, 924 .objectid = (u64)-1,
@@ -943,9 +944,10 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root,
943 generation = btrfs_header_generation(node); 944 generation = btrfs_header_generation(node);
944 free_extent_buffer(node); 945 free_extent_buffer(node);
945 946
946 if (reada_add_block(rc, start, &max_key, level, generation)) { 947 ret = reada_add_block(rc, start, &max_key, level, generation);
948 if (ret) {
947 kfree(rc); 949 kfree(rc);
948 return ERR_PTR(-ENOMEM); 950 return ERR_PTR(ret);
949 } 951 }
950 952
951 reada_start_machine(root->fs_info); 953 reada_start_machine(root->fs_info);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 303babeef505..b4ca5454ef1a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1716,7 +1716,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
1716 ret = btrfs_inc_extent_ref(trans, root, new_bytenr, 1716 ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
1717 num_bytes, parent, 1717 num_bytes, parent,
1718 btrfs_header_owner(leaf), 1718 btrfs_header_owner(leaf),
1719 key.objectid, key.offset, 1); 1719 key.objectid, key.offset);
1720 if (ret) { 1720 if (ret) {
1721 btrfs_abort_transaction(trans, root, ret); 1721 btrfs_abort_transaction(trans, root, ret);
1722 break; 1722 break;
@@ -1724,7 +1724,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
1724 1724
1725 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1725 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
1726 parent, btrfs_header_owner(leaf), 1726 parent, btrfs_header_owner(leaf),
1727 key.objectid, key.offset, 1); 1727 key.objectid, key.offset);
1728 if (ret) { 1728 if (ret) {
1729 btrfs_abort_transaction(trans, root, ret); 1729 btrfs_abort_transaction(trans, root, ret);
1730 break; 1730 break;
@@ -1900,23 +1900,21 @@ again:
1900 1900
1901 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize, 1901 ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
1902 path->nodes[level]->start, 1902 path->nodes[level]->start,
1903 src->root_key.objectid, level - 1, 0, 1903 src->root_key.objectid, level - 1, 0);
1904 1);
1905 BUG_ON(ret); 1904 BUG_ON(ret);
1906 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize, 1905 ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
1907 0, dest->root_key.objectid, level - 1, 1906 0, dest->root_key.objectid, level - 1,
1908 0, 1); 1907 0);
1909 BUG_ON(ret); 1908 BUG_ON(ret);
1910 1909
1911 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize, 1910 ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
1912 path->nodes[level]->start, 1911 path->nodes[level]->start,
1913 src->root_key.objectid, level - 1, 0, 1912 src->root_key.objectid, level - 1, 0);
1914 1);
1915 BUG_ON(ret); 1913 BUG_ON(ret);
1916 1914
1917 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize, 1915 ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
1918 0, dest->root_key.objectid, level - 1, 1916 0, dest->root_key.objectid, level - 1,
1919 0, 1); 1917 0);
1920 BUG_ON(ret); 1918 BUG_ON(ret);
1921 1919
1922 btrfs_unlock_up_safe(path, 0); 1920 btrfs_unlock_up_safe(path, 0);
@@ -2418,7 +2416,7 @@ again:
2418 } 2416 }
2419out: 2417out:
2420 if (ret) { 2418 if (ret) {
2421 btrfs_std_error(root->fs_info, ret); 2419 btrfs_std_error(root->fs_info, ret, NULL);
2422 if (!list_empty(&reloc_roots)) 2420 if (!list_empty(&reloc_roots))
2423 free_reloc_roots(&reloc_roots); 2421 free_reloc_roots(&reloc_roots);
2424 2422
@@ -2745,7 +2743,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2745 node->eb->start, blocksize, 2743 node->eb->start, blocksize,
2746 upper->eb->start, 2744 upper->eb->start,
2747 btrfs_header_owner(upper->eb), 2745 btrfs_header_owner(upper->eb),
2748 node->level, 0, 1); 2746 node->level, 0);
2749 BUG_ON(ret); 2747 BUG_ON(ret);
2750 2748
2751 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2749 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -3034,8 +3032,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
3034 BUG_ON(cluster->start != cluster->boundary[0]); 3032 BUG_ON(cluster->start != cluster->boundary[0]);
3035 mutex_lock(&inode->i_mutex); 3033 mutex_lock(&inode->i_mutex);
3036 3034
3037 ret = btrfs_check_data_free_space(inode, cluster->end + 3035 ret = btrfs_check_data_free_space(inode, cluster->start,
3038 1 - cluster->start, 0); 3036 cluster->end + 1 - cluster->start);
3039 if (ret) 3037 if (ret)
3040 goto out; 3038 goto out;
3041 3039
@@ -3056,8 +3054,8 @@ int prealloc_file_extent_cluster(struct inode *inode,
3056 break; 3054 break;
3057 nr++; 3055 nr++;
3058 } 3056 }
3059 btrfs_free_reserved_data_space(inode, cluster->end + 3057 btrfs_free_reserved_data_space(inode, cluster->start,
3060 1 - cluster->start); 3058 cluster->end + 1 - cluster->start);
3061out: 3059out:
3062 mutex_unlock(&inode->i_mutex); 3060 mutex_unlock(&inode->i_mutex);
3063 return ret; 3061 return ret;
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 360a728a639f..7cf8509deda7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -45,12 +45,13 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
45 if (!need_reset && btrfs_root_generation(item) 45 if (!need_reset && btrfs_root_generation(item)
46 != btrfs_root_generation_v2(item)) { 46 != btrfs_root_generation_v2(item)) {
47 if (btrfs_root_generation_v2(item) != 0) { 47 if (btrfs_root_generation_v2(item) != 0) {
48 printk(KERN_WARNING "BTRFS: mismatching " 48 btrfs_warn(eb->fs_info,
49 "mismatching "
49 "generation and generation_v2 " 50 "generation and generation_v2 "
50 "found in root item. This root " 51 "found in root item. This root "
51 "was probably mounted with an " 52 "was probably mounted with an "
52 "older kernel. Resetting all " 53 "older kernel. Resetting all "
53 "new fields.\n"); 54 "new fields.");
54 } 55 }
55 need_reset = 1; 56 need_reset = 1;
56 } 57 }
@@ -141,7 +142,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
141 int ret; 142 int ret;
142 int slot; 143 int slot;
143 unsigned long ptr; 144 unsigned long ptr;
144 int old_len; 145 u32 old_len;
145 146
146 path = btrfs_alloc_path(); 147 path = btrfs_alloc_path();
147 if (!path) 148 if (!path)
@@ -283,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
283 trans = btrfs_join_transaction(tree_root); 284 trans = btrfs_join_transaction(tree_root);
284 if (IS_ERR(trans)) { 285 if (IS_ERR(trans)) {
285 err = PTR_ERR(trans); 286 err = PTR_ERR(trans);
286 btrfs_error(tree_root->fs_info, err, 287 btrfs_std_error(tree_root->fs_info, err,
287 "Failed to start trans to delete " 288 "Failed to start trans to delete "
288 "orphan item"); 289 "orphan item");
289 break; 290 break;
@@ -292,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
292 root_key.objectid); 293 root_key.objectid);
293 btrfs_end_transaction(trans, tree_root); 294 btrfs_end_transaction(trans, tree_root);
294 if (err) { 295 if (err) {
295 btrfs_error(tree_root->fs_info, err, 296 btrfs_std_error(tree_root->fs_info, err,
296 "Failed to delete root orphan " 297 "Failed to delete root orphan "
297 "item"); 298 "item");
298 break; 299 break;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a39f5d1144e8..550de89a8661 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -580,9 +580,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
580 * hold all of the paths here 580 * hold all of the paths here
581 */ 581 */
582 for (i = 0; i < ipath->fspath->elem_cnt; ++i) 582 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
583 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " 583 btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
584 "%s, sector %llu, root %llu, inode %llu, offset %llu, " 584 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
585 "length %llu, links %u (path: %s)\n", swarn->errstr, 585 "length %llu, links %u (path: %s)", swarn->errstr,
586 swarn->logical, rcu_str_deref(swarn->dev->name), 586 swarn->logical, rcu_str_deref(swarn->dev->name),
587 (unsigned long long)swarn->sector, root, inum, offset, 587 (unsigned long long)swarn->sector, root, inum, offset,
588 min(isize - offset, (u64)PAGE_SIZE), nlink, 588 min(isize - offset, (u64)PAGE_SIZE), nlink,
@@ -592,9 +592,9 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
592 return 0; 592 return 0;
593 593
594err: 594err:
595 printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev " 595 btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev "
596 "%s, sector %llu, root %llu, inode %llu, offset %llu: path " 596 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
597 "resolving failed with ret=%d\n", swarn->errstr, 597 "resolving failed with ret=%d", swarn->errstr,
598 swarn->logical, rcu_str_deref(swarn->dev->name), 598 swarn->logical, rcu_str_deref(swarn->dev->name),
599 (unsigned long long)swarn->sector, root, inum, offset, ret); 599 (unsigned long long)swarn->sector, root, inum, offset, ret);
600 600
@@ -649,10 +649,10 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
649 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei, 649 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
650 item_size, &ref_root, 650 item_size, &ref_root,
651 &ref_level); 651 &ref_level);
652 printk_in_rcu(KERN_WARNING 652 btrfs_warn_in_rcu(fs_info,
653 "BTRFS: %s at logical %llu on dev %s, " 653 "%s at logical %llu on dev %s, "
654 "sector %llu: metadata %s (level %d) in tree " 654 "sector %llu: metadata %s (level %d) in tree "
655 "%llu\n", errstr, swarn.logical, 655 "%llu", errstr, swarn.logical,
656 rcu_str_deref(dev->name), 656 rcu_str_deref(dev->name),
657 (unsigned long long)swarn.sector, 657 (unsigned long long)swarn.sector,
658 ref_level ? "node" : "leaf", 658 ref_level ? "node" : "leaf",
@@ -850,8 +850,8 @@ out:
850 btrfs_dev_replace_stats_inc( 850 btrfs_dev_replace_stats_inc(
851 &sctx->dev_root->fs_info->dev_replace. 851 &sctx->dev_root->fs_info->dev_replace.
852 num_uncorrectable_read_errors); 852 num_uncorrectable_read_errors);
853 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " 853 btrfs_err_rl_in_rcu(sctx->dev_root->fs_info,
854 "unable to fixup (nodatasum) error at logical %llu on dev %s\n", 854 "unable to fixup (nodatasum) error at logical %llu on dev %s",
855 fixup->logical, rcu_str_deref(fixup->dev->name)); 855 fixup->logical, rcu_str_deref(fixup->dev->name));
856 } 856 }
857 857
@@ -1230,8 +1230,8 @@ corrected_error:
1230 sctx->stat.corrected_errors++; 1230 sctx->stat.corrected_errors++;
1231 sblock_to_check->data_corrected = 1; 1231 sblock_to_check->data_corrected = 1;
1232 spin_unlock(&sctx->stat_lock); 1232 spin_unlock(&sctx->stat_lock);
1233 printk_ratelimited_in_rcu(KERN_ERR 1233 btrfs_err_rl_in_rcu(fs_info,
1234 "BTRFS: fixed up error at logical %llu on dev %s\n", 1234 "fixed up error at logical %llu on dev %s",
1235 logical, rcu_str_deref(dev->name)); 1235 logical, rcu_str_deref(dev->name));
1236 } 1236 }
1237 } else { 1237 } else {
@@ -1239,8 +1239,8 @@ did_not_correct_error:
1239 spin_lock(&sctx->stat_lock); 1239 spin_lock(&sctx->stat_lock);
1240 sctx->stat.uncorrectable_errors++; 1240 sctx->stat.uncorrectable_errors++;
1241 spin_unlock(&sctx->stat_lock); 1241 spin_unlock(&sctx->stat_lock);
1242 printk_ratelimited_in_rcu(KERN_ERR 1242 btrfs_err_rl_in_rcu(fs_info,
1243 "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n", 1243 "unable to fixup (regular) error at logical %llu on dev %s",
1244 logical, rcu_str_deref(dev->name)); 1244 logical, rcu_str_deref(dev->name));
1245 } 1245 }
1246 1246
@@ -1626,9 +1626,9 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1626 int ret; 1626 int ret;
1627 1627
1628 if (!page_bad->dev->bdev) { 1628 if (!page_bad->dev->bdev) {
1629 printk_ratelimited(KERN_WARNING "BTRFS: " 1629 btrfs_warn_rl(sblock_bad->sctx->dev_root->fs_info,
1630 "scrub_repair_page_from_good_copy(bdev == NULL) " 1630 "scrub_repair_page_from_good_copy(bdev == NULL) "
1631 "is unexpected!\n"); 1631 "is unexpected");
1632 return -EIO; 1632 return -EIO;
1633 } 1633 }
1634 1634
@@ -2201,15 +2201,15 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
2201 spin_lock(&sctx->stat_lock); 2201 spin_lock(&sctx->stat_lock);
2202 sctx->stat.read_errors++; 2202 sctx->stat.read_errors++;
2203 spin_unlock(&sctx->stat_lock); 2203 spin_unlock(&sctx->stat_lock);
2204 printk_ratelimited_in_rcu(KERN_ERR 2204 btrfs_err_rl_in_rcu(fs_info,
2205 "BTRFS: I/O error rebulding logical %llu for dev %s\n", 2205 "IO error rebuilding logical %llu for dev %s",
2206 logical, rcu_str_deref(dev->name)); 2206 logical, rcu_str_deref(dev->name));
2207 } else if (sblock->header_error || sblock->checksum_error) { 2207 } else if (sblock->header_error || sblock->checksum_error) {
2208 spin_lock(&sctx->stat_lock); 2208 spin_lock(&sctx->stat_lock);
2209 sctx->stat.uncorrectable_errors++; 2209 sctx->stat.uncorrectable_errors++;
2210 spin_unlock(&sctx->stat_lock); 2210 spin_unlock(&sctx->stat_lock);
2211 printk_ratelimited_in_rcu(KERN_ERR 2211 btrfs_err_rl_in_rcu(fs_info,
2212 "BTRFS: failed to rebuild valid logical %llu for dev %s\n", 2212 "failed to rebuild valid logical %llu for dev %s",
2213 logical, rcu_str_deref(dev->name)); 2213 logical, rcu_str_deref(dev->name));
2214 } else { 2214 } else {
2215 scrub_write_block_to_dev_replace(sblock); 2215 scrub_write_block_to_dev_replace(sblock);
@@ -4375,8 +4375,8 @@ static int write_page_nocow(struct scrub_ctx *sctx,
4375 if (!dev) 4375 if (!dev)
4376 return -EIO; 4376 return -EIO;
4377 if (!dev->bdev) { 4377 if (!dev->bdev) {
4378 printk_ratelimited(KERN_WARNING 4378 btrfs_warn_rl(dev->dev_root->fs_info,
4379 "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n"); 4379 "scrub write_page_nocow(bdev == NULL) is unexpected");
4380 return -EIO; 4380 return -EIO;
4381 } 4381 }
4382 bio = btrfs_io_bio_alloc(GFP_NOFS, 1); 4382 bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index a739b825bdd3..355a458cba1a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1434,16 +1434,6 @@ verbose_printk(KERN_DEBUG "btrfs: find_extent_clone: data_offset=%llu, "
1434 } 1434 }
1435 1435
1436 if (cur_clone_root) { 1436 if (cur_clone_root) {
1437 if (compressed != BTRFS_COMPRESS_NONE) {
1438 /*
1439 * Offsets given by iterate_extent_inodes() are relative
1440 * to the start of the extent, we need to add logical
1441 * offset from the file extent item.
1442 * (See why at backref.c:check_extent_in_eb())
1443 */
1444 cur_clone_root->offset += btrfs_file_extent_offset(eb,
1445 fi);
1446 }
1447 *found = cur_clone_root; 1437 *found = cur_clone_root;
1448 ret = 0; 1438 ret = 0;
1449 } else { 1439 } else {
@@ -2353,8 +2343,14 @@ static int send_subvol_begin(struct send_ctx *sctx)
2353 } 2343 }
2354 2344
2355 TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen); 2345 TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);
2356 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID, 2346
2357 sctx->send_root->root_item.uuid); 2347 if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
2348 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2349 sctx->send_root->root_item.received_uuid);
2350 else
2351 TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
2352 sctx->send_root->root_item.uuid);
2353
2358 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, 2354 TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
2359 le64_to_cpu(sctx->send_root->root_item.ctransid)); 2355 le64_to_cpu(sctx->send_root->root_item.ctransid));
2360 if (parent_root) { 2356 if (parent_root) {
@@ -2564,7 +2560,7 @@ verbose_printk("btrfs: send_create_inode %llu\n", ino);
2564 } else if (S_ISSOCK(mode)) { 2560 } else if (S_ISSOCK(mode)) {
2565 cmd = BTRFS_SEND_C_MKSOCK; 2561 cmd = BTRFS_SEND_C_MKSOCK;
2566 } else { 2562 } else {
2567 printk(KERN_WARNING "btrfs: unexpected inode type %o", 2563 btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
2568 (int)(mode & S_IFMT)); 2564 (int)(mode & S_IFMT));
2569 ret = -ENOTSUPP; 2565 ret = -ENOTSUPP;
2570 goto out; 2566 goto out;
@@ -4687,6 +4683,171 @@ tlv_put_failure:
4687 return ret; 4683 return ret;
4688} 4684}
4689 4685
4686static int send_extent_data(struct send_ctx *sctx,
4687 const u64 offset,
4688 const u64 len)
4689{
4690 u64 sent = 0;
4691
4692 if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
4693 return send_update_extent(sctx, offset, len);
4694
4695 while (sent < len) {
4696 u64 size = len - sent;
4697 int ret;
4698
4699 if (size > BTRFS_SEND_READ_SIZE)
4700 size = BTRFS_SEND_READ_SIZE;
4701 ret = send_write(sctx, offset + sent, size);
4702 if (ret < 0)
4703 return ret;
4704 if (!ret)
4705 break;
4706 sent += ret;
4707 }
4708 return 0;
4709}
4710
4711static int clone_range(struct send_ctx *sctx,
4712 struct clone_root *clone_root,
4713 const u64 disk_byte,
4714 u64 data_offset,
4715 u64 offset,
4716 u64 len)
4717{
4718 struct btrfs_path *path;
4719 struct btrfs_key key;
4720 int ret;
4721
4722 path = alloc_path_for_send();
4723 if (!path)
4724 return -ENOMEM;
4725
4726 /*
4727 * We can't send a clone operation for the entire range if we find
4728 * extent items in the respective range in the source file that
4729 * refer to different extents or if we find holes.
4730 * So check for that and do a mix of clone and regular write/copy
4731 * operations if needed.
4732 *
4733 * Example:
4734 *
4735 * mkfs.btrfs -f /dev/sda
4736 * mount /dev/sda /mnt
4737 * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
4738 * cp --reflink=always /mnt/foo /mnt/bar
4739 * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
4740 * btrfs subvolume snapshot -r /mnt /mnt/snap
4741 *
4742 * If, when we send the snapshot and are processing file bar (which
4743 * has a higher inode number than foo), we blindly send a clone operation
4744 * for the [0, 100K[ range from foo to bar, the receiver ends up getting
4745 * a file bar that matches the content of file foo - iow, it doesn't match
4746 * the content of bar in the original filesystem.
4747 */
4748 key.objectid = clone_root->ino;
4749 key.type = BTRFS_EXTENT_DATA_KEY;
4750 key.offset = clone_root->offset;
4751 ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
4752 if (ret < 0)
4753 goto out;
4754 if (ret > 0 && path->slots[0] > 0) {
4755 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
4756 if (key.objectid == clone_root->ino &&
4757 key.type == BTRFS_EXTENT_DATA_KEY)
4758 path->slots[0]--;
4759 }
4760
4761 while (true) {
4762 struct extent_buffer *leaf = path->nodes[0];
4763 int slot = path->slots[0];
4764 struct btrfs_file_extent_item *ei;
4765 u8 type;
4766 u64 ext_len;
4767 u64 clone_len;
4768
4769 if (slot >= btrfs_header_nritems(leaf)) {
4770 ret = btrfs_next_leaf(clone_root->root, path);
4771 if (ret < 0)
4772 goto out;
4773 else if (ret > 0)
4774 break;
4775 continue;
4776 }
4777
4778 btrfs_item_key_to_cpu(leaf, &key, slot);
4779
4780 /*
4781 * We might have an implicit trailing hole (NO_HOLES feature
4782 * enabled). We deal with it after leaving this loop.
4783 */
4784 if (key.objectid != clone_root->ino ||
4785 key.type != BTRFS_EXTENT_DATA_KEY)
4786 break;
4787
4788 ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4789 type = btrfs_file_extent_type(leaf, ei);
4790 if (type == BTRFS_FILE_EXTENT_INLINE) {
4791 ext_len = btrfs_file_extent_inline_len(leaf, slot, ei);
4792 ext_len = PAGE_CACHE_ALIGN(ext_len);
4793 } else {
4794 ext_len = btrfs_file_extent_num_bytes(leaf, ei);
4795 }
4796
4797 if (key.offset + ext_len <= clone_root->offset)
4798 goto next;
4799
4800 if (key.offset > clone_root->offset) {
4801 /* Implicit hole, NO_HOLES feature enabled. */
4802 u64 hole_len = key.offset - clone_root->offset;
4803
4804 if (hole_len > len)
4805 hole_len = len;
4806 ret = send_extent_data(sctx, offset, hole_len);
4807 if (ret < 0)
4808 goto out;
4809
4810 len -= hole_len;
4811 if (len == 0)
4812 break;
4813 offset += hole_len;
4814 clone_root->offset += hole_len;
4815 data_offset += hole_len;
4816 }
4817
4818 if (key.offset >= clone_root->offset + len)
4819 break;
4820
4821 clone_len = min_t(u64, ext_len, len);
4822
4823 if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
4824 btrfs_file_extent_offset(leaf, ei) == data_offset)
4825 ret = send_clone(sctx, offset, clone_len, clone_root);
4826 else
4827 ret = send_extent_data(sctx, offset, clone_len);
4828
4829 if (ret < 0)
4830 goto out;
4831
4832 len -= clone_len;
4833 if (len == 0)
4834 break;
4835 offset += clone_len;
4836 clone_root->offset += clone_len;
4837 data_offset += clone_len;
4838next:
4839 path->slots[0]++;
4840 }
4841
4842 if (len > 0)
4843 ret = send_extent_data(sctx, offset, len);
4844 else
4845 ret = 0;
4846out:
4847 btrfs_free_path(path);
4848 return ret;
4849}
4850
4690static int send_write_or_clone(struct send_ctx *sctx, 4851static int send_write_or_clone(struct send_ctx *sctx,
4691 struct btrfs_path *path, 4852 struct btrfs_path *path,
4692 struct btrfs_key *key, 4853 struct btrfs_key *key,
@@ -4695,9 +4856,7 @@ static int send_write_or_clone(struct send_ctx *sctx,
4695 int ret = 0; 4856 int ret = 0;
4696 struct btrfs_file_extent_item *ei; 4857 struct btrfs_file_extent_item *ei;
4697 u64 offset = key->offset; 4858 u64 offset = key->offset;
4698 u64 pos = 0;
4699 u64 len; 4859 u64 len;
4700 u32 l;
4701 u8 type; 4860 u8 type;
4702 u64 bs = sctx->send_root->fs_info->sb->s_blocksize; 4861 u64 bs = sctx->send_root->fs_info->sb->s_blocksize;
4703 4862
@@ -4725,22 +4884,15 @@ static int send_write_or_clone(struct send_ctx *sctx,
4725 } 4884 }
4726 4885
4727 if (clone_root && IS_ALIGNED(offset + len, bs)) { 4886 if (clone_root && IS_ALIGNED(offset + len, bs)) {
4728 ret = send_clone(sctx, offset, len, clone_root); 4887 u64 disk_byte;
4729 } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { 4888 u64 data_offset;
4730 ret = send_update_extent(sctx, offset, len); 4889
4890 disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
4891 data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
4892 ret = clone_range(sctx, clone_root, disk_byte, data_offset,
4893 offset, len);
4731 } else { 4894 } else {
4732 while (pos < len) { 4895 ret = send_extent_data(sctx, offset, len);
4733 l = len - pos;
4734 if (l > BTRFS_SEND_READ_SIZE)
4735 l = BTRFS_SEND_READ_SIZE;
4736 ret = send_write(sctx, pos + offset, l);
4737 if (ret < 0)
4738 goto out;
4739 if (!ret)
4740 break;
4741 pos += ret;
4742 }
4743 ret = 0;
4744 } 4896 }
4745out: 4897out:
4746 return ret; 4898 return ret;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 11d1eab9234d..24154e422945 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -130,7 +130,6 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
130 } 130 }
131} 131}
132 132
133#ifdef CONFIG_PRINTK
134/* 133/*
135 * __btrfs_std_error decodes expected errors from the caller and 134 * __btrfs_std_error decodes expected errors from the caller and
136 * invokes the appropriate error response. 135 * invokes the appropriate error response.
@@ -140,7 +139,9 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
140 unsigned int line, int errno, const char *fmt, ...) 139 unsigned int line, int errno, const char *fmt, ...)
141{ 140{
142 struct super_block *sb = fs_info->sb; 141 struct super_block *sb = fs_info->sb;
142#ifdef CONFIG_PRINTK
143 const char *errstr; 143 const char *errstr;
144#endif
144 145
145 /* 146 /*
146 * Special case: if the error is EROFS, and we're already 147 * Special case: if the error is EROFS, and we're already
@@ -149,6 +150,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
149 if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) 150 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
150 return; 151 return;
151 152
153#ifdef CONFIG_PRINTK
152 errstr = btrfs_decode_error(errno); 154 errstr = btrfs_decode_error(errno);
153 if (fmt) { 155 if (fmt) {
154 struct va_format vaf; 156 struct va_format vaf;
@@ -166,6 +168,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
166 printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n", 168 printk(KERN_CRIT "BTRFS: error (device %s) in %s:%d: errno=%d %s\n",
167 sb->s_id, function, line, errno, errstr); 169 sb->s_id, function, line, errno, errstr);
168 } 170 }
171#endif
169 172
170 /* Don't go through full error handling during mount */ 173 /* Don't go through full error handling during mount */
171 save_error_info(fs_info); 174 save_error_info(fs_info);
@@ -173,6 +176,7 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
173 btrfs_handle_error(fs_info); 176 btrfs_handle_error(fs_info);
174} 177}
175 178
179#ifdef CONFIG_PRINTK
176static const char * const logtypes[] = { 180static const char * const logtypes[] = {
177 "emergency", 181 "emergency",
178 "alert", 182 "alert",
@@ -212,27 +216,6 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
212 216
213 va_end(args); 217 va_end(args);
214} 218}
215
216#else
217
218void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
219 unsigned int line, int errno, const char *fmt, ...)
220{
221 struct super_block *sb = fs_info->sb;
222
223 /*
224 * Special case: if the error is EROFS, and we're already
225 * under MS_RDONLY, then it is safe here.
226 */
227 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
228 return;
229
230 /* Don't go through full error handling during mount */
231 if (sb->s_flags & MS_BORN) {
232 save_error_info(fs_info);
233 btrfs_handle_error(fs_info);
234 }
235}
236#endif 219#endif
237 220
238/* 221/*
@@ -320,6 +303,9 @@ enum {
320 Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, 303 Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
321 Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, 304 Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
322 Opt_datasum, Opt_treelog, Opt_noinode_cache, 305 Opt_datasum, Opt_treelog, Opt_noinode_cache,
306#ifdef CONFIG_BTRFS_DEBUG
307 Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
308#endif
323 Opt_err, 309 Opt_err,
324}; 310};
325 311
@@ -372,6 +358,11 @@ static match_table_t tokens = {
372 {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, 358 {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
373 {Opt_fatal_errors, "fatal_errors=%s"}, 359 {Opt_fatal_errors, "fatal_errors=%s"},
374 {Opt_commit_interval, "commit=%d"}, 360 {Opt_commit_interval, "commit=%d"},
361#ifdef CONFIG_BTRFS_DEBUG
362 {Opt_fragment_data, "fragment=data"},
363 {Opt_fragment_metadata, "fragment=metadata"},
364 {Opt_fragment_all, "fragment=all"},
365#endif
375 {Opt_err, NULL}, 366 {Opt_err, NULL},
376}; 367};
377 368
@@ -738,6 +729,22 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
738 info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; 729 info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
739 } 730 }
740 break; 731 break;
732#ifdef CONFIG_BTRFS_DEBUG
733 case Opt_fragment_all:
734 btrfs_info(root->fs_info, "fragmenting all space");
735 btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
736 btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
737 break;
738 case Opt_fragment_metadata:
739 btrfs_info(root->fs_info, "fragmenting metadata");
740 btrfs_set_opt(info->mount_opt,
741 FRAGMENT_METADATA);
742 break;
743 case Opt_fragment_data:
744 btrfs_info(root->fs_info, "fragmenting data");
745 btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
746 break;
747#endif
741 case Opt_err: 748 case Opt_err:
742 btrfs_info(root->fs_info, "unrecognized mount option '%s'", p); 749 btrfs_info(root->fs_info, "unrecognized mount option '%s'", p);
743 ret = -EINVAL; 750 ret = -EINVAL;
@@ -1189,6 +1196,12 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
1189 seq_puts(seq, ",fatal_errors=panic"); 1196 seq_puts(seq, ",fatal_errors=panic");
1190 if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) 1197 if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
1191 seq_printf(seq, ",commit=%d", info->commit_interval); 1198 seq_printf(seq, ",commit=%d", info->commit_interval);
1199#ifdef CONFIG_BTRFS_DEBUG
1200 if (btrfs_test_opt(root, FRAGMENT_DATA))
1201 seq_puts(seq, ",fragment=data");
1202 if (btrfs_test_opt(root, FRAGMENT_METADATA))
1203 seq_puts(seq, ",fragment=metadata");
1204#endif
1192 seq_printf(seq, ",subvolid=%llu", 1205 seq_printf(seq, ",subvolid=%llu",
1193 BTRFS_I(d_inode(dentry))->root->root_key.objectid); 1206 BTRFS_I(d_inode(dentry))->root->root_key.objectid);
1194 seq_puts(seq, ",subvol="); 1207 seq_puts(seq, ",subvol=");
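A note derived from the two hunks above: fragment=all is parsed into the two independent bits FRAGMENT_DATA and FRAGMENT_METADATA, and btrfs_show_options() only ever emits the individual forms, so a debug mount created with fragment=all reports itself as two options:

/*
 * parse: "fragment=all"     -> FRAGMENT_DATA | FRAGMENT_METADATA
 * show:  FRAGMENT_DATA      -> ",fragment=data"
 *        FRAGMENT_METADATA  -> ",fragment=metadata"
 * so /proc/mounts never shows "fragment=all" itself.
 */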
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 603b0cc2b9bb..e0ac85949067 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -437,24 +437,24 @@ static const struct attribute *btrfs_attrs[] = {
437 NULL, 437 NULL,
438}; 438};
439 439
440static void btrfs_release_super_kobj(struct kobject *kobj) 440static void btrfs_release_fsid_kobj(struct kobject *kobj)
441{ 441{
442 struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); 442 struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj);
443 443
444 memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); 444 memset(&fs_devs->fsid_kobj, 0, sizeof(struct kobject));
445 complete(&fs_devs->kobj_unregister); 445 complete(&fs_devs->kobj_unregister);
446} 446}
447 447
448static struct kobj_type btrfs_ktype = { 448static struct kobj_type btrfs_ktype = {
449 .sysfs_ops = &kobj_sysfs_ops, 449 .sysfs_ops = &kobj_sysfs_ops,
450 .release = btrfs_release_super_kobj, 450 .release = btrfs_release_fsid_kobj,
451}; 451};
452 452
453static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) 453static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj)
454{ 454{
455 if (kobj->ktype != &btrfs_ktype) 455 if (kobj->ktype != &btrfs_ktype)
456 return NULL; 456 return NULL;
457 return container_of(kobj, struct btrfs_fs_devices, super_kobj); 457 return container_of(kobj, struct btrfs_fs_devices, fsid_kobj);
458} 458}
459 459
460static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) 460static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj)
@@ -502,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add)
502 attrs[0] = &fa->kobj_attr.attr; 502 attrs[0] = &fa->kobj_attr.attr;
503 if (add) { 503 if (add) {
504 int ret; 504 int ret;
505 ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, 505 ret = sysfs_merge_group(&fs_info->fs_devices->fsid_kobj,
506 &agroup); 506 &agroup);
507 if (ret) 507 if (ret)
508 return ret; 508 return ret;
509 } else 509 } else
510 sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, 510 sysfs_unmerge_group(&fs_info->fs_devices->fsid_kobj,
511 &agroup); 511 &agroup);
512 } 512 }
513 513
@@ -523,9 +523,9 @@ static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
523 fs_devs->device_dir_kobj = NULL; 523 fs_devs->device_dir_kobj = NULL;
524 } 524 }
525 525
526 if (fs_devs->super_kobj.state_initialized) { 526 if (fs_devs->fsid_kobj.state_initialized) {
527 kobject_del(&fs_devs->super_kobj); 527 kobject_del(&fs_devs->fsid_kobj);
528 kobject_put(&fs_devs->super_kobj); 528 kobject_put(&fs_devs->fsid_kobj);
529 wait_for_completion(&fs_devs->kobj_unregister); 529 wait_for_completion(&fs_devs->kobj_unregister);
530 } 530 }
531} 531}
@@ -545,7 +545,7 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs)
545 } 545 }
546} 546}
547 547
548void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) 548void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info)
549{ 549{
550 btrfs_reset_fs_info_ptr(fs_info); 550 btrfs_reset_fs_info_ptr(fs_info);
551 551
@@ -555,9 +555,9 @@ void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info)
555 kobject_put(fs_info->space_info_kobj); 555 kobject_put(fs_info->space_info_kobj);
556 } 556 }
557 addrm_unknown_feature_attrs(fs_info, false); 557 addrm_unknown_feature_attrs(fs_info, false);
558 sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); 558 sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group);
559 sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); 559 sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs);
560 btrfs_kobj_rm_device(fs_info->fs_devices, NULL); 560 btrfs_sysfs_rm_device_link(fs_info->fs_devices, NULL);
561} 561}
562 562
563const char * const btrfs_feature_set_names[3] = { 563const char * const btrfs_feature_set_names[3] = {
@@ -637,7 +637,7 @@ static void init_feature_attrs(void)
637 637
638/* when one_device is NULL, it removes all device links */ 638/* when one_device is NULL, it removes all device links */
639 639
640int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, 640int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
641 struct btrfs_device *one_device) 641 struct btrfs_device *one_device)
642{ 642{
643 struct hd_struct *disk; 643 struct hd_struct *disk;
@@ -675,7 +675,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
675{ 675{
676 if (!fs_devs->device_dir_kobj) 676 if (!fs_devs->device_dir_kobj)
677 fs_devs->device_dir_kobj = kobject_create_and_add("devices", 677 fs_devs->device_dir_kobj = kobject_create_and_add("devices",
678 &fs_devs->super_kobj); 678 &fs_devs->fsid_kobj);
679 679
680 if (!fs_devs->device_dir_kobj) 680 if (!fs_devs->device_dir_kobj)
681 return -ENOMEM; 681 return -ENOMEM;
@@ -683,7 +683,7 @@ int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs)
683 return 0; 683 return 0;
684} 684}
685 685
686int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, 686int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
687 struct btrfs_device *one_device) 687 struct btrfs_device *one_device)
688{ 688{
689 int error = 0; 689 int error = 0;
@@ -730,31 +730,31 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
730 int error; 730 int error;
731 731
732 init_completion(&fs_devs->kobj_unregister); 732 init_completion(&fs_devs->kobj_unregister);
733 fs_devs->super_kobj.kset = btrfs_kset; 733 fs_devs->fsid_kobj.kset = btrfs_kset;
734 error = kobject_init_and_add(&fs_devs->super_kobj, 734 error = kobject_init_and_add(&fs_devs->fsid_kobj,
735 &btrfs_ktype, parent, "%pU", fs_devs->fsid); 735 &btrfs_ktype, parent, "%pU", fs_devs->fsid);
736 return error; 736 return error;
737} 737}
738 738
739int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) 739int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info)
740{ 740{
741 int error; 741 int error;
742 struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; 742 struct btrfs_fs_devices *fs_devs = fs_info->fs_devices;
743 struct kobject *super_kobj = &fs_devs->super_kobj; 743 struct kobject *fsid_kobj = &fs_devs->fsid_kobj;
744 744
745 btrfs_set_fs_info_ptr(fs_info); 745 btrfs_set_fs_info_ptr(fs_info);
746 746
747 error = btrfs_kobj_add_device(fs_devs, NULL); 747 error = btrfs_sysfs_add_device_link(fs_devs, NULL);
748 if (error) 748 if (error)
749 return error; 749 return error;
750 750
751 error = sysfs_create_files(super_kobj, btrfs_attrs); 751 error = sysfs_create_files(fsid_kobj, btrfs_attrs);
752 if (error) { 752 if (error) {
753 btrfs_kobj_rm_device(fs_devs, NULL); 753 btrfs_sysfs_rm_device_link(fs_devs, NULL);
754 return error; 754 return error;
755 } 755 }
756 756
757 error = sysfs_create_group(super_kobj, 757 error = sysfs_create_group(fsid_kobj,
758 &btrfs_feature_attr_group); 758 &btrfs_feature_attr_group);
759 if (error) 759 if (error)
760 goto failure; 760 goto failure;
@@ -764,7 +764,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
764 goto failure; 764 goto failure;
765 765
766 fs_info->space_info_kobj = kobject_create_and_add("allocation", 766 fs_info->space_info_kobj = kobject_create_and_add("allocation",
767 super_kobj); 767 fsid_kobj);
768 if (!fs_info->space_info_kobj) { 768 if (!fs_info->space_info_kobj) {
769 error = -ENOMEM; 769 error = -ENOMEM;
770 goto failure; 770 goto failure;
@@ -776,7 +776,7 @@ int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info)
776 776
777 return 0; 777 return 0;
778failure: 778failure:
779 btrfs_sysfs_remove_one(fs_info); 779 btrfs_sysfs_remove_mounted(fs_info);
780 return error; 780 return error;
781} 781}
782 782
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 6392527bcc15..9c09522125a6 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -82,9 +82,9 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags);
82extern const char * const btrfs_feature_set_names[3]; 82extern const char * const btrfs_feature_set_names[3];
83extern struct kobj_type space_info_ktype; 83extern struct kobj_type space_info_ktype;
84extern struct kobj_type btrfs_raid_ktype; 84extern struct kobj_type btrfs_raid_ktype;
85int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, 85int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
86 struct btrfs_device *one_device); 86 struct btrfs_device *one_device);
87int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, 87int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices,
88 struct btrfs_device *one_device); 88 struct btrfs_device *one_device);
89int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, 89int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
90 struct kobject *parent); 90 struct kobject *parent);
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 2299bfde39ee..c8c3d70c31ff 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -19,6 +19,7 @@
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include "btrfs-tests.h" 20#include "btrfs-tests.h"
21#include "../ctree.h" 21#include "../ctree.h"
22#include "../disk-io.h"
22#include "../free-space-cache.h" 23#include "../free-space-cache.h"
23 24
24#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 25#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
@@ -35,6 +36,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
35 kfree(cache); 36 kfree(cache);
36 return NULL; 37 return NULL;
37 } 38 }
39 cache->fs_info = btrfs_alloc_dummy_fs_info();
40 if (!cache->fs_info) {
41 kfree(cache->free_space_ctl);
42 kfree(cache);
43 return NULL;
44 }
38 45
39 cache->key.objectid = 0; 46 cache->key.objectid = 0;
40 cache->key.offset = 1024 * 1024 * 1024; 47 cache->key.offset = 1024 * 1024 * 1024;
@@ -879,7 +886,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
879int btrfs_test_free_space_cache(void) 886int btrfs_test_free_space_cache(void)
880{ 887{
881 struct btrfs_block_group_cache *cache; 888 struct btrfs_block_group_cache *cache;
882 int ret; 889 struct btrfs_root *root = NULL;
890 int ret = -ENOMEM;
883 891
884 test_msg("Running btrfs free space cache tests\n"); 892 test_msg("Running btrfs free space cache tests\n");
885 893
@@ -889,6 +897,17 @@ int btrfs_test_free_space_cache(void)
889 return 0; 897 return 0;
890 } 898 }
891 899
900 root = btrfs_alloc_dummy_root();
901 if (!root)
902 goto out;
903
904 root->fs_info = btrfs_alloc_dummy_fs_info();
905 if (!root->fs_info)
906 goto out;
907
908 root->fs_info->extent_root = root;
909 cache->fs_info = root->fs_info;
910
892 ret = test_extents(cache); 911 ret = test_extents(cache);
893 if (ret) 912 if (ret)
894 goto out; 913 goto out;
@@ -904,6 +923,7 @@ out:
904 __btrfs_remove_free_space_cache(cache->free_space_ctl); 923 __btrfs_remove_free_space_cache(cache->free_space_ctl);
905 kfree(cache->free_space_ctl); 924 kfree(cache->free_space_ctl);
906 kfree(cache); 925 kfree(cache);
926 btrfs_free_dummy_root(root);
907 test_msg("Free space cache tests finished\n"); 927 test_msg("Free space cache tests finished\n");
908 return ret; 928 return ret;
909} 929}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a5b06442f0bf..418c6a2ad7d8 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -82,6 +82,12 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
82static void clear_btree_io_tree(struct extent_io_tree *tree) 82static void clear_btree_io_tree(struct extent_io_tree *tree)
83{ 83{
84 spin_lock(&tree->lock); 84 spin_lock(&tree->lock);
85 /*
86 * Do a single barrier for the waitqueue_active check here; the state
87 * of the waitqueue should not change once clear_btree_io_tree is
88 * called.
89 */
90 smp_mb();
85 while (!RB_EMPTY_ROOT(&tree->state)) { 91 while (!RB_EMPTY_ROOT(&tree->state)) {
86 struct rb_node *node; 92 struct rb_node *node;
87 struct extent_state *state; 93 struct extent_state *state;
@@ -226,25 +232,22 @@ loop:
226 extwriter_counter_init(cur_trans, type); 232 extwriter_counter_init(cur_trans, type);
227 init_waitqueue_head(&cur_trans->writer_wait); 233 init_waitqueue_head(&cur_trans->writer_wait);
228 init_waitqueue_head(&cur_trans->commit_wait); 234 init_waitqueue_head(&cur_trans->commit_wait);
235 init_waitqueue_head(&cur_trans->pending_wait);
229 cur_trans->state = TRANS_STATE_RUNNING; 236 cur_trans->state = TRANS_STATE_RUNNING;
230 /* 237 /*
231 * One for this trans handle, one so it will live on until we 238 * One for this trans handle, one so it will live on until we
232 * commit the transaction. 239 * commit the transaction.
233 */ 240 */
234 atomic_set(&cur_trans->use_count, 2); 241 atomic_set(&cur_trans->use_count, 2);
235 cur_trans->have_free_bgs = 0; 242 atomic_set(&cur_trans->pending_ordered, 0);
243 cur_trans->flags = 0;
236 cur_trans->start_time = get_seconds(); 244 cur_trans->start_time = get_seconds();
237 cur_trans->dirty_bg_run = 0; 245
246 memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
238 247
239 cur_trans->delayed_refs.href_root = RB_ROOT; 248 cur_trans->delayed_refs.href_root = RB_ROOT;
240 cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; 249 cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
241 atomic_set(&cur_trans->delayed_refs.num_entries, 0); 250 atomic_set(&cur_trans->delayed_refs.num_entries, 0);
242 cur_trans->delayed_refs.num_heads_ready = 0;
243 cur_trans->delayed_refs.pending_csums = 0;
244 cur_trans->delayed_refs.num_heads = 0;
245 cur_trans->delayed_refs.flushing = 0;
246 cur_trans->delayed_refs.run_delayed_start = 0;
247 cur_trans->delayed_refs.qgroup_to_skip = 0;
248 251
249 /* 252 /*
250 * although the tree mod log is per file system and not per transaction, 253 * although the tree mod log is per file system and not per transaction,
@@ -264,7 +267,6 @@ loop:
264 INIT_LIST_HEAD(&cur_trans->pending_snapshots); 267 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
265 INIT_LIST_HEAD(&cur_trans->pending_chunks); 268 INIT_LIST_HEAD(&cur_trans->pending_chunks);
266 INIT_LIST_HEAD(&cur_trans->switch_commits); 269 INIT_LIST_HEAD(&cur_trans->switch_commits);
267 INIT_LIST_HEAD(&cur_trans->pending_ordered);
268 INIT_LIST_HEAD(&cur_trans->dirty_bgs); 270 INIT_LIST_HEAD(&cur_trans->dirty_bgs);
269 INIT_LIST_HEAD(&cur_trans->io_bgs); 271 INIT_LIST_HEAD(&cur_trans->io_bgs);
270 INIT_LIST_HEAD(&cur_trans->dropped_roots); 272 INIT_LIST_HEAD(&cur_trans->dropped_roots);
@@ -447,8 +449,8 @@ static inline bool need_reserve_reloc_root(struct btrfs_root *root)
447} 449}
448 450
449static struct btrfs_trans_handle * 451static struct btrfs_trans_handle *
450start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, 452start_transaction(struct btrfs_root *root, unsigned int num_items,
451 enum btrfs_reserve_flush_enum flush) 453 unsigned int type, enum btrfs_reserve_flush_enum flush)
452{ 454{
453 struct btrfs_trans_handle *h; 455 struct btrfs_trans_handle *h;
454 struct btrfs_transaction *cur_trans; 456 struct btrfs_transaction *cur_trans;
@@ -478,13 +480,10 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
478 * the appropriate flushing if need be. 480 * the appropriate flushing if need be.
479 */ 481 */
480 if (num_items > 0 && root != root->fs_info->chunk_root) { 482 if (num_items > 0 && root != root->fs_info->chunk_root) {
481 if (root->fs_info->quota_enabled && 483 qgroup_reserved = num_items * root->nodesize;
482 is_fstree(root->root_key.objectid)) { 484 ret = btrfs_qgroup_reserve_meta(root, qgroup_reserved);
483 qgroup_reserved = num_items * root->nodesize; 485 if (ret)
484 ret = btrfs_qgroup_reserve(root, qgroup_reserved); 486 return ERR_PTR(ret);
485 if (ret)
486 return ERR_PTR(ret);
487 }
488 487
489 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 488 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
490 /* 489 /*
@@ -502,7 +501,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
502 goto reserve_fail; 501 goto reserve_fail;
503 } 502 }
504again: 503again:
505 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 504 h = kmem_cache_zalloc(btrfs_trans_handle_cachep, GFP_NOFS);
506 if (!h) { 505 if (!h) {
507 ret = -ENOMEM; 506 ret = -ENOMEM;
508 goto alloc_fail; 507 goto alloc_fail;
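
The switch from kmem_cache_alloc() to kmem_cache_zalloc() above is what lets a later hunk drop a dozen explicit zero-assignments from the handle setup. A minimal userspace sketch of the idea, with calloc() standing in for the slab zalloc (the struct and field names here are illustrative, not kernel API):

#include <stdlib.h>

struct handle {
	unsigned long use_count;
	int adding_csums;
	void *block_rsv;
	/* ... many more fields that must all start out zero ... */
};

static struct handle *handle_alloc(void)
{
	/* calloc stands in for kmem_cache_zalloc: every field starts
	 * as 0/NULL, so adding a field later cannot introduce an
	 * uninitialized-use bug the way a missed assignment could */
	struct handle *h = calloc(1, sizeof(*h));

	if (!h)
		return NULL;
	h->use_count = 1;	/* only non-zero defaults stay explicit */
	return h;
}
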
@@ -543,26 +542,13 @@ again:
543 542
544 h->transid = cur_trans->transid; 543 h->transid = cur_trans->transid;
545 h->transaction = cur_trans; 544 h->transaction = cur_trans;
546 h->blocks_used = 0;
547 h->bytes_reserved = 0;
548 h->chunk_bytes_reserved = 0;
549 h->root = root; 545 h->root = root;
550 h->delayed_ref_updates = 0;
551 h->use_count = 1; 546 h->use_count = 1;
552 h->adding_csums = 0; 547
553 h->block_rsv = NULL;
554 h->orig_rsv = NULL;
555 h->aborted = 0;
556 h->qgroup_reserved = 0;
557 h->delayed_ref_elem.seq = 0;
558 h->type = type; 548 h->type = type;
559 h->allocating_chunk = false;
560 h->can_flush_pending_bgs = true; 549 h->can_flush_pending_bgs = true;
561 h->reloc_reserved = false;
562 h->sync = false;
563 INIT_LIST_HEAD(&h->qgroup_ref_list); 550 INIT_LIST_HEAD(&h->qgroup_ref_list);
564 INIT_LIST_HEAD(&h->new_bgs); 551 INIT_LIST_HEAD(&h->new_bgs);
565 INIT_LIST_HEAD(&h->ordered);
566 552
567 smp_mb(); 553 smp_mb();
568 if (cur_trans->state >= TRANS_STATE_BLOCKED && 554 if (cur_trans->state >= TRANS_STATE_BLOCKED &&
@@ -579,7 +565,6 @@ again:
579 h->bytes_reserved = num_bytes; 565 h->bytes_reserved = num_bytes;
580 h->reloc_reserved = reloc_reserved; 566 h->reloc_reserved = reloc_reserved;
581 } 567 }
582 h->qgroup_reserved = qgroup_reserved;
583 568
584got_it: 569got_it:
585 btrfs_record_root_in_trans(h, root); 570 btrfs_record_root_in_trans(h, root);
@@ -597,20 +582,20 @@ alloc_fail:
597 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, 582 btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
598 num_bytes); 583 num_bytes);
599reserve_fail: 584reserve_fail:
600 if (qgroup_reserved) 585 btrfs_qgroup_free_meta(root, qgroup_reserved);
601 btrfs_qgroup_free(root, qgroup_reserved);
602 return ERR_PTR(ret); 586 return ERR_PTR(ret);
603} 587}
604 588
605struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 589struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
606 int num_items) 590 unsigned int num_items)
607{ 591{
608 return start_transaction(root, num_items, TRANS_START, 592 return start_transaction(root, num_items, TRANS_START,
609 BTRFS_RESERVE_FLUSH_ALL); 593 BTRFS_RESERVE_FLUSH_ALL);
610} 594}
611 595
612struct btrfs_trans_handle *btrfs_start_transaction_lflush( 596struct btrfs_trans_handle *btrfs_start_transaction_lflush(
613 struct btrfs_root *root, int num_items) 597 struct btrfs_root *root,
598 unsigned int num_items)
614{ 599{
615 return start_transaction(root, num_items, TRANS_START, 600 return start_transaction(root, num_items, TRANS_START,
616 BTRFS_RESERVE_FLUSH_LIMIT); 601 BTRFS_RESERVE_FLUSH_LIMIT);
@@ -794,12 +779,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
794 if (!list_empty(&trans->new_bgs)) 779 if (!list_empty(&trans->new_bgs))
795 btrfs_create_pending_block_groups(trans, root); 780 btrfs_create_pending_block_groups(trans, root);
796 781
797 if (!list_empty(&trans->ordered)) {
798 spin_lock(&info->trans_lock);
799 list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
800 spin_unlock(&info->trans_lock);
801 }
802
803 trans->delayed_ref_updates = 0; 782 trans->delayed_ref_updates = 0;
804 if (!trans->sync) { 783 if (!trans->sync) {
805 must_run_delayed_refs = 784 must_run_delayed_refs =
@@ -815,15 +794,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
815 must_run_delayed_refs = 2; 794 must_run_delayed_refs = 2;
816 } 795 }
817 796
818 if (trans->qgroup_reserved) {
819 /*
820 * the same root has to be passed here between start_transaction
821 * and end_transaction. Subvolume quota depends on this.
822 */
823 btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
824 trans->qgroup_reserved = 0;
825 }
826
827 btrfs_trans_release_metadata(trans, root); 797 btrfs_trans_release_metadata(trans, root);
828 trans->block_rsv = NULL; 798 trans->block_rsv = NULL;
829 799
@@ -856,6 +826,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
856 atomic_dec(&cur_trans->num_writers); 826 atomic_dec(&cur_trans->num_writers);
857 extwriter_counter_dec(cur_trans, trans->type); 827 extwriter_counter_dec(cur_trans, trans->type);
858 828
829 /*
830 * Make sure counter is updated before we wake up waiters.
831 */
859 smp_mb(); 832 smp_mb();
860 if (waitqueue_active(&cur_trans->writer_wait)) 833 if (waitqueue_active(&cur_trans->writer_wait))
861 wake_up(&cur_trans->writer_wait); 834 wake_up(&cur_trans->writer_wait);
@@ -1238,6 +1211,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
1238 spin_lock(&fs_info->fs_roots_radix_lock); 1211 spin_lock(&fs_info->fs_roots_radix_lock);
1239 if (err) 1212 if (err)
1240 break; 1213 break;
1214 btrfs_qgroup_free_meta_all(root);
1241 } 1215 }
1242 } 1216 }
1243 spin_unlock(&fs_info->fs_roots_radix_lock); 1217 spin_unlock(&fs_info->fs_roots_radix_lock);
@@ -1795,25 +1769,10 @@ static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1795} 1769}
1796 1770
1797static inline void 1771static inline void
1798btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans, 1772btrfs_wait_pending_ordered(struct btrfs_transaction *cur_trans)
1799 struct btrfs_fs_info *fs_info)
1800{ 1773{
1801 struct btrfs_ordered_extent *ordered; 1774 wait_event(cur_trans->pending_wait,
1802 1775 atomic_read(&cur_trans->pending_ordered) == 0);
1803 spin_lock(&fs_info->trans_lock);
1804 while (!list_empty(&cur_trans->pending_ordered)) {
1805 ordered = list_first_entry(&cur_trans->pending_ordered,
1806 struct btrfs_ordered_extent,
1807 trans_list);
1808 list_del_init(&ordered->trans_list);
1809 spin_unlock(&fs_info->trans_lock);
1810
1811 wait_event(ordered->wait, test_bit(BTRFS_ORDERED_COMPLETE,
1812 &ordered->flags));
1813 btrfs_put_ordered_extent(ordered);
1814 spin_lock(&fs_info->trans_lock);
1815 }
1816 spin_unlock(&fs_info->trans_lock);
1817} 1776}
1818 1777
1819int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1778int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
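
The rework above replaces the per-transaction list of pending ordered extents with an atomic counter (pending_ordered) and a dedicated waitqueue (pending_wait): writers drop the count as ordered extents complete, and the committer sleeps until it reaches zero. A rough userspace sketch of that counter-and-waitqueue shape, assuming POSIX threads and C11 atomics (names are illustrative, not kernel API):

#include <pthread.h>
#include <stdatomic.h>

struct pending_tracker {
	atomic_int pending;		/* like cur_trans->pending_ordered */
	pthread_mutex_t lock;
	pthread_cond_t wait;		/* like cur_trans->pending_wait */
};

static struct pending_tracker tracker = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.wait = PTHREAD_COND_INITIALIZER,
};

static void pending_add(struct pending_tracker *t)
{
	atomic_fetch_add(&t->pending, 1);
}

static void pending_done(struct pending_tracker *t)
{
	/* seq_cst RMW: the decrement is visible before any wakeup */
	if (atomic_fetch_sub(&t->pending, 1) == 1) {
		pthread_mutex_lock(&t->lock);
		pthread_cond_broadcast(&t->wait);
		pthread_mutex_unlock(&t->lock);
	}
}

static void pending_wait_all(struct pending_tracker *t)
{
	pthread_mutex_lock(&t->lock);
	while (atomic_load(&t->pending) != 0)
		pthread_cond_wait(&t->wait, &t->lock);
	pthread_mutex_unlock(&t->lock);
}

Because the waiter rechecks the counter under the mutex, a producer that drops the count to zero can never slip its broadcast in unobserved, which is the same guarantee the kernel's wait_event() gives the commit path.
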
@@ -1842,10 +1801,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1842 1801
1843 btrfs_trans_release_metadata(trans, root); 1802 btrfs_trans_release_metadata(trans, root);
1844 trans->block_rsv = NULL; 1803 trans->block_rsv = NULL;
1845 if (trans->qgroup_reserved) {
1846 btrfs_qgroup_free(root, trans->qgroup_reserved);
1847 trans->qgroup_reserved = 0;
1848 }
1849 1804
1850 cur_trans = trans->transaction; 1805 cur_trans = trans->transaction;
1851 1806
@@ -1865,7 +1820,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1865 return ret; 1820 return ret;
1866 } 1821 }
1867 1822
1868 if (!cur_trans->dirty_bg_run) { 1823 if (!test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &cur_trans->flags)) {
1869 int run_it = 0; 1824 int run_it = 0;
1870 1825
1871 /* this mutex is also taken before trying to set 1826 /* this mutex is also taken before trying to set
@@ -1874,18 +1829,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1874 * after extents from that block group have been 1829 * after extents from that block group have been
1875 * allocated for cache files. btrfs_set_block_group_ro 1830 * allocated for cache files. btrfs_set_block_group_ro
1876 * will wait for the transaction to commit if it 1831 * will wait for the transaction to commit if it
1877 * finds dirty_bg_run = 1 1832 * finds BTRFS_TRANS_DIRTY_BG_RUN set.
1878 * 1833 *
1879 * The dirty_bg_run flag is also used to make sure only 1834 * The BTRFS_TRANS_DIRTY_BG_RUN flag is also used to make sure
1880 * one process starts all the block group IO. It wouldn't 1835 * only one process starts all the block group IO. It wouldn't
1881 * hurt to have more than one go through, but there's no 1836 * hurt to have more than one go through, but there's no
1882 * real advantage to it either. 1837 * real advantage to it either.
1883 */ 1838 */
1884 mutex_lock(&root->fs_info->ro_block_group_mutex); 1839 mutex_lock(&root->fs_info->ro_block_group_mutex);
1885 if (!cur_trans->dirty_bg_run) { 1840 if (!test_and_set_bit(BTRFS_TRANS_DIRTY_BG_RUN,
1841 &cur_trans->flags))
1886 run_it = 1; 1842 run_it = 1;
1887 cur_trans->dirty_bg_run = 1;
1888 }
1889 mutex_unlock(&root->fs_info->ro_block_group_mutex); 1843 mutex_unlock(&root->fs_info->ro_block_group_mutex);
1890 1844
1891 if (run_it) 1845 if (run_it)
@@ -1897,7 +1851,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1897 } 1851 }
1898 1852
1899 spin_lock(&root->fs_info->trans_lock); 1853 spin_lock(&root->fs_info->trans_lock);
1900 list_splice_init(&trans->ordered, &cur_trans->pending_ordered);
1901 if (cur_trans->state >= TRANS_STATE_COMMIT_START) { 1854 if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
1902 spin_unlock(&root->fs_info->trans_lock); 1855 spin_unlock(&root->fs_info->trans_lock);
1903 atomic_inc(&cur_trans->use_count); 1856 atomic_inc(&cur_trans->use_count);
@@ -1956,7 +1909,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1956 1909
1957 btrfs_wait_delalloc_flush(root->fs_info); 1910 btrfs_wait_delalloc_flush(root->fs_info);
1958 1911
1959 btrfs_wait_pending_ordered(cur_trans, root->fs_info); 1912 btrfs_wait_pending_ordered(cur_trans);
1960 1913
1961 btrfs_scrub_pause(root); 1914 btrfs_scrub_pause(root);
1962 /* 1915 /*
@@ -2136,7 +2089,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
2136 2089
2137 ret = btrfs_write_and_wait_transaction(trans, root); 2090 ret = btrfs_write_and_wait_transaction(trans, root);
2138 if (ret) { 2091 if (ret) {
2139 btrfs_error(root->fs_info, ret, 2092 btrfs_std_error(root->fs_info, ret,
2140 "Error while writing out transaction"); 2093 "Error while writing out transaction");
2141 mutex_unlock(&root->fs_info->tree_log_mutex); 2094 mutex_unlock(&root->fs_info->tree_log_mutex);
2142 goto scrub_continue; 2095 goto scrub_continue;
@@ -2156,7 +2109,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
2156 2109
2157 btrfs_finish_extent_commit(trans, root); 2110 btrfs_finish_extent_commit(trans, root);
2158 2111
2159 if (cur_trans->have_free_bgs) 2112 if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags))
2160 btrfs_clear_space_info_full(root->fs_info); 2113 btrfs_clear_space_info_full(root->fs_info);
2161 2114
2162 root->fs_info->last_trans_committed = cur_trans->transid; 2115 root->fs_info->last_trans_committed = cur_trans->transid;
@@ -2198,10 +2151,6 @@ cleanup_transaction:
2198 btrfs_trans_release_metadata(trans, root); 2151 btrfs_trans_release_metadata(trans, root);
2199 btrfs_trans_release_chunk_metadata(trans); 2152 btrfs_trans_release_chunk_metadata(trans);
2200 trans->block_rsv = NULL; 2153 trans->block_rsv = NULL;
2201 if (trans->qgroup_reserved) {
2202 btrfs_qgroup_free(root, trans->qgroup_reserved);
2203 trans->qgroup_reserved = 0;
2204 }
2205 btrfs_warn(root->fs_info, "Skipping commit of aborted transaction."); 2154 btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
2206 if (current->journal_info == trans) 2155 if (current->journal_info == trans)
2207 current->journal_info = NULL; 2156 current->journal_info = NULL;
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index a994bb097ee5..b05b2f64d913 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -32,6 +32,10 @@ enum btrfs_trans_state {
32 TRANS_STATE_MAX = 6, 32 TRANS_STATE_MAX = 6,
33}; 33};
34 34
35#define BTRFS_TRANS_HAVE_FREE_BGS 0
36#define BTRFS_TRANS_DIRTY_BG_RUN 1
37#define BTRFS_TRANS_CACHE_ENOSPC 2
38
35struct btrfs_transaction { 39struct btrfs_transaction {
36 u64 transid; 40 u64 transid;
37 /* 41 /*
@@ -46,11 +50,9 @@ struct btrfs_transaction {
46 */ 50 */
47 atomic_t num_writers; 51 atomic_t num_writers;
48 atomic_t use_count; 52 atomic_t use_count;
53 atomic_t pending_ordered;
49 54
50 /* 55 unsigned long flags;
51 * true if there is free bgs operations in this transaction
52 */
53 int have_free_bgs;
54 56
55 /* Be protected by fs_info->trans_lock when we want to change it. */ 57 /* Be protected by fs_info->trans_lock when we want to change it. */
56 enum btrfs_trans_state state; 58 enum btrfs_trans_state state;
@@ -59,9 +61,9 @@ struct btrfs_transaction {
59 unsigned long start_time; 61 unsigned long start_time;
60 wait_queue_head_t writer_wait; 62 wait_queue_head_t writer_wait;
61 wait_queue_head_t commit_wait; 63 wait_queue_head_t commit_wait;
64 wait_queue_head_t pending_wait;
62 struct list_head pending_snapshots; 65 struct list_head pending_snapshots;
63 struct list_head pending_chunks; 66 struct list_head pending_chunks;
64 struct list_head pending_ordered;
65 struct list_head switch_commits; 67 struct list_head switch_commits;
66 struct list_head dirty_bgs; 68 struct list_head dirty_bgs;
67 struct list_head io_bgs; 69 struct list_head io_bgs;
@@ -80,7 +82,6 @@ struct btrfs_transaction {
80 spinlock_t dropped_roots_lock; 82 spinlock_t dropped_roots_lock;
81 struct btrfs_delayed_ref_root delayed_refs; 83 struct btrfs_delayed_ref_root delayed_refs;
82 int aborted; 84 int aborted;
83 int dirty_bg_run;
84}; 85};
85 86
86#define __TRANS_FREEZABLE (1U << 0) 87#define __TRANS_FREEZABLE (1U << 0)
@@ -107,7 +108,6 @@ struct btrfs_trans_handle {
107 u64 transid; 108 u64 transid;
108 u64 bytes_reserved; 109 u64 bytes_reserved;
109 u64 chunk_bytes_reserved; 110 u64 chunk_bytes_reserved;
110 u64 qgroup_reserved;
111 unsigned long use_count; 111 unsigned long use_count;
112 unsigned long blocks_reserved; 112 unsigned long blocks_reserved;
113 unsigned long blocks_used; 113 unsigned long blocks_used;
@@ -129,7 +129,6 @@ struct btrfs_trans_handle {
129 */ 129 */
130 struct btrfs_root *root; 130 struct btrfs_root *root;
131 struct seq_list delayed_ref_elem; 131 struct seq_list delayed_ref_elem;
132 struct list_head ordered;
133 struct list_head qgroup_ref_list; 132 struct list_head qgroup_ref_list;
134 struct list_head new_bgs; 133 struct list_head new_bgs;
135}; 134};
@@ -185,9 +184,10 @@ static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans)
185int btrfs_end_transaction(struct btrfs_trans_handle *trans, 184int btrfs_end_transaction(struct btrfs_trans_handle *trans,
186 struct btrfs_root *root); 185 struct btrfs_root *root);
187struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 186struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
188 int num_items); 187 unsigned int num_items);
189struct btrfs_trans_handle *btrfs_start_transaction_lflush( 188struct btrfs_trans_handle *btrfs_start_transaction_lflush(
190 struct btrfs_root *root, int num_items); 189 struct btrfs_root *root,
190 unsigned int num_items);
191struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); 191struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
192struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); 192struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
193struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); 193struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
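
The header change above collapses the have_free_bgs and dirty_bg_run ints into a single flags word addressed by bit number, which is what lets btrfs_commit_transaction() use test_and_set_bit() to elect exactly one process to start the block-group IO. A compact userspace sketch of the same pattern, using C11 atomics in place of the kernel's set_bit()/test_and_set_bit():

#include <stdatomic.h>
#include <stdbool.h>

#define TRANS_HAVE_FREE_BGS	0
#define TRANS_DIRTY_BG_RUN	1
#define TRANS_CACHE_ENOSPC	2

static atomic_ulong trans_flags;

static void flag_set(int bit)
{
	atomic_fetch_or(&trans_flags, 1UL << bit);
}

static bool flag_test(int bit)
{
	return atomic_load(&trans_flags) & (1UL << bit);
}

/*
 * Returns the previous state of the bit, so exactly one caller in a
 * race sees "false" -- the property the commit path relies on to pick
 * a single starter for the block group IO.
 */
static bool flag_test_and_set(int bit)
{
	return atomic_fetch_or(&trans_flags, 1UL << bit) & (1UL << bit);
}
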
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1bbaace73383..323e12cc9d2f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -229,7 +229,9 @@ int btrfs_pin_log_trans(struct btrfs_root *root)
229void btrfs_end_log_trans(struct btrfs_root *root) 229void btrfs_end_log_trans(struct btrfs_root *root)
230{ 230{
231 if (atomic_dec_and_test(&root->log_writers)) { 231 if (atomic_dec_and_test(&root->log_writers)) {
232 smp_mb(); 232 /*
233 * Implicit memory barrier after atomic_dec_and_test
234 */
233 if (waitqueue_active(&root->log_writer_wait)) 235 if (waitqueue_active(&root->log_writer_wait))
234 wake_up(&root->log_writer_wait); 236 wake_up(&root->log_writer_wait);
235 } 237 }
@@ -691,7 +693,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
691 ret = btrfs_inc_extent_ref(trans, root, 693 ret = btrfs_inc_extent_ref(trans, root,
692 ins.objectid, ins.offset, 694 ins.objectid, ins.offset,
693 0, root->root_key.objectid, 695 0, root->root_key.objectid,
694 key->objectid, offset, 0); 696 key->objectid, offset);
695 if (ret) 697 if (ret)
696 goto out; 698 goto out;
697 } else { 699 } else {
@@ -2820,7 +2822,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2820 2822
2821 mutex_lock(&log_root_tree->log_mutex); 2823 mutex_lock(&log_root_tree->log_mutex);
2822 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2824 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
2823 smp_mb(); 2825 /*
2826 * Implicit memory barrier after atomic_dec_and_test
2827 */
2824 if (waitqueue_active(&log_root_tree->log_writer_wait)) 2828 if (waitqueue_active(&log_root_tree->log_writer_wait))
2825 wake_up(&log_root_tree->log_writer_wait); 2829 wake_up(&log_root_tree->log_writer_wait);
2826 } 2830 }
@@ -2950,6 +2954,9 @@ out_wake_log_root:
2950 atomic_set(&log_root_tree->log_commit[index2], 0); 2954 atomic_set(&log_root_tree->log_commit[index2], 0);
2951 mutex_unlock(&log_root_tree->log_mutex); 2955 mutex_unlock(&log_root_tree->log_mutex);
2952 2956
2957 /*
2958 * The barrier before waitqueue_active is implied by mutex_unlock
2959 */
2953 if (waitqueue_active(&log_root_tree->log_commit_wait[index2])) 2960 if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
2954 wake_up(&log_root_tree->log_commit_wait[index2]); 2961 wake_up(&log_root_tree->log_commit_wait[index2]);
2955out: 2962out:
@@ -2961,6 +2968,9 @@ out:
2961 atomic_set(&root->log_commit[index1], 0); 2968 atomic_set(&root->log_commit[index1], 0);
2962 mutex_unlock(&root->log_mutex); 2969 mutex_unlock(&root->log_mutex);
2963 2970
2971 /*
2972 * The barrier before waitqueue_active is implied by mutex_unlock
2973 */
2964 if (waitqueue_active(&root->log_commit_wait[index1])) 2974 if (waitqueue_active(&root->log_commit_wait[index1]))
2965 wake_up(&root->log_commit_wait[index1]); 2975 wake_up(&root->log_commit_wait[index1]);
2966 return ret; 2976 return ret;
@@ -5314,7 +5324,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
5314 5324
5315 ret = walk_log_tree(trans, log_root_tree, &wc); 5325 ret = walk_log_tree(trans, log_root_tree, &wc);
5316 if (ret) { 5326 if (ret) {
5317 btrfs_error(fs_info, ret, "Failed to pin buffers while " 5327 btrfs_std_error(fs_info, ret, "Failed to pin buffers while "
5318 "recovering log root tree."); 5328 "recovering log root tree.");
5319 goto error; 5329 goto error;
5320 } 5330 }
@@ -5328,7 +5338,7 @@ again:
5328 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5338 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
5329 5339
5330 if (ret < 0) { 5340 if (ret < 0) {
5331 btrfs_error(fs_info, ret, 5341 btrfs_std_error(fs_info, ret,
5332 "Couldn't find tree log root."); 5342 "Couldn't find tree log root.");
5333 goto error; 5343 goto error;
5334 } 5344 }
@@ -5346,7 +5356,7 @@ again:
5346 log = btrfs_read_fs_root(log_root_tree, &found_key); 5356 log = btrfs_read_fs_root(log_root_tree, &found_key);
5347 if (IS_ERR(log)) { 5357 if (IS_ERR(log)) {
5348 ret = PTR_ERR(log); 5358 ret = PTR_ERR(log);
5349 btrfs_error(fs_info, ret, 5359 btrfs_std_error(fs_info, ret,
5350 "Couldn't read tree log root."); 5360 "Couldn't read tree log root.");
5351 goto error; 5361 goto error;
5352 } 5362 }
@@ -5361,7 +5371,7 @@ again:
5361 free_extent_buffer(log->node); 5371 free_extent_buffer(log->node);
5362 free_extent_buffer(log->commit_root); 5372 free_extent_buffer(log->commit_root);
5363 kfree(log); 5373 kfree(log);
5364 btrfs_error(fs_info, ret, "Couldn't read target root " 5374 btrfs_std_error(fs_info, ret, "Couldn't read target root "
5365 "for tree log recovery."); 5375 "for tree log recovery.");
5366 goto error; 5376 goto error;
5367 } 5377 }
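
Several hunks in this file only swap an explicit smp_mb() for a comment: atomic_dec_and_test() and mutex_unlock() already order the state change before the lockless waitqueue_active() peek, so the extra barrier was redundant. A userspace sketch of why that ordering matters, assuming seq_cst C11 atomics (with a relaxed decrement, the waker's waiter-present check could be satisfied from a stale read and a wakeup lost):

#include <pthread.h>
#include <stdatomic.h>

static atomic_int log_writers = 1;
static atomic_int waiter_present;	/* stand-in for waitqueue_active() */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;

/* waker: the seq_cst RMW plays the role of atomic_dec_and_test();
 * the decrement is globally ordered before the waiter_present load */
static void end_log_trans(void)
{
	if (atomic_fetch_sub(&log_writers, 1) == 1) {
		if (atomic_load(&waiter_present)) {
			pthread_mutex_lock(&lock);
			pthread_cond_broadcast(&cond);
			pthread_mutex_unlock(&lock);
		}
	}
}

/* waiter: announce yourself before the final condition test, the way
 * prepare_to_wait() queues the task before the last check */
static void wait_for_log_writers(void)
{
	atomic_store(&waiter_present, 1);
	pthread_mutex_lock(&lock);
	while (atomic_load(&log_writers) != 0)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	atomic_store(&waiter_present, 0);
}
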
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 6fc735869c18..17ed76d18eb6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -42,6 +42,82 @@
42#include "dev-replace.h" 42#include "dev-replace.h"
43#include "sysfs.h" 43#include "sysfs.h"
44 44
45const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
46 [BTRFS_RAID_RAID10] = {
47 .sub_stripes = 2,
48 .dev_stripes = 1,
49 .devs_max = 0, /* 0 == as many as possible */
50 .devs_min = 4,
51 .tolerated_failures = 1,
52 .devs_increment = 2,
53 .ncopies = 2,
54 },
55 [BTRFS_RAID_RAID1] = {
56 .sub_stripes = 1,
57 .dev_stripes = 1,
58 .devs_max = 2,
59 .devs_min = 2,
60 .tolerated_failures = 1,
61 .devs_increment = 2,
62 .ncopies = 2,
63 },
64 [BTRFS_RAID_DUP] = {
65 .sub_stripes = 1,
66 .dev_stripes = 2,
67 .devs_max = 1,
68 .devs_min = 1,
69 .tolerated_failures = 0,
70 .devs_increment = 1,
71 .ncopies = 2,
72 },
73 [BTRFS_RAID_RAID0] = {
74 .sub_stripes = 1,
75 .dev_stripes = 1,
76 .devs_max = 0,
77 .devs_min = 2,
78 .tolerated_failures = 0,
79 .devs_increment = 1,
80 .ncopies = 1,
81 },
82 [BTRFS_RAID_SINGLE] = {
83 .sub_stripes = 1,
84 .dev_stripes = 1,
85 .devs_max = 1,
86 .devs_min = 1,
87 .tolerated_failures = 0,
88 .devs_increment = 1,
89 .ncopies = 1,
90 },
91 [BTRFS_RAID_RAID5] = {
92 .sub_stripes = 1,
93 .dev_stripes = 1,
94 .devs_max = 0,
95 .devs_min = 2,
96 .tolerated_failures = 1,
97 .devs_increment = 1,
98 .ncopies = 2,
99 },
100 [BTRFS_RAID_RAID6] = {
101 .sub_stripes = 1,
102 .dev_stripes = 1,
103 .devs_max = 0,
104 .devs_min = 3,
105 .tolerated_failures = 2,
106 .devs_increment = 1,
107 .ncopies = 3,
108 },
109};
110
111const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
112 [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
113 [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1,
114 [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP,
115 [BTRFS_RAID_RAID0] = BTRFS_BLOCK_GROUP_RAID0,
116 [BTRFS_RAID_SINGLE] = 0,
117 [BTRFS_RAID_RAID5] = BTRFS_BLOCK_GROUP_RAID5,
118 [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
119};
120
45static int init_first_rw_device(struct btrfs_trans_handle *trans, 121static int init_first_rw_device(struct btrfs_trans_handle *trans,
46 struct btrfs_root *root, 122 struct btrfs_root *root,
47 struct btrfs_device *device); 123 struct btrfs_device *device);
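
Moving btrfs_raid_array to the top of the file and exporting it (see the volumes.h hunk further down) turns per-profile switch statements into table lookups, and the new tolerated_failures field records how many device losses each profile survives. A trimmed standalone sketch of the lookup style, copying two entries from the table above (the reduced enum and struct are illustrative):

#include <stdio.h>

enum { RAID_RAID1, RAID_RAID6, NR_RAID_TYPES };

struct raid_attr {
	int devs_min;
	int tolerated_failures;
	int ncopies;
};

static const struct raid_attr raid_array[NR_RAID_TYPES] = {
	[RAID_RAID1] = { .devs_min = 2, .tolerated_failures = 1, .ncopies = 2 },
	[RAID_RAID6] = { .devs_min = 3, .tolerated_failures = 2, .ncopies = 3 },
};

int main(void)
{
	/* one table lookup replaces a per-profile switch statement */
	printf("raid6 survives %d device failures\n",
	       raid_array[RAID_RAID6].tolerated_failures);
	return 0;
}
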
@@ -198,7 +274,6 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
198 274
199 if (IS_ERR(*bdev)) { 275 if (IS_ERR(*bdev)) {
200 ret = PTR_ERR(*bdev); 276 ret = PTR_ERR(*bdev);
201 printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
202 goto error; 277 goto error;
203 } 278 }
204 279
@@ -211,8 +286,8 @@ btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
211 } 286 }
212 invalidate_bdev(*bdev); 287 invalidate_bdev(*bdev);
213 *bh = btrfs_read_dev_super(*bdev); 288 *bh = btrfs_read_dev_super(*bdev);
214 if (!*bh) { 289 if (IS_ERR(*bh)) {
215 ret = -EINVAL; 290 ret = PTR_ERR(*bh);
216 blkdev_put(*bdev, flags); 291 blkdev_put(*bdev, flags);
217 goto error; 292 goto error;
218 } 293 }
@@ -345,6 +420,9 @@ loop_lock:
345 pending = pending->bi_next; 420 pending = pending->bi_next;
346 cur->bi_next = NULL; 421 cur->bi_next = NULL;
347 422
423 /*
424 * atomic_dec_return implies a barrier for waitqueue_active
425 */
348 if (atomic_dec_return(&fs_info->nr_async_bios) < limit && 426 if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
349 waitqueue_active(&fs_info->async_submit_wait)) 427 waitqueue_active(&fs_info->async_submit_wait))
350 wake_up(&fs_info->async_submit_wait); 428 wake_up(&fs_info->async_submit_wait);
@@ -765,36 +843,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
765 843
766 mutex_lock(&fs_devices->device_list_mutex); 844 mutex_lock(&fs_devices->device_list_mutex);
767 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { 845 list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
768 struct btrfs_device *new_device; 846 btrfs_close_one_device(device);
769 struct rcu_string *name;
770
771 if (device->bdev)
772 fs_devices->open_devices--;
773
774 if (device->writeable &&
775 device->devid != BTRFS_DEV_REPLACE_DEVID) {
776 list_del_init(&device->dev_alloc_list);
777 fs_devices->rw_devices--;
778 }
779
780 if (device->missing)
781 fs_devices->missing_devices--;
782
783 new_device = btrfs_alloc_device(NULL, &device->devid,
784 device->uuid);
785 BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
786
787 /* Safe because we are under uuid_mutex */
788 if (device->name) {
789 name = rcu_string_strdup(device->name->str, GFP_NOFS);
790 BUG_ON(!name); /* -ENOMEM */
791 rcu_assign_pointer(new_device->name, name);
792 }
793
794 list_replace_rcu(&device->dev_list, &new_device->dev_list);
795 new_device->fs_devices = device->fs_devices;
796
797 call_rcu(&device->rcu, free_device);
798 } 847 }
799 mutex_unlock(&fs_devices->device_list_mutex); 848 mutex_unlock(&fs_devices->device_list_mutex);
800 849
@@ -1402,7 +1451,7 @@ again:
1402 extent = btrfs_item_ptr(leaf, path->slots[0], 1451 extent = btrfs_item_ptr(leaf, path->slots[0],
1403 struct btrfs_dev_extent); 1452 struct btrfs_dev_extent);
1404 } else { 1453 } else {
1405 btrfs_error(root->fs_info, ret, "Slot search failed"); 1454 btrfs_std_error(root->fs_info, ret, "Slot search failed");
1406 goto out; 1455 goto out;
1407 } 1456 }
1408 1457
@@ -1410,10 +1459,10 @@ again:
1410 1459
1411 ret = btrfs_del_item(trans, root, path); 1460 ret = btrfs_del_item(trans, root, path);
1412 if (ret) { 1461 if (ret) {
1413 btrfs_error(root->fs_info, ret, 1462 btrfs_std_error(root->fs_info, ret,
1414 "Failed to remove dev extent item"); 1463 "Failed to remove dev extent item");
1415 } else { 1464 } else {
1416 trans->transaction->have_free_bgs = 1; 1465 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1417 } 1466 }
1418out: 1467out:
1419 btrfs_free_path(path); 1468 btrfs_free_path(path);
@@ -1801,7 +1850,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1801 if (device->bdev) { 1850 if (device->bdev) {
1802 device->fs_devices->open_devices--; 1851 device->fs_devices->open_devices--;
1803 /* remove sysfs entry */ 1852 /* remove sysfs entry */
1804 btrfs_kobj_rm_device(root->fs_info->fs_devices, device); 1853 btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
1805 } 1854 }
1806 1855
1807 call_rcu(&device->rcu, free_device); 1856 call_rcu(&device->rcu, free_device);
@@ -1924,7 +1973,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1924 if (srcdev->writeable) { 1973 if (srcdev->writeable) {
1925 fs_devices->rw_devices--; 1974 fs_devices->rw_devices--;
1926 /* zero out the old super if it is writable */ 1975 /* zero out the old super if it is writable */
1927 btrfs_scratch_superblock(srcdev); 1976 btrfs_scratch_superblocks(srcdev->bdev,
1977 rcu_str_deref(srcdev->name));
1928 } 1978 }
1929 1979
1930 if (srcdev->bdev) 1980 if (srcdev->bdev)
@@ -1971,10 +2021,11 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1971 WARN_ON(!tgtdev); 2021 WARN_ON(!tgtdev);
1972 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2022 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1973 2023
1974 btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); 2024 btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
1975 2025
1976 if (tgtdev->bdev) { 2026 if (tgtdev->bdev) {
1977 btrfs_scratch_superblock(tgtdev); 2027 btrfs_scratch_superblocks(tgtdev->bdev,
2028 rcu_str_deref(tgtdev->name));
1978 fs_info->fs_devices->open_devices--; 2029 fs_info->fs_devices->open_devices--;
1979 } 2030 }
1980 fs_info->fs_devices->num_devices--; 2031 fs_info->fs_devices->num_devices--;
@@ -2041,10 +2092,8 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
2041 } 2092 }
2042 } 2093 }
2043 2094
2044 if (!*device) { 2095 if (!*device)
2045 btrfs_err(root->fs_info, "no missing device found"); 2096 return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2046 return -ENOENT;
2047 }
2048 2097
2049 return 0; 2098 return 0;
2050 } else { 2099 } else {
@@ -2309,7 +2358,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2309 tmp + 1); 2358 tmp + 1);
2310 2359
2311 /* add sysfs device entry */ 2360 /* add sysfs device entry */
2312 btrfs_kobj_add_device(root->fs_info->fs_devices, device); 2361 btrfs_sysfs_add_device_link(root->fs_info->fs_devices, device);
2313 2362
2314 /* 2363 /*
2315 * we've got more storage, clear any full flags on the space 2364 * we've got more storage, clear any full flags on the space
@@ -2350,9 +2399,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2350 */ 2399 */
2351 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", 2400 snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2352 root->fs_info->fsid); 2401 root->fs_info->fsid);
2353 if (kobject_rename(&root->fs_info->fs_devices->super_kobj, 2402 if (kobject_rename(&root->fs_info->fs_devices->fsid_kobj,
2354 fsid_buf)) 2403 fsid_buf))
2355 pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); 2404 btrfs_warn(root->fs_info,
2405 "sysfs: failed to create fsid for sprout");
2356 } 2406 }
2357 2407
2358 root->fs_info->num_tolerated_disk_barrier_failures = 2408 root->fs_info->num_tolerated_disk_barrier_failures =
@@ -2368,7 +2418,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2368 2418
2369 ret = btrfs_relocate_sys_chunks(root); 2419 ret = btrfs_relocate_sys_chunks(root);
2370 if (ret < 0) 2420 if (ret < 0)
2371 btrfs_error(root->fs_info, ret, 2421 btrfs_std_error(root->fs_info, ret,
2372 "Failed to relocate sys chunks after " 2422 "Failed to relocate sys chunks after "
2373 "device initialization. This can be fixed " 2423 "device initialization. This can be fixed "
2374 "using the \"btrfs balance\" command."); 2424 "using the \"btrfs balance\" command.");
@@ -2388,7 +2438,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2388error_trans: 2438error_trans:
2389 btrfs_end_transaction(trans, root); 2439 btrfs_end_transaction(trans, root);
2390 rcu_string_free(device->name); 2440 rcu_string_free(device->name);
2391 btrfs_kobj_rm_device(root->fs_info->fs_devices, device); 2441 btrfs_sysfs_rm_device_link(root->fs_info->fs_devices, device);
2392 kfree(device); 2442 kfree(device);
2393error: 2443error:
2394 blkdev_put(bdev, FMODE_EXCL); 2444 blkdev_put(bdev, FMODE_EXCL);
@@ -2613,7 +2663,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2613 if (ret < 0) 2663 if (ret < 0)
2614 goto out; 2664 goto out;
2615 else if (ret > 0) { /* Logic error or corruption */ 2665 else if (ret > 0) { /* Logic error or corruption */
2616 btrfs_error(root->fs_info, -ENOENT, 2666 btrfs_std_error(root->fs_info, -ENOENT,
2617 "Failed lookup while freeing chunk."); 2667 "Failed lookup while freeing chunk.");
2618 ret = -ENOENT; 2668 ret = -ENOENT;
2619 goto out; 2669 goto out;
@@ -2621,7 +2671,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2621 2671
2622 ret = btrfs_del_item(trans, root, path); 2672 ret = btrfs_del_item(trans, root, path);
2623 if (ret < 0) 2673 if (ret < 0)
2624 btrfs_error(root->fs_info, ret, 2674 btrfs_std_error(root->fs_info, ret,
2625 "Failed to delete chunk item."); 2675 "Failed to delete chunk item.");
2626out: 2676out:
2627 btrfs_free_path(path); 2677 btrfs_free_path(path);
@@ -2806,7 +2856,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
2806 trans = btrfs_start_transaction(root, 0); 2856 trans = btrfs_start_transaction(root, 0);
2807 if (IS_ERR(trans)) { 2857 if (IS_ERR(trans)) {
2808 ret = PTR_ERR(trans); 2858 ret = PTR_ERR(trans);
2809 btrfs_std_error(root->fs_info, ret); 2859 btrfs_std_error(root->fs_info, ret, NULL);
2810 return ret; 2860 return ret;
2811 } 2861 }
2812 2862
@@ -3009,16 +3059,19 @@ static void update_balance_args(struct btrfs_balance_control *bctl)
3009 * (albeit full) chunks. 3059 * (albeit full) chunks.
3010 */ 3060 */
3011 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) && 3061 if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3062 !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3012 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3063 !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3013 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE; 3064 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3014 bctl->data.usage = 90; 3065 bctl->data.usage = 90;
3015 } 3066 }
3016 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) && 3067 if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3068 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3017 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3069 !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3018 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE; 3070 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3019 bctl->sys.usage = 90; 3071 bctl->sys.usage = 90;
3020 } 3072 }
3021 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) && 3073 if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3074 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3022 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) { 3075 !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3023 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE; 3076 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3024 bctl->meta.usage = 90; 3077 bctl->meta.usage = 90;
@@ -3074,13 +3127,46 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3074 struct btrfs_balance_args *bargs) 3127 struct btrfs_balance_args *bargs)
3075{ 3128{
3076 struct btrfs_block_group_cache *cache; 3129 struct btrfs_block_group_cache *cache;
3130 u64 chunk_used;
3131 u64 user_thresh_min;
3132 u64 user_thresh_max;
3133 int ret = 1;
3134
3135 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3136 chunk_used = btrfs_block_group_used(&cache->item);
3137
3138 if (bargs->usage_min == 0)
3139 user_thresh_min = 0;
3140 else
3141 user_thresh_min = div_factor_fine(cache->key.offset,
3142 bargs->usage_min);
3143
3144 if (bargs->usage_max == 0)
3145 user_thresh_max = 1;
3146 else if (bargs->usage_max > 100)
3147 user_thresh_max = cache->key.offset;
3148 else
3149 user_thresh_max = div_factor_fine(cache->key.offset,
3150 bargs->usage_max);
3151
3152 if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3153 ret = 0;
3154
3155 btrfs_put_block_group(cache);
3156 return ret;
3157}
3158
3159static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info,
3160 u64 chunk_offset, struct btrfs_balance_args *bargs)
3161{
3162 struct btrfs_block_group_cache *cache;
3077 u64 chunk_used, user_thresh; 3163 u64 chunk_used, user_thresh;
3078 int ret = 1; 3164 int ret = 1;
3079 3165
3080 cache = btrfs_lookup_block_group(fs_info, chunk_offset); 3166 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3081 chunk_used = btrfs_block_group_used(&cache->item); 3167 chunk_used = btrfs_block_group_used(&cache->item);
3082 3168
3083 if (bargs->usage == 0) 3169 if (bargs->usage_min == 0)
3084 user_thresh = 1; 3170 user_thresh = 1;
3085 else if (bargs->usage > 100) 3171 else if (bargs->usage > 100)
3086 user_thresh = cache->key.offset; 3172 user_thresh = cache->key.offset;
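
The min/max body added above treats usage_min and usage_max as percentage bounds on the chunk's used bytes: both thresholds are computed as fractions of the chunk size, and the filter passes the chunk through only when its usage falls inside [min, max). A standalone sketch of that computation, with percent_of() standing in for the kernel's div_factor_fine():

#include <stdint.h>

/* percentage of a size, like the kernel's div_factor_fine() */
static uint64_t percent_of(uint64_t num, uint32_t percent)
{
	return num * percent / 100;
}

/* returns 0 when the chunk's usage is inside [min, max) and the chunk
 * should be balanced; 1 filters it out -- the same convention as the
 * kernel filter above */
static int usage_range_filter(uint64_t chunk_size, uint64_t chunk_used,
			      uint32_t usage_min, uint32_t usage_max)
{
	uint64_t lo = usage_min ? percent_of(chunk_size, usage_min) : 0;
	uint64_t hi;

	if (usage_max == 0)
		hi = 1;		/* only a completely empty chunk matches */
	else if (usage_max > 100)
		hi = chunk_size;
	else
		hi = percent_of(chunk_size, usage_max);

	return !(lo <= chunk_used && chunk_used < hi);
}
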
@@ -3170,6 +3256,19 @@ static int chunk_vrange_filter(struct extent_buffer *leaf,
3170 return 1; 3256 return 1;
3171} 3257}
3172 3258
3259static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3260 struct btrfs_chunk *chunk,
3261 struct btrfs_balance_args *bargs)
3262{
3263 int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3264
3265 if (bargs->stripes_min <= num_stripes
3266 && num_stripes <= bargs->stripes_max)
3267 return 0;
3268
3269 return 1;
3270}
3271
3173static int chunk_soft_convert_filter(u64 chunk_type, 3272static int chunk_soft_convert_filter(u64 chunk_type,
3174 struct btrfs_balance_args *bargs) 3273 struct btrfs_balance_args *bargs)
3175{ 3274{
@@ -3216,6 +3315,9 @@ static int should_balance_chunk(struct btrfs_root *root,
3216 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && 3315 if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3217 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) { 3316 chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
3218 return 0; 3317 return 0;
3318 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3319 chunk_usage_range_filter(bctl->fs_info, chunk_offset, bargs)) {
3320 return 0;
3219 } 3321 }
3220 3322
3221 /* devid filter */ 3323 /* devid filter */
@@ -3236,6 +3338,12 @@ static int should_balance_chunk(struct btrfs_root *root,
3236 return 0; 3338 return 0;
3237 } 3339 }
3238 3340
3341 /* stripes filter */
3342 if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3343 chunk_stripes_range_filter(leaf, chunk, bargs)) {
3344 return 0;
3345 }
3346
3239 /* soft profile changing mode */ 3347 /* soft profile changing mode */
3240 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && 3348 if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3241 chunk_soft_convert_filter(chunk_type, bargs)) { 3349 chunk_soft_convert_filter(chunk_type, bargs)) {
@@ -3250,6 +3358,16 @@ static int should_balance_chunk(struct btrfs_root *root,
3250 return 0; 3358 return 0;
3251 else 3359 else
3252 bargs->limit--; 3360 bargs->limit--;
3361 } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3362 /*
3363 * Same logic as the 'limit' filter; the minimum cannot be
3364 * determined here because we do not have the global information
3365 * about the count of all chunks that satisfy the filters.
3366 */
3367 if (bargs->limit_max == 0)
3368 return 0;
3369 else
3370 bargs->limit_max--;
3253 } 3371 }
3254 3372
3255 return 1; 3373 return 1;
@@ -3264,6 +3382,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3264 struct btrfs_device *device; 3382 struct btrfs_device *device;
3265 u64 old_size; 3383 u64 old_size;
3266 u64 size_to_free; 3384 u64 size_to_free;
3385 u64 chunk_type;
3267 struct btrfs_chunk *chunk; 3386 struct btrfs_chunk *chunk;
3268 struct btrfs_path *path; 3387 struct btrfs_path *path;
3269 struct btrfs_key key; 3388 struct btrfs_key key;
@@ -3274,9 +3393,13 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3274 int ret; 3393 int ret;
3275 int enospc_errors = 0; 3394 int enospc_errors = 0;
3276 bool counting = true; 3395 bool counting = true;
3396 /* The single value limit and min/max limits use the same bytes in the btrfs_balance_args union */
3277 u64 limit_data = bctl->data.limit; 3397 u64 limit_data = bctl->data.limit;
3278 u64 limit_meta = bctl->meta.limit; 3398 u64 limit_meta = bctl->meta.limit;
3279 u64 limit_sys = bctl->sys.limit; 3399 u64 limit_sys = bctl->sys.limit;
3400 u32 count_data = 0;
3401 u32 count_meta = 0;
3402 u32 count_sys = 0;
3280 3403
3281 /* step one make some room on all the devices */ 3404 /* step one make some room on all the devices */
3282 devices = &fs_info->fs_devices->devices; 3405 devices = &fs_info->fs_devices->devices;
@@ -3317,6 +3440,10 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3317 spin_unlock(&fs_info->balance_lock); 3440 spin_unlock(&fs_info->balance_lock);
3318again: 3441again:
3319 if (!counting) { 3442 if (!counting) {
3443 /*
3444 * The single value limit and min/max limits use the same bytes
3445 * in the btrfs_balance_args union.
3446 */
3320 bctl->data.limit = limit_data; 3447 bctl->data.limit = limit_data;
3321 bctl->meta.limit = limit_meta; 3448 bctl->meta.limit = limit_meta;
3322 bctl->sys.limit = limit_sys; 3449 bctl->sys.limit = limit_sys;
@@ -3364,6 +3491,7 @@ again:
3364 } 3491 }
3365 3492
3366 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); 3493 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3494 chunk_type = btrfs_chunk_type(leaf, chunk);
3367 3495
3368 if (!counting) { 3496 if (!counting) {
3369 spin_lock(&fs_info->balance_lock); 3497 spin_lock(&fs_info->balance_lock);
@@ -3384,6 +3512,28 @@ again:
3384 spin_lock(&fs_info->balance_lock); 3512 spin_lock(&fs_info->balance_lock);
3385 bctl->stat.expected++; 3513 bctl->stat.expected++;
3386 spin_unlock(&fs_info->balance_lock); 3514 spin_unlock(&fs_info->balance_lock);
3515
3516 if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3517 count_data++;
3518 else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3519 count_sys++;
3520 else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3521 count_meta++;
3522
3523 goto loop;
3524 }
3525
3526 /*
3527 * Apply limit_min filter, no need to check if the LIMITS
3528 * filter is used, limit_min is 0 by default
3529 */
3530 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3531 count_data < bctl->data.limit_min)
3532 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3533 count_meta < bctl->meta.limit_min)
3534 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3535 count_sys < bctl->sys.limit_min)) {
3536 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3387 goto loop; 3537 goto loop;
3388 } 3538 }
3389 3539
@@ -3461,11 +3611,20 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
3461 unset_balance_control(fs_info); 3611 unset_balance_control(fs_info);
3462 ret = del_balance_item(fs_info->tree_root); 3612 ret = del_balance_item(fs_info->tree_root);
3463 if (ret) 3613 if (ret)
3464 btrfs_std_error(fs_info, ret); 3614 btrfs_std_error(fs_info, ret, NULL);
3465 3615
3466 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3616 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3467} 3617}
3468 3618
3619/* Non-zero return value signifies invalidity */
3620static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
3621 u64 allowed)
3622{
3623 return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3624 (!alloc_profile_is_valid(bctl_arg->target, 1) ||
3625 (bctl_arg->target & ~allowed)));
3626}
3627
3469/* 3628/*
3470 * Should be called with both balance and volume mutexes held 3629 * Should be called with both balance and volume mutexes held
3471 */ 3630 */
@@ -3523,27 +3682,21 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3523 if (num_devices > 3) 3682 if (num_devices > 3)
3524 allowed |= (BTRFS_BLOCK_GROUP_RAID10 | 3683 allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3525 BTRFS_BLOCK_GROUP_RAID6); 3684 BTRFS_BLOCK_GROUP_RAID6);
3526 if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3685 if (validate_convert_profile(&bctl->data, allowed)) {
3527 (!alloc_profile_is_valid(bctl->data.target, 1) ||
3528 (bctl->data.target & ~allowed))) {
3529 btrfs_err(fs_info, "unable to start balance with target " 3686 btrfs_err(fs_info, "unable to start balance with target "
3530 "data profile %llu", 3687 "data profile %llu",
3531 bctl->data.target); 3688 bctl->data.target);
3532 ret = -EINVAL; 3689 ret = -EINVAL;
3533 goto out; 3690 goto out;
3534 } 3691 }
3535 if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3692 if (validate_convert_profile(&bctl->meta, allowed)) {
3536 (!alloc_profile_is_valid(bctl->meta.target, 1) ||
3537 (bctl->meta.target & ~allowed))) {
3538 btrfs_err(fs_info, 3693 btrfs_err(fs_info,
3539 "unable to start balance with target metadata profile %llu", 3694 "unable to start balance with target metadata profile %llu",
3540 bctl->meta.target); 3695 bctl->meta.target);
3541 ret = -EINVAL; 3696 ret = -EINVAL;
3542 goto out; 3697 goto out;
3543 } 3698 }
3544 if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && 3699 if (validate_convert_profile(&bctl->sys, allowed)) {
3545 (!alloc_profile_is_valid(bctl->sys.target, 1) ||
3546 (bctl->sys.target & ~allowed))) {
3547 btrfs_err(fs_info, 3700 btrfs_err(fs_info,
3548 "unable to start balance with target system profile %llu", 3701 "unable to start balance with target system profile %llu",
3549 bctl->sys.target); 3702 bctl->sys.target);
@@ -4285,65 +4438,6 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
4285 return 0; 4438 return 0;
4286} 4439}
4287 4440
4288static const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
4289 [BTRFS_RAID_RAID10] = {
4290 .sub_stripes = 2,
4291 .dev_stripes = 1,
4292 .devs_max = 0, /* 0 == as many as possible */
4293 .devs_min = 4,
4294 .devs_increment = 2,
4295 .ncopies = 2,
4296 },
4297 [BTRFS_RAID_RAID1] = {
4298 .sub_stripes = 1,
4299 .dev_stripes = 1,
4300 .devs_max = 2,
4301 .devs_min = 2,
4302 .devs_increment = 2,
4303 .ncopies = 2,
4304 },
4305 [BTRFS_RAID_DUP] = {
4306 .sub_stripes = 1,
4307 .dev_stripes = 2,
4308 .devs_max = 1,
4309 .devs_min = 1,
4310 .devs_increment = 1,
4311 .ncopies = 2,
4312 },
4313 [BTRFS_RAID_RAID0] = {
4314 .sub_stripes = 1,
4315 .dev_stripes = 1,
4316 .devs_max = 0,
4317 .devs_min = 2,
4318 .devs_increment = 1,
4319 .ncopies = 1,
4320 },
4321 [BTRFS_RAID_SINGLE] = {
4322 .sub_stripes = 1,
4323 .dev_stripes = 1,
4324 .devs_max = 1,
4325 .devs_min = 1,
4326 .devs_increment = 1,
4327 .ncopies = 1,
4328 },
4329 [BTRFS_RAID_RAID5] = {
4330 .sub_stripes = 1,
4331 .dev_stripes = 1,
4332 .devs_max = 0,
4333 .devs_min = 2,
4334 .devs_increment = 1,
4335 .ncopies = 2,
4336 },
4337 [BTRFS_RAID_RAID6] = {
4338 .sub_stripes = 1,
4339 .dev_stripes = 1,
4340 .devs_max = 0,
4341 .devs_min = 3,
4342 .devs_increment = 1,
4343 .ncopies = 3,
4344 },
4345};
4346
4347static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) 4441static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
4348{ 4442{
4349 /* TODO allow them to set a preferred stripe size */ 4443 /* TODO allow them to set a preferred stripe size */
@@ -6594,8 +6688,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6594 BUG_ON(!path); 6688 BUG_ON(!path);
6595 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); 6689 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
6596 if (ret < 0) { 6690 if (ret < 0) {
6597 printk_in_rcu(KERN_WARNING "BTRFS: " 6691 btrfs_warn_in_rcu(dev_root->fs_info,
6598 "error %d while searching for dev_stats item for device %s!\n", 6692 "error %d while searching for dev_stats item for device %s",
6599 ret, rcu_str_deref(device->name)); 6693 ret, rcu_str_deref(device->name));
6600 goto out; 6694 goto out;
6601 } 6695 }
@@ -6605,8 +6699,8 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6605 /* need to delete old one and insert a new one */ 6699 /* need to delete old one and insert a new one */
6606 ret = btrfs_del_item(trans, dev_root, path); 6700 ret = btrfs_del_item(trans, dev_root, path);
6607 if (ret != 0) { 6701 if (ret != 0) {
6608 printk_in_rcu(KERN_WARNING "BTRFS: " 6702 btrfs_warn_in_rcu(dev_root->fs_info,
6609 "delete too small dev_stats item for device %s failed %d!\n", 6703 "delete too small dev_stats item for device %s failed %d",
6610 rcu_str_deref(device->name), ret); 6704 rcu_str_deref(device->name), ret);
6611 goto out; 6705 goto out;
6612 } 6706 }
@@ -6619,9 +6713,9 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6619 ret = btrfs_insert_empty_item(trans, dev_root, path, 6713 ret = btrfs_insert_empty_item(trans, dev_root, path,
6620 &key, sizeof(*ptr)); 6714 &key, sizeof(*ptr));
6621 if (ret < 0) { 6715 if (ret < 0) {
6622 printk_in_rcu(KERN_WARNING "BTRFS: " 6716 btrfs_warn_in_rcu(dev_root->fs_info,
6623 "insert dev_stats item for device %s failed %d!\n", 6717 "insert dev_stats item for device %s failed %d",
6624 rcu_str_deref(device->name), ret); 6718 rcu_str_deref(device->name), ret);
6625 goto out; 6719 goto out;
6626 } 6720 }
6627 } 6721 }
@@ -6675,8 +6769,8 @@ static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
6675{ 6769{
6676 if (!dev->dev_stats_valid) 6770 if (!dev->dev_stats_valid)
6677 return; 6771 return;
6678 printk_ratelimited_in_rcu(KERN_ERR "BTRFS: " 6772 btrfs_err_rl_in_rcu(dev->dev_root->fs_info,
6679 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 6773 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
6680 rcu_str_deref(dev->name), 6774 rcu_str_deref(dev->name),
6681 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 6775 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6682 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 6776 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6695,8 +6789,8 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
6695 if (i == BTRFS_DEV_STAT_VALUES_MAX) 6789 if (i == BTRFS_DEV_STAT_VALUES_MAX)
6696 return; /* all values == 0, suppress message */ 6790 return; /* all values == 0, suppress message */
6697 6791
6698 printk_in_rcu(KERN_INFO "BTRFS: " 6792 btrfs_info_in_rcu(dev->dev_root->fs_info,
6699 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n", 6793 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
6700 rcu_str_deref(dev->name), 6794 rcu_str_deref(dev->name),
6701 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), 6795 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6702 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), 6796 btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
@@ -6740,22 +6834,34 @@ int btrfs_get_dev_stats(struct btrfs_root *root,
6740 return 0; 6834 return 0;
6741} 6835}
6742 6836
6743int btrfs_scratch_superblock(struct btrfs_device *device) 6837void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path)
6744{ 6838{
6745 struct buffer_head *bh; 6839 struct buffer_head *bh;
6746 struct btrfs_super_block *disk_super; 6840 struct btrfs_super_block *disk_super;
6841 int copy_num;
6747 6842
6748 bh = btrfs_read_dev_super(device->bdev); 6843 if (!bdev)
6749 if (!bh) 6844 return;
6750 return -EINVAL;
6751 disk_super = (struct btrfs_super_block *)bh->b_data;
6752 6845
6753 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 6846 for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
6754 set_buffer_dirty(bh); 6847 copy_num++) {
6755 sync_dirty_buffer(bh);
6756 brelse(bh);
6757 6848
6758 return 0; 6849 if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
6850 continue;
6851
6852 disk_super = (struct btrfs_super_block *)bh->b_data;
6853
6854 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
6855 set_buffer_dirty(bh);
6856 sync_dirty_buffer(bh);
6857 brelse(bh);
6858 }
6859
6860 /* Notify udev that device has changed */
6861 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
6862
6863 /* Update ctime/mtime for device path for libblkid */
6864 update_dev_time(device_path);
6759} 6865}
6760 6866
6761/* 6867/*
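
btrfs_scratch_superblocks() now loops over every superblock copy rather than wiping only the primary. The copies live at fixed, widely spaced offsets; the little program below mirrors the btrfs_sb_offset() placement rule (64KiB for copy 0, then 16KiB shifted left by 12 per mirror, giving 64MiB and 256GiB) -- an assumption drawn from the on-disk format, not shown in this diff:

#include <stdint.h>
#include <stdio.h>

#define SUPER_INFO_OFFSET	(64 * 1024ULL)
#define SUPER_MIRROR_MAX	3
#define SUPER_MIRROR_SHIFT	12

static uint64_t sb_offset(int mirror)
{
	uint64_t start = 16 * 1024ULL;

	if (mirror)
		return start << (SUPER_MIRROR_SHIFT * mirror);
	return SUPER_INFO_OFFSET;
}

int main(void)
{
	int i;

	/* the scratch loop must visit all of these, not just copy 0 */
	for (i = 0; i < SUPER_MIRROR_MAX; i++)
		printf("super copy %d at byte offset %llu\n", i,
		       (unsigned long long)sb_offset(i));
	return 0;
}
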
@@ -6823,3 +6929,38 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
6823 fs_devices = fs_devices->seed; 6929 fs_devices = fs_devices->seed;
6824 } 6930 }
6825} 6931}
6932
6933void btrfs_close_one_device(struct btrfs_device *device)
6934{
6935 struct btrfs_fs_devices *fs_devices = device->fs_devices;
6936 struct btrfs_device *new_device;
6937 struct rcu_string *name;
6938
6939 if (device->bdev)
6940 fs_devices->open_devices--;
6941
6942 if (device->writeable &&
6943 device->devid != BTRFS_DEV_REPLACE_DEVID) {
6944 list_del_init(&device->dev_alloc_list);
6945 fs_devices->rw_devices--;
6946 }
6947
6948 if (device->missing)
6949 fs_devices->missing_devices--;
6950
6951 new_device = btrfs_alloc_device(NULL, &device->devid,
6952 device->uuid);
6953 BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
6954
6955 /* Safe because we are under uuid_mutex */
6956 if (device->name) {
6957 name = rcu_string_strdup(device->name->str, GFP_NOFS);
6958 BUG_ON(!name); /* -ENOMEM */
6959 rcu_assign_pointer(new_device->name, name);
6960 }
6961
6962 list_replace_rcu(&device->dev_list, &new_device->dev_list);
6963 new_device->fs_devices = device->fs_devices;
6964
6965 call_rcu(&device->rcu, free_device);
6966}
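
btrfs_close_one_device() keeps the old close-path trick intact: lockless RCU readers may still hold the device, so rather than freeing it in place, the code publishes a fresh stub (same devid/uuid/name, no open state) with list_replace_rcu() and defers the free to call_rcu(). A schematic userspace analogue of that copy-publish-defer shape, with the grace period reduced to a stub (names and structure are illustrative only):

#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct device {
	char name[32];
	int open;
};

static _Atomic(struct device *) live_dev;

static void wait_for_readers(void)
{
	/* placeholder for a real grace period: call_rcu() in the
	 * kernel, synchronize_rcu() with liburcu in userspace */
}

static void close_one_device(void)
{
	struct device *old = atomic_load(&live_dev);
	struct device *fresh = calloc(1, sizeof(*fresh));

	if (!fresh)
		return;
	/* keep only the identity, like new_device inheriting
	 * devid/uuid/name but none of the open state */
	memcpy(fresh->name, old->name, sizeof(fresh->name));

	atomic_store(&live_dev, fresh);	/* list_replace_rcu() analogue */
	wait_for_readers();
	free(old);			/* safe: no reader can still see it */
}
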
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 595279a8b99f..ec5712372732 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -256,7 +256,7 @@ struct btrfs_fs_devices {
256 256
257 struct btrfs_fs_info *fs_info; 257 struct btrfs_fs_info *fs_info;
258 /* sysfs kobjects */ 258 /* sysfs kobjects */
259 struct kobject super_kobj; 259 struct kobject fsid_kobj;
260 struct kobject *device_dir_kobj; 260 struct kobject *device_dir_kobj;
261 struct completion kobj_unregister; 261 struct completion kobj_unregister;
262}; 262};
@@ -334,10 +334,15 @@ struct btrfs_raid_attr {
334 int dev_stripes; /* stripes per dev */ 334 int dev_stripes; /* stripes per dev */
335 int devs_max; /* max devs to use */ 335 int devs_max; /* max devs to use */
336 int devs_min; /* min devs needed */ 336 int devs_min; /* min devs needed */
337 int tolerated_failures; /* max tolerated fail devs */
337 int devs_increment; /* ndevs has to be a multiple of this */ 338 int devs_increment; /* ndevs has to be a multiple of this */
338 int ncopies; /* how many copies the data has */ 339 int ncopies; /* how many copies the data has */
339}; 340};
340 341
342extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
343
344extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
345
341struct map_lookup { 346struct map_lookup {
342 u64 type; 347 u64 type;
343 int io_align; 348 int io_align;
@@ -375,6 +380,9 @@ struct map_lookup {
375#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3) 380#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
376#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4) 381#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
377#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5) 382#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
383#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6)
384#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
385#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 8)
378 386
379#define BTRFS_BALANCE_ARGS_MASK \ 387#define BTRFS_BALANCE_ARGS_MASK \
380 (BTRFS_BALANCE_ARGS_PROFILES | \ 388 (BTRFS_BALANCE_ARGS_PROFILES | \
@@ -382,7 +390,10 @@ struct map_lookup {
382 BTRFS_BALANCE_ARGS_DEVID | \ 390 BTRFS_BALANCE_ARGS_DEVID | \
383 BTRFS_BALANCE_ARGS_DRANGE | \ 391 BTRFS_BALANCE_ARGS_DRANGE | \
384 BTRFS_BALANCE_ARGS_VRANGE | \ 392 BTRFS_BALANCE_ARGS_VRANGE | \
385 BTRFS_BALANCE_ARGS_LIMIT) 393 BTRFS_BALANCE_ARGS_LIMIT | \
394 BTRFS_BALANCE_ARGS_LIMIT_RANGE | \
395 BTRFS_BALANCE_ARGS_STRIPES_RANGE | \
396 BTRFS_BALANCE_ARGS_USAGE_RANGE)
386 397
387/* 398/*
388 * Profile changing flags. When SOFT is set we won't relocate chunk if 399 * Profile changing flags. When SOFT is set we won't relocate chunk if
@@ -482,7 +493,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
482 struct btrfs_device *tgtdev); 493 struct btrfs_device *tgtdev);
483void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, 494void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
484 struct btrfs_device *tgtdev); 495 struct btrfs_device *tgtdev);
485int btrfs_scratch_superblock(struct btrfs_device *device); 496void btrfs_scratch_superblocks(struct block_device *bdev, char *device_path);
486int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, 497int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
487 u64 logical, u64 len, int mirror_num); 498 u64 logical, u64 len, int mirror_num);
488unsigned long btrfs_full_stripe_len(struct btrfs_root *root, 499unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
@@ -555,5 +566,6 @@ static inline void unlock_chunks(struct btrfs_root *root)
555struct list_head *btrfs_get_fs_uuids(void); 566struct list_head *btrfs_get_fs_uuids(void);
556void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); 567void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
557void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); 568void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
569void btrfs_close_one_device(struct btrfs_device *device);
558 570
559#endif 571#endif
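
The three new filter bits are folded into BTRFS_BALANCE_ARGS_MASK so the kernel can refuse balance arguments carrying flags it does not understand. A sketch of that mask test (constant names abbreviated; placing the check in the balance ioctl path is an assumption, as the check itself is not in this hunk):

#include <stdint.h>

#define ARGS_PROFILES		(1ULL << 0)
#define ARGS_USAGE		(1ULL << 1)
#define ARGS_LIMIT		(1ULL << 5)
#define ARGS_LIMIT_RANGE	(1ULL << 6)
#define ARGS_STRIPES_RANGE	(1ULL << 7)
#define ARGS_USAGE_RANGE	(1ULL << 8)

#define ARGS_MASK	(ARGS_PROFILES | ARGS_USAGE | ARGS_LIMIT | \
			 ARGS_LIMIT_RANGE | ARGS_STRIPES_RANGE | \
			 ARGS_USAGE_RANGE)

/* a request carrying any flag outside the mask comes from a newer
 * userspace and must be refused rather than half-applied */
static int validate_args_flags(uint64_t flags)
{
	if (flags & ~ARGS_MASK)
		return -1;	/* the kernel would return -EINVAL */
	return 0;
}
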
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 0b73af9be12f..b4473dab39d6 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -1117,6 +1117,119 @@ DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
1117 TP_ARGS(wq) 1117 TP_ARGS(wq)
1118); 1118);
1119 1119
1120DECLARE_EVENT_CLASS(btrfs__qgroup_data_map,
1121
1122 TP_PROTO(struct inode *inode, u64 free_reserved),
1123
1124 TP_ARGS(inode, free_reserved),
1125
1126 TP_STRUCT__entry(
1127 __field( u64, rootid )
1128 __field( unsigned long, ino )
1129 __field( u64, free_reserved )
1130 ),
1131
1132 TP_fast_assign(
1133 __entry->rootid = BTRFS_I(inode)->root->objectid;
1134 __entry->ino = inode->i_ino;
1135 __entry->free_reserved = free_reserved;
1136 ),
1137
1138 TP_printk("rootid=%llu, ino=%lu, free_reserved=%llu",
1139 __entry->rootid, __entry->ino, __entry->free_reserved)
1140);
1141
1142DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_init_data_rsv_map,
1143
1144 TP_PROTO(struct inode *inode, u64 free_reserved),
1145
1146 TP_ARGS(inode, free_reserved)
1147);
1148
1149DEFINE_EVENT(btrfs__qgroup_data_map, btrfs_qgroup_free_data_rsv_map,
1150
1151 TP_PROTO(struct inode *inode, u64 free_reserved),
1152
1153 TP_ARGS(inode, free_reserved)
1154);
1155
1156#define BTRFS_QGROUP_OPERATIONS \
1157 { QGROUP_RESERVE, "reserve" }, \
1158 { QGROUP_RELEASE, "release" }, \
1159 { QGROUP_FREE, "free" }
1160
1161DECLARE_EVENT_CLASS(btrfs__qgroup_rsv_data,
1162
1163 TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
1164
1165 TP_ARGS(inode, start, len, reserved, op),
1166
1167 TP_STRUCT__entry(
1168 __field( u64, rootid )
1169 __field( unsigned long, ino )
1170 __field( u64, start )
1171 __field( u64, len )
1172 __field( u64, reserved )
1173 __field( int, op )
1174 ),
1175
1176 TP_fast_assign(
1177 __entry->rootid = BTRFS_I(inode)->root->objectid;
1178 __entry->ino = inode->i_ino;
1179 __entry->start = start;
1180 __entry->len = len;
1181 __entry->reserved = reserved;
1182 __entry->op = op;
1183 ),
1184
1185 TP_printk("root=%llu, ino=%lu, start=%llu, len=%llu, reserved=%llu, op=%s",
1186 __entry->rootid, __entry->ino, __entry->start, __entry->len,
1187 __entry->reserved,
1188 __print_flags((unsigned long)__entry->op, "",
1189 BTRFS_QGROUP_OPERATIONS)
1190 )
1191);
1192
1193DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_reserve_data,
1194
1195 TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
1196
1197 TP_ARGS(inode, start, len, reserved, op)
1198);
1199
1200DEFINE_EVENT(btrfs__qgroup_rsv_data, btrfs_qgroup_release_data,
1201
1202 TP_PROTO(struct inode *inode, u64 start, u64 len, u64 reserved, int op),
1203
1204 TP_ARGS(inode, start, len, reserved, op)
1205);
1206
1207DECLARE_EVENT_CLASS(btrfs__qgroup_delayed_ref,
1208
1209 TP_PROTO(u64 ref_root, u64 reserved),
1210
1211 TP_ARGS(ref_root, reserved),
1212
1213 TP_STRUCT__entry(
1214 __field( u64, ref_root )
1215 __field( u64, reserved )
1216 ),
1217
1218 TP_fast_assign(
1219 __entry->ref_root = ref_root;
1220 __entry->reserved = reserved;
1221 ),
1222
1223 TP_printk("root=%llu, reserved=%llu, op=free",
1224 __entry->ref_root, __entry->reserved)
1225);
1226
1227DEFINE_EVENT(btrfs__qgroup_delayed_ref, btrfs_qgroup_free_delayed_ref,
1228
1229 TP_PROTO(u64 ref_root, u64 reserved),
1230
1231 TP_ARGS(ref_root, reserved)
1232);
1120#endif /* _TRACE_BTRFS_H */ 1233#endif /* _TRACE_BTRFS_H */
1121 1234
1122/* This part must be outside protection */ 1235/* This part must be outside protection */
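[Editorial note] Each DEFINE_EVENT above generates a trace_<name>() stub that the filesystem calls at the instrumented site. A one-line sketch of a call site for the reserve event, assuming the QGROUP_RESERVE operation code introduced by the qgroup rework in this series:

	/* Illustrative call site; the real hooks live in the qgroup reserve path. */
	trace_btrfs_qgroup_reserve_data(inode, start, len, reserved, QGROUP_RESERVE);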
diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h
index b6dec05c7196..dea893199257 100644
--- a/include/uapi/linux/btrfs.h
+++ b/include/uapi/linux/btrfs.h
@@ -206,7 +206,13 @@ struct btrfs_ioctl_feature_flags {
206 */ 206 */
207struct btrfs_balance_args { 207struct btrfs_balance_args {
208 __u64 profiles; 208 __u64 profiles;
209 __u64 usage; 209 union {
210 __le64 usage;
211 struct {
212 __le32 usage_min;
213 __le32 usage_max;
214 };
215 };
210 __u64 devid; 216 __u64 devid;
211 __u64 pstart; 217 __u64 pstart;
212 __u64 pend; 218 __u64 pend;
@@ -217,8 +223,27 @@ struct btrfs_balance_args {
217 223
218 __u64 flags; 224 __u64 flags;
219 225
220 __u64 limit; /* limit number of processed chunks */ 226 /*
221 __u64 unused[7]; 227 * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
228 * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extended version can use minimum
229 * and maximum
230 */
231 union {
232 __u64 limit; /* limit number of processed chunks */
233 struct {
234 __u32 limit_min;
235 __u32 limit_max;
236 };
237 };
238
239 /*
240 * Process chunks that cross stripes_min..stripes_max devices,
241 * BTRFS_BALANCE_ARGS_STRIPES_RANGE
242 */
243 __le32 stripes_min;
244 __le32 stripes_max;
245
246 __u64 unused[6];
222} __attribute__ ((__packed__)); 247} __attribute__ ((__packed__));
223 248
224/* report balance progress to userspace */ 249/* report balance progress to userspace */
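[Editorial note] From userspace, the extended filters are used by filling the new 32-bit pairs in place of the old single value and setting the matching *_RANGE bit. A hedged sketch that balances only data chunks between 10% and 50% full; the two flag values are mirrored locally because in this tree they are kernel-internal (fs/btrfs/volumes.h above, not this uapi header), and the function name is illustrative:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/btrfs.h>

	/* Mirror kernel-internal flag values of this era. */
	#ifndef BTRFS_BALANCE_DATA
	#define BTRFS_BALANCE_DATA		(1ULL << 0)
	#endif
	#define BTRFS_BALANCE_ARGS_USAGE_RANGE	(1ULL << 8)

	/* Balance only data chunks whose usage falls within [10%, 50%]. */
	static int balance_usage_range(const char *mnt)
	{
		struct btrfs_ioctl_balance_args args;
		int fd, ret;

		fd = open(mnt, O_RDONLY);
		if (fd < 0)
			return -1;

		memset(&args, 0, sizeof(args));
		args.flags = BTRFS_BALANCE_DATA;	/* act on data chunks only */
		args.data.usage_min = 10;
		args.data.usage_max = 50;
		args.data.flags = BTRFS_BALANCE_ARGS_USAGE_RANGE;

		ret = ioctl(fd, BTRFS_IOC_BALANCE_V2, &args);
		close(fd);
		return ret;
	}

The limit_min/limit_max and stripes_min/stripes_max pairs are set the same way, paired with BTRFS_BALANCE_ARGS_LIMIT_RANGE and BTRFS_BALANCE_ARGS_STRIPES_RANGE respectively.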