aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-05-21 13:49:22 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-05-21 13:49:22 -0400
commit07be1337b9e8bfcd855c6e9175b5066a30ac609b (patch)
treee40ad01dc89f6eb17d461939b809fea3387fc2a5 /fs/btrfs
parent63d222b9d277c4d7bf08afd1631a7f8e327a825c (diff)
parentc315ef8d9db7f1a0ebd023a395ebdfde1c68057e (diff)
Merge branch 'for-linus-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
Pull btrfs updates from Chris Mason: "This has our merge window series of cleanups and fixes. These target a wide range of issues, but do include some important fixes for qgroups, O_DIRECT, and fsync handling. Jeff Mahoney moved around a few definitions to make them easier for userland to consume. Also whiteout support is included now that issues with overlayfs have been cleared up. I have one more fix pending for page faults during btrfs_copy_from_user, but I wanted to get this bulk out the door first" * 'for-linus-4.7' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (90 commits) btrfs: fix memory leak during RAID 5/6 device replacement Btrfs: add semaphore to synchronize direct IO writes with fsync Btrfs: fix race between block group relocation and nocow writes Btrfs: fix race between fsync and direct IO writes for prealloc extents Btrfs: fix number of transaction units for renames with whiteout Btrfs: pin logs earlier when doing a rename exchange operation Btrfs: unpin logs if rename exchange operation fails Btrfs: fix inode leak on failure to setup whiteout inode in rename btrfs: add support for RENAME_EXCHANGE and RENAME_WHITEOUT Btrfs: pin log earlier when renaming Btrfs: unpin log if rename operation fails Btrfs: don't do unnecessary delalloc flushes when relocating Btrfs: don't wait for unrelated IO to finish before relocation Btrfs: fix empty symlink after creating symlink and fsync parent dir Btrfs: fix for incorrect directory entries after fsync log replay btrfs: build fixup for qgroup_account_snapshot btrfs: qgroup: Fix qgroup accounting when creating snapshot Btrfs: fix fspath error deallocation btrfs: make find_workspace warn if there are no workspaces btrfs: make find_workspace always succeed ...
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/backref.c2
-rw-r--r--fs/btrfs/btrfs_inode.h10
-rw-r--r--fs/btrfs/compression.c85
-rw-r--r--fs/btrfs/ctree.c6
-rw-r--r--fs/btrfs/ctree.h1123
-rw-r--r--fs/btrfs/delayed-inode.c2
-rw-r--r--fs/btrfs/dev-replace.c101
-rw-r--r--fs/btrfs/dev-replace.h4
-rw-r--r--fs/btrfs/disk-io.c130
-rw-r--r--fs/btrfs/extent-tree.c167
-rw-r--r--fs/btrfs/extent_io.c82
-rw-r--r--fs/btrfs/extent_io.h1
-rw-r--r--fs/btrfs/file.c6
-rw-r--r--fs/btrfs/inode-item.c2
-rw-r--r--fs/btrfs/inode.c466
-rw-r--r--fs/btrfs/ioctl.c198
-rw-r--r--fs/btrfs/ordered-data.c26
-rw-r--r--fs/btrfs/ordered-data.h6
-rw-r--r--fs/btrfs/relocation.c13
-rw-r--r--fs/btrfs/root-tree.c4
-rw-r--r--fs/btrfs/scrub.c25
-rw-r--r--fs/btrfs/send.c62
-rw-r--r--fs/btrfs/super.c60
-rw-r--r--fs/btrfs/sysfs.c14
-rw-r--r--fs/btrfs/transaction.c138
-rw-r--r--fs/btrfs/tree-log.c74
-rw-r--r--fs/btrfs/volumes.c454
-rw-r--r--fs/btrfs/volumes.h57
28 files changed, 1530 insertions, 1788 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 80e8472d618b..d3090187fd76 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1991,7 +1991,7 @@ struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
1991 1991
1992 ifp = kmalloc(sizeof(*ifp), GFP_NOFS); 1992 ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
1993 if (!ifp) { 1993 if (!ifp) {
1994 kfree(fspath); 1994 vfree(fspath);
1995 return ERR_PTR(-ENOMEM); 1995 return ERR_PTR(-ENOMEM);
1996 } 1996 }
1997 1997
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 61205e3bbefa..1da5753d886d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -196,6 +196,16 @@ struct btrfs_inode {
196 struct list_head delayed_iput; 196 struct list_head delayed_iput;
197 long delayed_iput_count; 197 long delayed_iput_count;
198 198
199 /*
200 * To avoid races between lockless (i_mutex not held) direct IO writes
201 * and concurrent fsync requests. Direct IO writes must acquire read
202 * access on this semaphore for creating an extent map and its
203 * corresponding ordered extent. The fast fsync path must acquire write
204 * access on this semaphore before it collects ordered extents and
205 * extent maps.
206 */
207 struct rw_semaphore dio_sem;
208
199 struct inode vfs_inode; 209 struct inode vfs_inode;
200}; 210};
201 211
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ff61a41ac90b..658c39b70fba 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -743,8 +743,11 @@ out:
743static struct { 743static struct {
744 struct list_head idle_ws; 744 struct list_head idle_ws;
745 spinlock_t ws_lock; 745 spinlock_t ws_lock;
746 int num_ws; 746 /* Number of free workspaces */
747 atomic_t alloc_ws; 747 int free_ws;
748 /* Total number of allocated workspaces */
749 atomic_t total_ws;
750 /* Waiters for a free workspace */
748 wait_queue_head_t ws_wait; 751 wait_queue_head_t ws_wait;
749} btrfs_comp_ws[BTRFS_COMPRESS_TYPES]; 752} btrfs_comp_ws[BTRFS_COMPRESS_TYPES];
750 753
@@ -758,16 +761,34 @@ void __init btrfs_init_compress(void)
758 int i; 761 int i;
759 762
760 for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) { 763 for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
764 struct list_head *workspace;
765
761 INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws); 766 INIT_LIST_HEAD(&btrfs_comp_ws[i].idle_ws);
762 spin_lock_init(&btrfs_comp_ws[i].ws_lock); 767 spin_lock_init(&btrfs_comp_ws[i].ws_lock);
763 atomic_set(&btrfs_comp_ws[i].alloc_ws, 0); 768 atomic_set(&btrfs_comp_ws[i].total_ws, 0);
764 init_waitqueue_head(&btrfs_comp_ws[i].ws_wait); 769 init_waitqueue_head(&btrfs_comp_ws[i].ws_wait);
770
771 /*
772 * Preallocate one workspace for each compression type so
773 * we can guarantee forward progress in the worst case
774 */
775 workspace = btrfs_compress_op[i]->alloc_workspace();
776 if (IS_ERR(workspace)) {
777 printk(KERN_WARNING
778 "BTRFS: cannot preallocate compression workspace, will try later");
779 } else {
780 atomic_set(&btrfs_comp_ws[i].total_ws, 1);
781 btrfs_comp_ws[i].free_ws = 1;
782 list_add(workspace, &btrfs_comp_ws[i].idle_ws);
783 }
765 } 784 }
766} 785}
767 786
768/* 787/*
769 * this finds an available workspace or allocates a new one 788 * This finds an available workspace or allocates a new one.
770 * ERR_PTR is returned if things go bad. 789 * If it's not possible to allocate a new one, waits until there's one.
790 * Preallocation makes a forward progress guarantees and we do not return
791 * errors.
771 */ 792 */
772static struct list_head *find_workspace(int type) 793static struct list_head *find_workspace(int type)
773{ 794{
@@ -777,36 +798,58 @@ static struct list_head *find_workspace(int type)
777 798
778 struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; 799 struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
779 spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; 800 spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
780 atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; 801 atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws;
781 wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; 802 wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
782 int *num_ws = &btrfs_comp_ws[idx].num_ws; 803 int *free_ws = &btrfs_comp_ws[idx].free_ws;
783again: 804again:
784 spin_lock(ws_lock); 805 spin_lock(ws_lock);
785 if (!list_empty(idle_ws)) { 806 if (!list_empty(idle_ws)) {
786 workspace = idle_ws->next; 807 workspace = idle_ws->next;
787 list_del(workspace); 808 list_del(workspace);
788 (*num_ws)--; 809 (*free_ws)--;
789 spin_unlock(ws_lock); 810 spin_unlock(ws_lock);
790 return workspace; 811 return workspace;
791 812
792 } 813 }
793 if (atomic_read(alloc_ws) > cpus) { 814 if (atomic_read(total_ws) > cpus) {
794 DEFINE_WAIT(wait); 815 DEFINE_WAIT(wait);
795 816
796 spin_unlock(ws_lock); 817 spin_unlock(ws_lock);
797 prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE); 818 prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
798 if (atomic_read(alloc_ws) > cpus && !*num_ws) 819 if (atomic_read(total_ws) > cpus && !*free_ws)
799 schedule(); 820 schedule();
800 finish_wait(ws_wait, &wait); 821 finish_wait(ws_wait, &wait);
801 goto again; 822 goto again;
802 } 823 }
803 atomic_inc(alloc_ws); 824 atomic_inc(total_ws);
804 spin_unlock(ws_lock); 825 spin_unlock(ws_lock);
805 826
806 workspace = btrfs_compress_op[idx]->alloc_workspace(); 827 workspace = btrfs_compress_op[idx]->alloc_workspace();
807 if (IS_ERR(workspace)) { 828 if (IS_ERR(workspace)) {
808 atomic_dec(alloc_ws); 829 atomic_dec(total_ws);
809 wake_up(ws_wait); 830 wake_up(ws_wait);
831
832 /*
833 * Do not return the error but go back to waiting. There's a
834 * workspace preallocated for each type and the compression
835 * time is bounded so we get to a workspace eventually. This
836 * makes our caller's life easier.
837 *
838 * To prevent silent and low-probability deadlocks (when the
839 * initial preallocation fails), check if there are any
840 * workspaces at all.
841 */
842 if (atomic_read(total_ws) == 0) {
843 static DEFINE_RATELIMIT_STATE(_rs,
844 /* once per minute */ 60 * HZ,
845 /* no burst */ 1);
846
847 if (__ratelimit(&_rs)) {
848 printk(KERN_WARNING
849 "no compression workspaces, low memory, retrying");
850 }
851 }
852 goto again;
810 } 853 }
811 return workspace; 854 return workspace;
812} 855}
@@ -820,21 +863,21 @@ static void free_workspace(int type, struct list_head *workspace)
820 int idx = type - 1; 863 int idx = type - 1;
821 struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws; 864 struct list_head *idle_ws = &btrfs_comp_ws[idx].idle_ws;
822 spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock; 865 spinlock_t *ws_lock = &btrfs_comp_ws[idx].ws_lock;
823 atomic_t *alloc_ws = &btrfs_comp_ws[idx].alloc_ws; 866 atomic_t *total_ws = &btrfs_comp_ws[idx].total_ws;
824 wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait; 867 wait_queue_head_t *ws_wait = &btrfs_comp_ws[idx].ws_wait;
825 int *num_ws = &btrfs_comp_ws[idx].num_ws; 868 int *free_ws = &btrfs_comp_ws[idx].free_ws;
826 869
827 spin_lock(ws_lock); 870 spin_lock(ws_lock);
828 if (*num_ws < num_online_cpus()) { 871 if (*free_ws < num_online_cpus()) {
829 list_add(workspace, idle_ws); 872 list_add(workspace, idle_ws);
830 (*num_ws)++; 873 (*free_ws)++;
831 spin_unlock(ws_lock); 874 spin_unlock(ws_lock);
832 goto wake; 875 goto wake;
833 } 876 }
834 spin_unlock(ws_lock); 877 spin_unlock(ws_lock);
835 878
836 btrfs_compress_op[idx]->free_workspace(workspace); 879 btrfs_compress_op[idx]->free_workspace(workspace);
837 atomic_dec(alloc_ws); 880 atomic_dec(total_ws);
838wake: 881wake:
839 /* 882 /*
840 * Make sure counter is updated before we wake up waiters. 883 * Make sure counter is updated before we wake up waiters.
@@ -857,7 +900,7 @@ static void free_workspaces(void)
857 workspace = btrfs_comp_ws[i].idle_ws.next; 900 workspace = btrfs_comp_ws[i].idle_ws.next;
858 list_del(workspace); 901 list_del(workspace);
859 btrfs_compress_op[i]->free_workspace(workspace); 902 btrfs_compress_op[i]->free_workspace(workspace);
860 atomic_dec(&btrfs_comp_ws[i].alloc_ws); 903 atomic_dec(&btrfs_comp_ws[i].total_ws);
861 } 904 }
862 } 905 }
863} 906}
@@ -894,8 +937,6 @@ int btrfs_compress_pages(int type, struct address_space *mapping,
894 int ret; 937 int ret;
895 938
896 workspace = find_workspace(type); 939 workspace = find_workspace(type);
897 if (IS_ERR(workspace))
898 return PTR_ERR(workspace);
899 940
900 ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping, 941 ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
901 start, len, pages, 942 start, len, pages,
@@ -930,8 +971,6 @@ static int btrfs_decompress_biovec(int type, struct page **pages_in,
930 int ret; 971 int ret;
931 972
932 workspace = find_workspace(type); 973 workspace = find_workspace(type);
933 if (IS_ERR(workspace))
934 return PTR_ERR(workspace);
935 974
936 ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in, 975 ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
937 disk_start, 976 disk_start,
@@ -952,8 +991,6 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
952 int ret; 991 int ret;
953 992
954 workspace = find_workspace(type); 993 workspace = find_workspace(type);
955 if (IS_ERR(workspace))
956 return PTR_ERR(workspace);
957 994
958 ret = btrfs_compress_op[type-1]->decompress(workspace, data_in, 995 ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
959 dest_page, start_byte, 996 dest_page, start_byte,
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ec7928a27aaa..decd0a3f5d61 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1011,7 +1011,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
1011 return ret; 1011 return ret;
1012 if (refs == 0) { 1012 if (refs == 0) {
1013 ret = -EROFS; 1013 ret = -EROFS;
1014 btrfs_std_error(root->fs_info, ret, NULL); 1014 btrfs_handle_fs_error(root->fs_info, ret, NULL);
1015 return ret; 1015 return ret;
1016 } 1016 }
1017 } else { 1017 } else {
@@ -1928,7 +1928,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1928 child = read_node_slot(root, mid, 0); 1928 child = read_node_slot(root, mid, 0);
1929 if (!child) { 1929 if (!child) {
1930 ret = -EROFS; 1930 ret = -EROFS;
1931 btrfs_std_error(root->fs_info, ret, NULL); 1931 btrfs_handle_fs_error(root->fs_info, ret, NULL);
1932 goto enospc; 1932 goto enospc;
1933 } 1933 }
1934 1934
@@ -2031,7 +2031,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
2031 */ 2031 */
2032 if (!left) { 2032 if (!left) {
2033 ret = -EROFS; 2033 ret = -EROFS;
2034 btrfs_std_error(root->fs_info, ret, NULL); 2034 btrfs_handle_fs_error(root->fs_info, ret, NULL);
2035 goto enospc; 2035 goto enospc;
2036 } 2036 }
2037 wret = balance_node_right(trans, root, mid, left); 2037 wret = balance_node_right(trans, root, mid, left);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 84a6a5b3384a..ddcc58f03c79 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -33,6 +33,7 @@
33#include <asm/kmap_types.h> 33#include <asm/kmap_types.h>
34#include <linux/pagemap.h> 34#include <linux/pagemap.h>
35#include <linux/btrfs.h> 35#include <linux/btrfs.h>
36#include <linux/btrfs_tree.h>
36#include <linux/workqueue.h> 37#include <linux/workqueue.h>
37#include <linux/security.h> 38#include <linux/security.h>
38#include <linux/sizes.h> 39#include <linux/sizes.h>
@@ -64,98 +65,6 @@ struct btrfs_ordered_sum;
64 65
65#define BTRFS_COMPAT_EXTENT_TREE_V0 66#define BTRFS_COMPAT_EXTENT_TREE_V0
66 67
67/* holds pointers to all of the tree roots */
68#define BTRFS_ROOT_TREE_OBJECTID 1ULL
69
70/* stores information about which extents are in use, and reference counts */
71#define BTRFS_EXTENT_TREE_OBJECTID 2ULL
72
73/*
74 * chunk tree stores translations from logical -> physical block numbering
75 * the super block points to the chunk tree
76 */
77#define BTRFS_CHUNK_TREE_OBJECTID 3ULL
78
79/*
80 * stores information about which areas of a given device are in use.
81 * one per device. The tree of tree roots points to the device tree
82 */
83#define BTRFS_DEV_TREE_OBJECTID 4ULL
84
85/* one per subvolume, storing files and directories */
86#define BTRFS_FS_TREE_OBJECTID 5ULL
87
88/* directory objectid inside the root tree */
89#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL
90
91/* holds checksums of all the data extents */
92#define BTRFS_CSUM_TREE_OBJECTID 7ULL
93
94/* holds quota configuration and tracking */
95#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
96
97/* for storing items that use the BTRFS_UUID_KEY* types */
98#define BTRFS_UUID_TREE_OBJECTID 9ULL
99
100/* tracks free space in block groups. */
101#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
102
103/* device stats in the device tree */
104#define BTRFS_DEV_STATS_OBJECTID 0ULL
105
106/* for storing balance parameters in the root tree */
107#define BTRFS_BALANCE_OBJECTID -4ULL
108
109/* orhpan objectid for tracking unlinked/truncated files */
110#define BTRFS_ORPHAN_OBJECTID -5ULL
111
112/* does write ahead logging to speed up fsyncs */
113#define BTRFS_TREE_LOG_OBJECTID -6ULL
114#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL
115
116/* for space balancing */
117#define BTRFS_TREE_RELOC_OBJECTID -8ULL
118#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL
119
120/*
121 * extent checksums all have this objectid
122 * this allows them to share the logging tree
123 * for fsyncs
124 */
125#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
126
127/* For storing free space cache */
128#define BTRFS_FREE_SPACE_OBJECTID -11ULL
129
130/*
131 * The inode number assigned to the special inode for storing
132 * free ino cache
133 */
134#define BTRFS_FREE_INO_OBJECTID -12ULL
135
136/* dummy objectid represents multiple objectids */
137#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
138
139/*
140 * All files have objectids in this range.
141 */
142#define BTRFS_FIRST_FREE_OBJECTID 256ULL
143#define BTRFS_LAST_FREE_OBJECTID -256ULL
144#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL
145
146
147/*
148 * the device items go into the chunk tree. The key is in the form
149 * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
150 */
151#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
152
153#define BTRFS_BTREE_INODE_OBJECTID 1
154
155#define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
156
157#define BTRFS_DEV_REPLACE_DEVID 0ULL
158
159/* 68/*
160 * the max metadata block size. This limit is somewhat artificial, 69 * the max metadata block size. This limit is somewhat artificial,
161 * but the memmove costs go through the roof for larger blocks. 70 * but the memmove costs go through the roof for larger blocks.
@@ -175,12 +84,6 @@ struct btrfs_ordered_sum;
175 */ 84 */
176#define BTRFS_LINK_MAX 65535U 85#define BTRFS_LINK_MAX 65535U
177 86
178/* 32 bytes in various csum fields */
179#define BTRFS_CSUM_SIZE 32
180
181/* csum types */
182#define BTRFS_CSUM_TYPE_CRC32 0
183
184static const int btrfs_csum_sizes[] = { 4 }; 87static const int btrfs_csum_sizes[] = { 4 };
185 88
186/* four bytes for CRC32 */ 89/* four bytes for CRC32 */
@@ -189,17 +92,6 @@ static const int btrfs_csum_sizes[] = { 4 };
189/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */ 92/* spefic to btrfs_map_block(), therefore not in include/linux/blk_types.h */
190#define REQ_GET_READ_MIRRORS (1 << 30) 93#define REQ_GET_READ_MIRRORS (1 << 30)
191 94
192#define BTRFS_FT_UNKNOWN 0
193#define BTRFS_FT_REG_FILE 1
194#define BTRFS_FT_DIR 2
195#define BTRFS_FT_CHRDEV 3
196#define BTRFS_FT_BLKDEV 4
197#define BTRFS_FT_FIFO 5
198#define BTRFS_FT_SOCK 6
199#define BTRFS_FT_SYMLINK 7
200#define BTRFS_FT_XATTR 8
201#define BTRFS_FT_MAX 9
202
203/* ioprio of readahead is set to idle */ 95/* ioprio of readahead is set to idle */
204#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) 96#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
205 97
@@ -207,138 +99,10 @@ static const int btrfs_csum_sizes[] = { 4 };
207 99
208#define BTRFS_MAX_EXTENT_SIZE SZ_128M 100#define BTRFS_MAX_EXTENT_SIZE SZ_128M
209 101
210/*
211 * The key defines the order in the tree, and so it also defines (optimal)
212 * block layout.
213 *
214 * objectid corresponds to the inode number.
215 *
216 * type tells us things about the object, and is a kind of stream selector.
217 * so for a given inode, keys with type of 1 might refer to the inode data,
218 * type of 2 may point to file data in the btree and type == 3 may point to
219 * extents.
220 *
221 * offset is the starting byte offset for this key in the stream.
222 *
223 * btrfs_disk_key is in disk byte order. struct btrfs_key is always
224 * in cpu native order. Otherwise they are identical and their sizes
225 * should be the same (ie both packed)
226 */
227struct btrfs_disk_key {
228 __le64 objectid;
229 u8 type;
230 __le64 offset;
231} __attribute__ ((__packed__));
232
233struct btrfs_key {
234 u64 objectid;
235 u8 type;
236 u64 offset;
237} __attribute__ ((__packed__));
238
239struct btrfs_mapping_tree { 102struct btrfs_mapping_tree {
240 struct extent_map_tree map_tree; 103 struct extent_map_tree map_tree;
241}; 104};
242 105
243struct btrfs_dev_item {
244 /* the internal btrfs device id */
245 __le64 devid;
246
247 /* size of the device */
248 __le64 total_bytes;
249
250 /* bytes used */
251 __le64 bytes_used;
252
253 /* optimal io alignment for this device */
254 __le32 io_align;
255
256 /* optimal io width for this device */
257 __le32 io_width;
258
259 /* minimal io size for this device */
260 __le32 sector_size;
261
262 /* type and info about this device */
263 __le64 type;
264
265 /* expected generation for this device */
266 __le64 generation;
267
268 /*
269 * starting byte of this partition on the device,
270 * to allow for stripe alignment in the future
271 */
272 __le64 start_offset;
273
274 /* grouping information for allocation decisions */
275 __le32 dev_group;
276
277 /* seek speed 0-100 where 100 is fastest */
278 u8 seek_speed;
279
280 /* bandwidth 0-100 where 100 is fastest */
281 u8 bandwidth;
282
283 /* btrfs generated uuid for this device */
284 u8 uuid[BTRFS_UUID_SIZE];
285
286 /* uuid of FS who owns this device */
287 u8 fsid[BTRFS_UUID_SIZE];
288} __attribute__ ((__packed__));
289
290struct btrfs_stripe {
291 __le64 devid;
292 __le64 offset;
293 u8 dev_uuid[BTRFS_UUID_SIZE];
294} __attribute__ ((__packed__));
295
296struct btrfs_chunk {
297 /* size of this chunk in bytes */
298 __le64 length;
299
300 /* objectid of the root referencing this chunk */
301 __le64 owner;
302
303 __le64 stripe_len;
304 __le64 type;
305
306 /* optimal io alignment for this chunk */
307 __le32 io_align;
308
309 /* optimal io width for this chunk */
310 __le32 io_width;
311
312 /* minimal io size for this chunk */
313 __le32 sector_size;
314
315 /* 2^16 stripes is quite a lot, a second limit is the size of a single
316 * item in the btree
317 */
318 __le16 num_stripes;
319
320 /* sub stripes only matter for raid10 */
321 __le16 sub_stripes;
322 struct btrfs_stripe stripe;
323 /* additional stripes go here */
324} __attribute__ ((__packed__));
325
326#define BTRFS_FREE_SPACE_EXTENT 1
327#define BTRFS_FREE_SPACE_BITMAP 2
328
329struct btrfs_free_space_entry {
330 __le64 offset;
331 __le64 bytes;
332 u8 type;
333} __attribute__ ((__packed__));
334
335struct btrfs_free_space_header {
336 struct btrfs_disk_key location;
337 __le64 generation;
338 __le64 num_entries;
339 __le64 num_bitmaps;
340} __attribute__ ((__packed__));
341
342static inline unsigned long btrfs_chunk_item_size(int num_stripes) 106static inline unsigned long btrfs_chunk_item_size(int num_stripes)
343{ 107{
344 BUG_ON(num_stripes == 0); 108 BUG_ON(num_stripes == 0);
@@ -346,9 +110,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
346 sizeof(struct btrfs_stripe) * (num_stripes - 1); 110 sizeof(struct btrfs_stripe) * (num_stripes - 1);
347} 111}
348 112
349#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
350#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
351
352/* 113/*
353 * File system states 114 * File system states
354 */ 115 */
@@ -357,13 +118,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
357#define BTRFS_FS_STATE_TRANS_ABORTED 2 118#define BTRFS_FS_STATE_TRANS_ABORTED 2
358#define BTRFS_FS_STATE_DEV_REPLACING 3 119#define BTRFS_FS_STATE_DEV_REPLACING 3
359 120
360/* Super block flags */
361/* Errors detected */
362#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
363
364#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
365#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
366
367#define BTRFS_BACKREF_REV_MAX 256 121#define BTRFS_BACKREF_REV_MAX 256
368#define BTRFS_BACKREF_REV_SHIFT 56 122#define BTRFS_BACKREF_REV_SHIFT 56
369#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ 123#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \
@@ -410,7 +164,6 @@ struct btrfs_header {
410 * room to translate 14 chunks with 3 stripes each. 164 * room to translate 14 chunks with 3 stripes each.
411 */ 165 */
412#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 166#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
413#define BTRFS_LABEL_SIZE 256
414 167
415/* 168/*
416 * just in case we somehow lose the roots and are not able to mount, 169 * just in case we somehow lose the roots and are not able to mount,
@@ -507,31 +260,6 @@ struct btrfs_super_block {
507 * Compat flags that we support. If any incompat flags are set other than the 260 * Compat flags that we support. If any incompat flags are set other than the
508 * ones specified below then we will fail to mount 261 * ones specified below then we will fail to mount
509 */ 262 */
510#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0)
511
512#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
513#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
514#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
515#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
516/*
517 * some patches floated around with a second compression method
518 * lets save that incompat here for when they do get in
519 * Note we don't actually support it, we're just reserving the
520 * number
521 */
522#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2 (1ULL << 4)
523
524/*
525 * older kernels tried to do bigger metadata blocks, but the
526 * code was pretty buggy. Lets not let them try anymore.
527 */
528#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA (1ULL << 5)
529
530#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF (1ULL << 6)
531#define BTRFS_FEATURE_INCOMPAT_RAID56 (1ULL << 7)
532#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA (1ULL << 8)
533#define BTRFS_FEATURE_INCOMPAT_NO_HOLES (1ULL << 9)
534
535#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 263#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
536#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL 264#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
537#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL 265#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
@@ -624,357 +352,8 @@ struct btrfs_path {
624 unsigned int need_commit_sem:1; 352 unsigned int need_commit_sem:1;
625 unsigned int skip_release_on_error:1; 353 unsigned int skip_release_on_error:1;
626}; 354};
627
628/*
629 * items in the extent btree are used to record the objectid of the
630 * owner of the block and the number of references
631 */
632
633struct btrfs_extent_item {
634 __le64 refs;
635 __le64 generation;
636 __le64 flags;
637} __attribute__ ((__packed__));
638
639struct btrfs_extent_item_v0 {
640 __le32 refs;
641} __attribute__ ((__packed__));
642
643#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \ 355#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
644 sizeof(struct btrfs_item)) 356 sizeof(struct btrfs_item))
645
646#define BTRFS_EXTENT_FLAG_DATA (1ULL << 0)
647#define BTRFS_EXTENT_FLAG_TREE_BLOCK (1ULL << 1)
648
649/* following flags only apply to tree blocks */
650
651/* use full backrefs for extent pointers in the block */
652#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
653
654/*
655 * this flag is only used internally by scrub and may be changed at any time
656 * it is only declared here to avoid collisions
657 */
658#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48)
659
660struct btrfs_tree_block_info {
661 struct btrfs_disk_key key;
662 u8 level;
663} __attribute__ ((__packed__));
664
665struct btrfs_extent_data_ref {
666 __le64 root;
667 __le64 objectid;
668 __le64 offset;
669 __le32 count;
670} __attribute__ ((__packed__));
671
672struct btrfs_shared_data_ref {
673 __le32 count;
674} __attribute__ ((__packed__));
675
676struct btrfs_extent_inline_ref {
677 u8 type;
678 __le64 offset;
679} __attribute__ ((__packed__));
680
681/* old style backrefs item */
682struct btrfs_extent_ref_v0 {
683 __le64 root;
684 __le64 generation;
685 __le64 objectid;
686 __le32 count;
687} __attribute__ ((__packed__));
688
689
690/* dev extents record free space on individual devices. The owner
691 * field points back to the chunk allocation mapping tree that allocated
692 * the extent. The chunk tree uuid field is a way to double check the owner
693 */
694struct btrfs_dev_extent {
695 __le64 chunk_tree;
696 __le64 chunk_objectid;
697 __le64 chunk_offset;
698 __le64 length;
699 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
700} __attribute__ ((__packed__));
701
702struct btrfs_inode_ref {
703 __le64 index;
704 __le16 name_len;
705 /* name goes here */
706} __attribute__ ((__packed__));
707
708struct btrfs_inode_extref {
709 __le64 parent_objectid;
710 __le64 index;
711 __le16 name_len;
712 __u8 name[0];
713 /* name goes here */
714} __attribute__ ((__packed__));
715
716struct btrfs_timespec {
717 __le64 sec;
718 __le32 nsec;
719} __attribute__ ((__packed__));
720
721struct btrfs_inode_item {
722 /* nfs style generation number */
723 __le64 generation;
724 /* transid that last touched this inode */
725 __le64 transid;
726 __le64 size;
727 __le64 nbytes;
728 __le64 block_group;
729 __le32 nlink;
730 __le32 uid;
731 __le32 gid;
732 __le32 mode;
733 __le64 rdev;
734 __le64 flags;
735
736 /* modification sequence number for NFS */
737 __le64 sequence;
738
739 /*
740 * a little future expansion, for more than this we can
741 * just grow the inode item and version it
742 */
743 __le64 reserved[4];
744 struct btrfs_timespec atime;
745 struct btrfs_timespec ctime;
746 struct btrfs_timespec mtime;
747 struct btrfs_timespec otime;
748} __attribute__ ((__packed__));
749
750struct btrfs_dir_log_item {
751 __le64 end;
752} __attribute__ ((__packed__));
753
754struct btrfs_dir_item {
755 struct btrfs_disk_key location;
756 __le64 transid;
757 __le16 data_len;
758 __le16 name_len;
759 u8 type;
760} __attribute__ ((__packed__));
761
762#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
763
764/*
765 * Internal in-memory flag that a subvolume has been marked for deletion but
766 * still visible as a directory
767 */
768#define BTRFS_ROOT_SUBVOL_DEAD (1ULL << 48)
769
770struct btrfs_root_item {
771 struct btrfs_inode_item inode;
772 __le64 generation;
773 __le64 root_dirid;
774 __le64 bytenr;
775 __le64 byte_limit;
776 __le64 bytes_used;
777 __le64 last_snapshot;
778 __le64 flags;
779 __le32 refs;
780 struct btrfs_disk_key drop_progress;
781 u8 drop_level;
782 u8 level;
783
784 /*
785 * The following fields appear after subvol_uuids+subvol_times
786 * were introduced.
787 */
788
789 /*
790 * This generation number is used to test if the new fields are valid
791 * and up to date while reading the root item. Every time the root item
792 * is written out, the "generation" field is copied into this field. If
793 * anyone ever mounted the fs with an older kernel, we will have
794 * mismatching generation values here and thus must invalidate the
795 * new fields. See btrfs_update_root and btrfs_find_last_root for
796 * details.
797 * the offset of generation_v2 is also used as the start for the memset
798 * when invalidating the fields.
799 */
800 __le64 generation_v2;
801 u8 uuid[BTRFS_UUID_SIZE];
802 u8 parent_uuid[BTRFS_UUID_SIZE];
803 u8 received_uuid[BTRFS_UUID_SIZE];
804 __le64 ctransid; /* updated when an inode changes */
805 __le64 otransid; /* trans when created */
806 __le64 stransid; /* trans when sent. non-zero for received subvol */
807 __le64 rtransid; /* trans when received. non-zero for received subvol */
808 struct btrfs_timespec ctime;
809 struct btrfs_timespec otime;
810 struct btrfs_timespec stime;
811 struct btrfs_timespec rtime;
812 __le64 reserved[8]; /* for future */
813} __attribute__ ((__packed__));
814
815/*
816 * this is used for both forward and backward root refs
817 */
818struct btrfs_root_ref {
819 __le64 dirid;
820 __le64 sequence;
821 __le16 name_len;
822} __attribute__ ((__packed__));
823
824struct btrfs_disk_balance_args {
825 /*
826 * profiles to operate on, single is denoted by
827 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
828 */
829 __le64 profiles;
830
831 /*
832 * usage filter
833 * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N'
834 * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max
835 */
836 union {
837 __le64 usage;
838 struct {
839 __le32 usage_min;
840 __le32 usage_max;
841 };
842 };
843
844 /* devid filter */
845 __le64 devid;
846
847 /* devid subset filter [pstart..pend) */
848 __le64 pstart;
849 __le64 pend;
850
851 /* btrfs virtual address space subset filter [vstart..vend) */
852 __le64 vstart;
853 __le64 vend;
854
855 /*
856 * profile to convert to, single is denoted by
857 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
858 */
859 __le64 target;
860
861 /* BTRFS_BALANCE_ARGS_* */
862 __le64 flags;
863
864 /*
865 * BTRFS_BALANCE_ARGS_LIMIT with value 'limit'
 866 * BTRFS_BALANCE_ARGS_LIMIT_RANGE - the extended version can use minimum
867 * and maximum
868 */
869 union {
870 __le64 limit;
871 struct {
872 __le32 limit_min;
873 __le32 limit_max;
874 };
875 };
876
877 /*
878 * Process chunks that cross stripes_min..stripes_max devices,
879 * BTRFS_BALANCE_ARGS_STRIPES_RANGE
880 */
881 __le32 stripes_min;
882 __le32 stripes_max;
883
884 __le64 unused[6];
885} __attribute__ ((__packed__));
886
887/*
888 * store balance parameters to disk so that balance can be properly
889 * resumed after crash or unmount
890 */
891struct btrfs_balance_item {
892 /* BTRFS_BALANCE_* */
893 __le64 flags;
894
895 struct btrfs_disk_balance_args data;
896 struct btrfs_disk_balance_args meta;
897 struct btrfs_disk_balance_args sys;
898
899 __le64 unused[4];
900} __attribute__ ((__packed__));
901
902#define BTRFS_FILE_EXTENT_INLINE 0
903#define BTRFS_FILE_EXTENT_REG 1
904#define BTRFS_FILE_EXTENT_PREALLOC 2
905
906struct btrfs_file_extent_item {
907 /*
908 * transaction id that created this extent
909 */
910 __le64 generation;
911 /*
912 * max number of bytes to hold this extent in ram
913 * when we split a compressed extent we can't know how big
914 * each of the resulting pieces will be. So, this is
915 * an upper limit on the size of the extent in ram instead of
916 * an exact limit.
917 */
918 __le64 ram_bytes;
919
920 /*
921 * 32 bits for the various ways we might encode the data,
922 * including compression and encryption. If any of these
923 * are set to something a given disk format doesn't understand
924 * it is treated like an incompat flag for reading and writing,
925 * but not for stat.
926 */
927 u8 compression;
928 u8 encryption;
929 __le16 other_encoding; /* spare for later use */
930
931 /* are we inline data or a real extent? */
932 u8 type;
933
934 /*
935 * disk space consumed by the extent, checksum blocks are included
936 * in these numbers
937 *
 938 * At this offset in the structure, the inline extent data starts.
939 */
940 __le64 disk_bytenr;
941 __le64 disk_num_bytes;
942 /*
943 * the logical offset in file blocks (no csums)
944 * this extent record is for. This allows a file extent to point
945 * into the middle of an existing extent on disk, sharing it
946 * between two snapshots (useful if some bytes in the middle of the
 947 * extent have changed)
948 */
949 __le64 offset;
950 /*
951 * the logical number of file blocks (no csums included). This
952 * always reflects the size uncompressed and without encoding.
953 */
954 __le64 num_bytes;
955
956} __attribute__ ((__packed__));
957
958struct btrfs_csum_item {
959 u8 csum;
960} __attribute__ ((__packed__));
961
962struct btrfs_dev_stats_item {
963 /*
964 * grow this item struct at the end for future enhancements and keep
965 * the existing values unchanged
966 */
967 __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
968} __attribute__ ((__packed__));
969
970#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0
971#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID 1
972#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED 0
973#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED 1
974#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED 2
975#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED 3
976#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED 4
977
978struct btrfs_dev_replace { 357struct btrfs_dev_replace {
979 u64 replace_state; /* see #define above */ 358 u64 replace_state; /* see #define above */
980 u64 time_started; /* seconds since 1-Jan-1970 */ 359 u64 time_started; /* seconds since 1-Jan-1970 */
@@ -1005,175 +384,6 @@ struct btrfs_dev_replace {
1005 struct btrfs_scrub_progress scrub_progress; 384 struct btrfs_scrub_progress scrub_progress;
1006}; 385};
1007 386
1008struct btrfs_dev_replace_item {
1009 /*
1010 * grow this item struct at the end for future enhancements and keep
1011 * the existing values unchanged
1012 */
1013 __le64 src_devid;
1014 __le64 cursor_left;
1015 __le64 cursor_right;
1016 __le64 cont_reading_from_srcdev_mode;
1017
1018 __le64 replace_state;
1019 __le64 time_started;
1020 __le64 time_stopped;
1021 __le64 num_write_errors;
1022 __le64 num_uncorrectable_read_errors;
1023} __attribute__ ((__packed__));
1024
1025/* different types of block groups (and chunks) */
1026#define BTRFS_BLOCK_GROUP_DATA (1ULL << 0)
1027#define BTRFS_BLOCK_GROUP_SYSTEM (1ULL << 1)
1028#define BTRFS_BLOCK_GROUP_METADATA (1ULL << 2)
1029#define BTRFS_BLOCK_GROUP_RAID0 (1ULL << 3)
1030#define BTRFS_BLOCK_GROUP_RAID1 (1ULL << 4)
1031#define BTRFS_BLOCK_GROUP_DUP (1ULL << 5)
1032#define BTRFS_BLOCK_GROUP_RAID10 (1ULL << 6)
1033#define BTRFS_BLOCK_GROUP_RAID5 (1ULL << 7)
1034#define BTRFS_BLOCK_GROUP_RAID6 (1ULL << 8)
1035#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
1036 BTRFS_SPACE_INFO_GLOBAL_RSV)
1037
1038enum btrfs_raid_types {
1039 BTRFS_RAID_RAID10,
1040 BTRFS_RAID_RAID1,
1041 BTRFS_RAID_DUP,
1042 BTRFS_RAID_RAID0,
1043 BTRFS_RAID_SINGLE,
1044 BTRFS_RAID_RAID5,
1045 BTRFS_RAID_RAID6,
1046 BTRFS_NR_RAID_TYPES
1047};
1048
1049#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
1050 BTRFS_BLOCK_GROUP_SYSTEM | \
1051 BTRFS_BLOCK_GROUP_METADATA)
1052
1053#define BTRFS_BLOCK_GROUP_PROFILE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \
1054 BTRFS_BLOCK_GROUP_RAID1 | \
1055 BTRFS_BLOCK_GROUP_RAID5 | \
1056 BTRFS_BLOCK_GROUP_RAID6 | \
1057 BTRFS_BLOCK_GROUP_DUP | \
1058 BTRFS_BLOCK_GROUP_RAID10)
1059#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \
1060 BTRFS_BLOCK_GROUP_RAID6)
1061
1062/*
1063 * We need a bit for restriper to be able to tell when chunks of type
1064 * SINGLE are available. This "extended" profile format is used in
1065 * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
1066 * (on-disk). The corresponding on-disk bit in chunk.type is reserved
1067 * to avoid remappings between two formats in future.
1068 */
1069#define BTRFS_AVAIL_ALLOC_BIT_SINGLE (1ULL << 48)
1070
1071/*
1072 * A fake block group type that is used to communicate global block reserve
1073 * size to userspace via the SPACE_INFO ioctl.
1074 */
1075#define BTRFS_SPACE_INFO_GLOBAL_RSV (1ULL << 49)
1076
1077#define BTRFS_EXTENDED_PROFILE_MASK (BTRFS_BLOCK_GROUP_PROFILE_MASK | \
1078 BTRFS_AVAIL_ALLOC_BIT_SINGLE)
1079
1080static inline u64 chunk_to_extended(u64 flags)
1081{
1082 if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
1083 flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
1084
1085 return flags;
1086}
1087static inline u64 extended_to_chunk(u64 flags)
1088{
1089 return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
1090}
1091
1092struct btrfs_block_group_item {
1093 __le64 used;
1094 __le64 chunk_objectid;
1095 __le64 flags;
1096} __attribute__ ((__packed__));
1097
1098struct btrfs_free_space_info {
1099 __le32 extent_count;
1100 __le32 flags;
1101} __attribute__ ((__packed__));
1102
1103#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
1104
1105#define BTRFS_QGROUP_LEVEL_SHIFT 48
1106static inline u64 btrfs_qgroup_level(u64 qgroupid)
1107{
1108 return qgroupid >> BTRFS_QGROUP_LEVEL_SHIFT;
1109}
1110
1111/*
1112 * is subvolume quota turned on?
1113 */
1114#define BTRFS_QGROUP_STATUS_FLAG_ON (1ULL << 0)
1115/*
1116 * RESCAN is set during the initialization phase
1117 */
1118#define BTRFS_QGROUP_STATUS_FLAG_RESCAN (1ULL << 1)
1119/*
1120 * Some qgroup entries are known to be out of date,
1121 * either because the configuration has changed in a way that
1122 * makes a rescan necessary, or because the fs has been mounted
1123 * with a non-qgroup-aware version.
1124 * Turning quota off and on again makes it inconsistent, too.
1125 */
1126#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT (1ULL << 2)
1127
1128#define BTRFS_QGROUP_STATUS_VERSION 1
1129
1130struct btrfs_qgroup_status_item {
1131 __le64 version;
1132 /*
1133 * the generation is updated during every commit. As older
1134 * versions of btrfs are not aware of qgroups, it will be
1135 * possible to detect inconsistencies by checking the
1136 * generation on mount time
1137 */
1138 __le64 generation;
1139
1140 /* flag definitions see above */
1141 __le64 flags;
1142
1143 /*
1144 * only used during scanning to record the progress
1145 * of the scan. It contains a logical address
1146 */
1147 __le64 rescan;
1148} __attribute__ ((__packed__));
1149
1150struct btrfs_qgroup_info_item {
1151 __le64 generation;
1152 __le64 rfer;
1153 __le64 rfer_cmpr;
1154 __le64 excl;
1155 __le64 excl_cmpr;
1156} __attribute__ ((__packed__));
1157
1158/* flags definition for qgroup limits */
1159#define BTRFS_QGROUP_LIMIT_MAX_RFER (1ULL << 0)
1160#define BTRFS_QGROUP_LIMIT_MAX_EXCL (1ULL << 1)
1161#define BTRFS_QGROUP_LIMIT_RSV_RFER (1ULL << 2)
1162#define BTRFS_QGROUP_LIMIT_RSV_EXCL (1ULL << 3)
1163#define BTRFS_QGROUP_LIMIT_RFER_CMPR (1ULL << 4)
1164#define BTRFS_QGROUP_LIMIT_EXCL_CMPR (1ULL << 5)
1165
1166struct btrfs_qgroup_limit_item {
1167 /*
1168 * only updated when any of the other values change
1169 */
1170 __le64 flags;
1171 __le64 max_rfer;
1172 __le64 max_excl;
1173 __le64 rsv_rfer;
1174 __le64 rsv_excl;
1175} __attribute__ ((__packed__));
1176
1177/* For raid type sysfs entries */ 387/* For raid type sysfs entries */
1178struct raid_kobject { 388struct raid_kobject {
1179 int raid_type; 389 int raid_type;
@@ -1408,6 +618,27 @@ struct btrfs_block_group_cache {
1408 618
1409 struct btrfs_io_ctl io_ctl; 619 struct btrfs_io_ctl io_ctl;
1410 620
621 /*
622 * Incremented when doing extent allocations and holding a read lock
623 * on the space_info's groups_sem semaphore.
624 * Decremented when an ordered extent that represents an IO against this
625 * block group's range is created (after it's added to its inode's
626 * root's list of ordered extents) or immediately after the allocation
627 * if it's a metadata extent or fallocate extent (for these cases we
628 * don't create ordered extents).
629 */
630 atomic_t reservations;
631
632 /*
633 * Incremented while holding the spinlock *lock* by a task checking if
634 * it can perform a nocow write (incremented if the value for the *ro*
635 * field is 0). Decremented by such tasks once they create an ordered
636 * extent or before that if some error happens before reaching that step.
637 * This is to prevent races between block group relocation and nocow
638 * writes through direct IO.
639 */
640 atomic_t nocow_writers;
641
1411 /* Lock for free space tree operations. */ 642 /* Lock for free space tree operations. */
1412 struct mutex free_space_lock; 643 struct mutex free_space_lock;
1413 644
@@ -2026,228 +1257,6 @@ struct btrfs_root {
2026 atomic_t qgroup_meta_rsv; 1257 atomic_t qgroup_meta_rsv;
2027}; 1258};
2028 1259
2029struct btrfs_ioctl_defrag_range_args {
2030 /* start of the defrag operation */
2031 __u64 start;
2032
2033 /* number of bytes to defrag, use (u64)-1 to say all */
2034 __u64 len;
2035
2036 /*
2037 * flags for the operation, which can include turning
2038 * on compression for this one defrag
2039 */
2040 __u64 flags;
2041
2042 /*
2043 * any extent bigger than this will be considered
2044 * already defragged. Use 0 to take the kernel default
2045 * Use 1 to say every single extent must be rewritten
2046 */
2047 __u32 extent_thresh;
2048
2049 /*
2050 * which compression method to use if turning on compression
2051 * for this defrag operation. If unspecified, zlib will
2052 * be used
2053 */
2054 __u32 compress_type;
2055
2056 /* spare for later */
2057 __u32 unused[4];
2058};
2059
2060
2061/*
2062 * inode items have the data typically returned from stat and store other
2063 * info about object characteristics. There is one for every file and dir in
2064 * the FS
2065 */
2066#define BTRFS_INODE_ITEM_KEY 1
2067#define BTRFS_INODE_REF_KEY 12
2068#define BTRFS_INODE_EXTREF_KEY 13
2069#define BTRFS_XATTR_ITEM_KEY 24
2070#define BTRFS_ORPHAN_ITEM_KEY 48
2071/* reserve 2-15 close to the inode for later flexibility */
2072
2073/*
2074 * dir items are the name -> inode pointers in a directory. There is one
2075 * for every name in a directory.
2076 */
2077#define BTRFS_DIR_LOG_ITEM_KEY 60
2078#define BTRFS_DIR_LOG_INDEX_KEY 72
2079#define BTRFS_DIR_ITEM_KEY 84
2080#define BTRFS_DIR_INDEX_KEY 96
2081/*
2082 * extent data is for file data
2083 */
2084#define BTRFS_EXTENT_DATA_KEY 108
2085
2086/*
2087 * extent csums are stored in a separate tree and hold csums for
2088 * an entire extent on disk.
2089 */
2090#define BTRFS_EXTENT_CSUM_KEY 128
2091
2092/*
2093 * root items point to tree roots. They are typically in the root
2094 * tree used by the super block to find all the other trees
2095 */
2096#define BTRFS_ROOT_ITEM_KEY 132
2097
2098/*
2099 * root backrefs tie subvols and snapshots to the directory entries that
2100 * reference them
2101 */
2102#define BTRFS_ROOT_BACKREF_KEY 144
2103
2104/*
2105 * root refs make a fast index for listing all of the snapshots and
2106 * subvolumes referenced by a given root. They point directly to the
2107 * directory item in the root that references the subvol
2108 */
2109#define BTRFS_ROOT_REF_KEY 156
2110
2111/*
2112 * extent items are in the extent map tree. These record which blocks
2113 * are used, and how many references there are to each block
2114 */
2115#define BTRFS_EXTENT_ITEM_KEY 168
2116
2117/*
2118 * The same as the BTRFS_EXTENT_ITEM_KEY, except it's metadata we already know
2119 * the length, so we save the level in key->offset instead of the length.
2120 */
2121#define BTRFS_METADATA_ITEM_KEY 169
2122
2123#define BTRFS_TREE_BLOCK_REF_KEY 176
2124
2125#define BTRFS_EXTENT_DATA_REF_KEY 178
2126
2127#define BTRFS_EXTENT_REF_V0_KEY 180
2128
2129#define BTRFS_SHARED_BLOCK_REF_KEY 182
2130
2131#define BTRFS_SHARED_DATA_REF_KEY 184
2132
2133/*
2134 * block groups give us hints into the extent allocation trees. Which
2135 * blocks are free etc etc
2136 */
2137#define BTRFS_BLOCK_GROUP_ITEM_KEY 192
2138
2139/*
2140 * Every block group is represented in the free space tree by a free space info
2141 * item, which stores some accounting information. It is keyed on
2142 * (block_group_start, FREE_SPACE_INFO, block_group_length).
2143 */
2144#define BTRFS_FREE_SPACE_INFO_KEY 198
2145
2146/*
2147 * A free space extent tracks an extent of space that is free in a block group.
2148 * It is keyed on (start, FREE_SPACE_EXTENT, length).
2149 */
2150#define BTRFS_FREE_SPACE_EXTENT_KEY 199
2151
2152/*
2153 * When a block group becomes very fragmented, we convert it to use bitmaps
2154 * instead of extents. A free space bitmap is keyed on
2155 * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
2156 * (length / sectorsize) bits.
2157 */
2158#define BTRFS_FREE_SPACE_BITMAP_KEY 200
2159
2160#define BTRFS_DEV_EXTENT_KEY 204
2161#define BTRFS_DEV_ITEM_KEY 216
2162#define BTRFS_CHUNK_ITEM_KEY 228
2163
2164/*
2165 * Records the overall state of the qgroups.
2166 * There's only one instance of this key present,
2167 * (0, BTRFS_QGROUP_STATUS_KEY, 0)
2168 */
2169#define BTRFS_QGROUP_STATUS_KEY 240
2170/*
2171 * Records the currently used space of the qgroup.
2172 * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
2173 */
2174#define BTRFS_QGROUP_INFO_KEY 242
2175/*
2176 * Contains the user configured limits for the qgroup.
2177 * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
2178 */
2179#define BTRFS_QGROUP_LIMIT_KEY 244
2180/*
2181 * Records the child-parent relationship of qgroups. For
2182 * each relation, 2 keys are present:
2183 * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
2184 * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
2185 */
2186#define BTRFS_QGROUP_RELATION_KEY 246
2187
2188/*
2189 * Obsolete name, see BTRFS_TEMPORARY_ITEM_KEY.
2190 */
2191#define BTRFS_BALANCE_ITEM_KEY 248
2192
2193/*
2194 * The key type for tree items that are stored persistently, but do not need to
2195 * exist for extended period of time. The items can exist in any tree.
2196 *
2197 * [subtype, BTRFS_TEMPORARY_ITEM_KEY, data]
2198 *
2199 * Existing items:
2200 *
2201 * - balance status item
2202 * (BTRFS_BALANCE_OBJECTID, BTRFS_TEMPORARY_ITEM_KEY, 0)
2203 */
2204#define BTRFS_TEMPORARY_ITEM_KEY 248
2205
2206/*
2207 * Obsolete name, see BTRFS_PERSISTENT_ITEM_KEY
2208 */
2209#define BTRFS_DEV_STATS_KEY 249
2210
2211/*
2212 * The key type for tree items that are stored persistently and usually exist
2213 * for a long period, eg. filesystem lifetime. The item kinds can be status
2214 * information, stats or preference values. The item can exist in any tree.
2215 *
2216 * [subtype, BTRFS_PERSISTENT_ITEM_KEY, data]
2217 *
2218 * Existing items:
2219 *
2220 * - device statistics, store IO stats in the device tree, one key for all
2221 * stats
2222 * (BTRFS_DEV_STATS_OBJECTID, BTRFS_DEV_STATS_KEY, 0)
2223 */
2224#define BTRFS_PERSISTENT_ITEM_KEY 249
2225
2226/*
2227 * Persistently stores the device replace state in the device tree.
2228 * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
2229 */
2230#define BTRFS_DEV_REPLACE_KEY 250
2231
2232/*
2233 * Stores items that allow to quickly map UUIDs to something else.
2234 * These items are part of the filesystem UUID tree.
2235 * The key is built like this:
2236 * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits).
2237 */
2238#if BTRFS_UUID_SIZE != 16
2239#error "UUID items require BTRFS_UUID_SIZE == 16!"
2240#endif
2241#define BTRFS_UUID_KEY_SUBVOL 251 /* for UUIDs assigned to subvols */
2242#define BTRFS_UUID_KEY_RECEIVED_SUBVOL 252 /* for UUIDs assigned to
2243 * received subvols */
2244
2245/*
2246 * string items are for debugging. They just store a short string of
2247 * data in the FS
2248 */
2249#define BTRFS_STRING_ITEM_KEY 253
2250
2251/* 1260/*
2252 * Flags for mount options. 1261 * Flags for mount options.
2253 * 1262 *
@@ -3499,6 +2508,12 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
3499 struct btrfs_root *root); 2508 struct btrfs_root *root);
3500int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, 2509int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
3501 struct btrfs_root *root); 2510 struct btrfs_root *root);
2511void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
2512 const u64 start);
2513void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
2514bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
2515void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
2516void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg);
3502void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 2517void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
3503int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2518int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
3504 struct btrfs_root *root, unsigned long count); 2519 struct btrfs_root *root, unsigned long count);
@@ -4122,6 +3137,7 @@ void btrfs_test_inode_set_ops(struct inode *inode);
4122 3137
4123/* ioctl.c */ 3138/* ioctl.c */
4124long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 3139long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
3140long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
4125int btrfs_ioctl_get_supported_features(void __user *arg); 3141int btrfs_ioctl_get_supported_features(void __user *arg);
4126void btrfs_update_iflags(struct inode *inode); 3142void btrfs_update_iflags(struct inode *inode);
4127void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 3143void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
@@ -4326,10 +3342,9 @@ static inline void assfail(char *expr, char *file, int line)
4326#define ASSERT(expr) ((void)0) 3342#define ASSERT(expr) ((void)0)
4327#endif 3343#endif
4328 3344
4329#define btrfs_assert()
4330__printf(5, 6) 3345__printf(5, 6)
4331__cold 3346__cold
4332void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, 3347void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
4333 unsigned int line, int errno, const char *fmt, ...); 3348 unsigned int line, int errno, const char *fmt, ...);
4334 3349
4335const char *btrfs_decode_error(int errno); 3350const char *btrfs_decode_error(int errno);
@@ -4339,6 +3354,46 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
4339 struct btrfs_root *root, const char *function, 3354 struct btrfs_root *root, const char *function,
4340 unsigned int line, int errno); 3355 unsigned int line, int errno);
4341 3356
3357/*
3358 * Call btrfs_abort_transaction as early as possible when an error condition is
3359 * detected, that way the exact line number is reported.
3360 */
3361#define btrfs_abort_transaction(trans, root, errno) \
3362do { \
3363 /* Report first abort since mount */ \
3364 if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
3365 &((root)->fs_info->fs_state))) { \
3366 WARN(1, KERN_DEBUG \
3367 "BTRFS: Transaction aborted (error %d)\n", \
3368 (errno)); \
3369 } \
3370 __btrfs_abort_transaction((trans), (root), __func__, \
3371 __LINE__, (errno)); \
3372} while (0)
3373
3374#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
3375do { \
3376 __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
3377 (errno), fmt, ##args); \
3378} while (0)
3379
3380__printf(5, 6)
3381__cold
3382void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
3383 unsigned int line, int errno, const char *fmt, ...);
3384/*
3385 * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
3386 * will panic(). Otherwise we BUG() here.
3387 */
3388#define btrfs_panic(fs_info, errno, fmt, args...) \
3389do { \
3390 __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
3391 BUG(); \
3392} while (0)
3393
3394
3395/* compatibility and incompatibility defines */
3396
4342#define btrfs_set_fs_incompat(__fs_info, opt) \ 3397#define btrfs_set_fs_incompat(__fs_info, opt) \
4343 __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) 3398 __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
4344 3399
@@ -4455,44 +3510,6 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
4455 return !!(btrfs_super_compat_ro_flags(disk_super) & flag); 3510 return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
4456} 3511}
4457 3512
4458/*
4459 * Call btrfs_abort_transaction as early as possible when an error condition is
4460 * detected, that way the exact line number is reported.
4461 */
4462#define btrfs_abort_transaction(trans, root, errno) \
4463do { \
4464 /* Report first abort since mount */ \
4465 if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
4466 &((root)->fs_info->fs_state))) { \
4467 WARN(1, KERN_DEBUG \
4468 "BTRFS: Transaction aborted (error %d)\n", \
4469 (errno)); \
4470 } \
4471 __btrfs_abort_transaction((trans), (root), __func__, \
4472 __LINE__, (errno)); \
4473} while (0)
4474
4475#define btrfs_std_error(fs_info, errno, fmt, args...) \
4476do { \
4477 __btrfs_std_error((fs_info), __func__, __LINE__, \
4478 (errno), fmt, ##args); \
4479} while (0)
4480
4481__printf(5, 6)
4482__cold
4483void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
4484 unsigned int line, int errno, const char *fmt, ...);
4485
4486/*
4487 * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
4488 * will panic(). Otherwise we BUG() here.
4489 */
4490#define btrfs_panic(fs_info, errno, fmt, args...) \
4491do { \
4492 __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
4493 BUG(); \
4494} while (0)
4495
4496/* acl.c */ 3513/* acl.c */
4497#ifdef CONFIG_BTRFS_FS_POSIX_ACL 3514#ifdef CONFIG_BTRFS_FS_POSIX_ACL
4498struct posix_acl *btrfs_get_acl(struct inode *inode, int type); 3515struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 6cef0062f929..61561c2a3f96 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -134,7 +134,7 @@ again:
134 /* cached in the btrfs inode and can be accessed */ 134 /* cached in the btrfs inode and can be accessed */
135 atomic_add(2, &node->refs); 135 atomic_add(2, &node->refs);
136 136
137 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 137 ret = radix_tree_preload(GFP_NOFS);
138 if (ret) { 138 if (ret) {
139 kmem_cache_free(delayed_node_cache, node); 139 kmem_cache_free(delayed_node_cache, node);
140 return ERR_PTR(ret); 140 return ERR_PTR(ret);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 26bcb487f958..85f12e6e28d2 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -44,9 +44,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
44 struct btrfs_fs_info *fs_info, 44 struct btrfs_fs_info *fs_info,
45 struct btrfs_device *srcdev, 45 struct btrfs_device *srcdev,
46 struct btrfs_device *tgtdev); 46 struct btrfs_device *tgtdev);
47static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
48 char *srcdev_name,
49 struct btrfs_device **device);
50static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); 47static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
51static int btrfs_dev_replace_kthread(void *data); 48static int btrfs_dev_replace_kthread(void *data);
52static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info); 49static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
@@ -305,8 +302,8 @@ void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
305 dev_replace->cursor_left_last_write_of_item; 302 dev_replace->cursor_left_last_write_of_item;
306} 303}
307 304
308int btrfs_dev_replace_start(struct btrfs_root *root, 305int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
309 struct btrfs_ioctl_dev_replace_args *args) 306 u64 srcdevid, char *srcdev_name, int read_src)
310{ 307{
311 struct btrfs_trans_handle *trans; 308 struct btrfs_trans_handle *trans;
312 struct btrfs_fs_info *fs_info = root->fs_info; 309 struct btrfs_fs_info *fs_info = root->fs_info;
@@ -315,29 +312,16 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
315 struct btrfs_device *tgt_device = NULL; 312 struct btrfs_device *tgt_device = NULL;
316 struct btrfs_device *src_device = NULL; 313 struct btrfs_device *src_device = NULL;
317 314
318 switch (args->start.cont_reading_from_srcdev_mode) {
319 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
320 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
321 break;
322 default:
323 return -EINVAL;
324 }
325
326 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
327 args->start.tgtdev_name[0] == '\0')
328 return -EINVAL;
329
330 /* the disk copy procedure reuses the scrub code */ 315 /* the disk copy procedure reuses the scrub code */
331 mutex_lock(&fs_info->volume_mutex); 316 mutex_lock(&fs_info->volume_mutex);
332 ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, 317 ret = btrfs_find_device_by_devspec(root, srcdevid,
333 args->start.srcdev_name, 318 srcdev_name, &src_device);
334 &src_device);
335 if (ret) { 319 if (ret) {
336 mutex_unlock(&fs_info->volume_mutex); 320 mutex_unlock(&fs_info->volume_mutex);
337 return ret; 321 return ret;
338 } 322 }
339 323
340 ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, 324 ret = btrfs_init_dev_replace_tgtdev(root, tgtdev_name,
341 src_device, &tgt_device); 325 src_device, &tgt_device);
342 mutex_unlock(&fs_info->volume_mutex); 326 mutex_unlock(&fs_info->volume_mutex);
343 if (ret) 327 if (ret)
@@ -364,18 +348,17 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
364 break; 348 break;
365 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: 349 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
366 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: 350 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
367 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; 351 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
368 goto leave; 352 goto leave;
369 } 353 }
370 354
371 dev_replace->cont_reading_from_srcdev_mode = 355 dev_replace->cont_reading_from_srcdev_mode = read_src;
372 args->start.cont_reading_from_srcdev_mode;
373 WARN_ON(!src_device); 356 WARN_ON(!src_device);
374 dev_replace->srcdev = src_device; 357 dev_replace->srcdev = src_device;
375 WARN_ON(!tgt_device); 358 WARN_ON(!tgt_device);
376 dev_replace->tgtdev = tgt_device; 359 dev_replace->tgtdev = tgt_device;
377 360
378 btrfs_info_in_rcu(root->fs_info, 361 btrfs_info_in_rcu(fs_info,
379 "dev_replace from %s (devid %llu) to %s started", 362 "dev_replace from %s (devid %llu) to %s started",
380 src_device->missing ? "<missing disk>" : 363 src_device->missing ? "<missing disk>" :
381 rcu_str_deref(src_device->name), 364 rcu_str_deref(src_device->name),
@@ -396,14 +379,13 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
396 dev_replace->item_needs_writeback = 1; 379 dev_replace->item_needs_writeback = 1;
397 atomic64_set(&dev_replace->num_write_errors, 0); 380 atomic64_set(&dev_replace->num_write_errors, 0);
398 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); 381 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
399 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
400 btrfs_dev_replace_unlock(dev_replace, 1); 382 btrfs_dev_replace_unlock(dev_replace, 1);
401 383
402 ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device); 384 ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
403 if (ret) 385 if (ret)
404 btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); 386 btrfs_err(fs_info, "kobj add dev failed %d\n", ret);
405 387
406 btrfs_wait_ordered_roots(root->fs_info, -1); 388 btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
407 389
408 /* force writing the updated state information to disk */ 390 /* force writing the updated state information to disk */
409 trans = btrfs_start_transaction(root, 0); 391 trans = btrfs_start_transaction(root, 0);
@@ -421,11 +403,9 @@ int btrfs_dev_replace_start(struct btrfs_root *root,
421 btrfs_device_get_total_bytes(src_device), 403 btrfs_device_get_total_bytes(src_device),
422 &dev_replace->scrub_progress, 0, 1); 404 &dev_replace->scrub_progress, 0, 1);
423 405
424 ret = btrfs_dev_replace_finishing(root->fs_info, ret); 406 ret = btrfs_dev_replace_finishing(fs_info, ret);
425 /* don't warn if EINPROGRESS, someone else might be running scrub */
426 if (ret == -EINPROGRESS) { 407 if (ret == -EINPROGRESS) {
427 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; 408 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
428 ret = 0;
429 } else { 409 } else {
430 WARN_ON(ret); 410 WARN_ON(ret);
431 } 411 }
@@ -440,6 +420,35 @@ leave:
440 return ret; 420 return ret;
441} 421}
442 422
423int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
424 struct btrfs_ioctl_dev_replace_args *args)
425{
426 int ret;
427
428 switch (args->start.cont_reading_from_srcdev_mode) {
429 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
430 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
431 break;
432 default:
433 return -EINVAL;
434 }
435
436 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
437 args->start.tgtdev_name[0] == '\0')
438 return -EINVAL;
439
440 ret = btrfs_dev_replace_start(root, args->start.tgtdev_name,
441 args->start.srcdevid,
442 args->start.srcdev_name,
443 args->start.cont_reading_from_srcdev_mode);
444 args->result = ret;
445 /* don't warn if EINPROGRESS, someone else might be running scrub */
446 if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
447 ret = 0;
448
449 return ret;
450}
451
443/* 452/*
444 * blocked until all flighting bios are finished. 453 * blocked until all flighting bios are finished.
445 */ 454 */
@@ -495,7 +504,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
495 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); 504 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
496 return ret; 505 return ret;
497 } 506 }
498 btrfs_wait_ordered_roots(root->fs_info, -1); 507 btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
499 508
500 trans = btrfs_start_transaction(root, 0); 509 trans = btrfs_start_transaction(root, 0);
501 if (IS_ERR(trans)) { 510 if (IS_ERR(trans)) {
@@ -560,10 +569,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
560 ASSERT(list_empty(&src_device->resized_list)); 569 ASSERT(list_empty(&src_device->resized_list));
561 tgt_device->commit_total_bytes = src_device->commit_total_bytes; 570 tgt_device->commit_total_bytes = src_device->commit_total_bytes;
562 tgt_device->commit_bytes_used = src_device->bytes_used; 571 tgt_device->commit_bytes_used = src_device->bytes_used;
563 if (fs_info->sb->s_bdev == src_device->bdev) 572
564 fs_info->sb->s_bdev = tgt_device->bdev; 573 btrfs_assign_next_active_device(fs_info, src_device, tgt_device);
565 if (fs_info->fs_devices->latest_bdev == src_device->bdev) 574
566 fs_info->fs_devices->latest_bdev = tgt_device->bdev;
567 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); 575 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
568 fs_info->fs_devices->rw_devices++; 576 fs_info->fs_devices->rw_devices++;
569 577
@@ -626,25 +634,6 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
626 write_unlock(&em_tree->lock); 634 write_unlock(&em_tree->lock);
627} 635}
628 636
629static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
630 char *srcdev_name,
631 struct btrfs_device **device)
632{
633 int ret;
634
635 if (srcdevid) {
636 ret = 0;
637 *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
638 NULL);
639 if (!*device)
640 ret = -ENOENT;
641 } else {
642 ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
643 device);
644 }
645 return ret;
646}
647
648void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, 637void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
649 struct btrfs_ioctl_dev_replace_args *args) 638 struct btrfs_ioctl_dev_replace_args *args)
650{ 639{
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index 29e3ef5f96bd..e922b42d91df 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -25,8 +25,10 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
25int btrfs_run_dev_replace(struct btrfs_trans_handle *trans, 25int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
26 struct btrfs_fs_info *fs_info); 26 struct btrfs_fs_info *fs_info);
27void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info); 27void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
28int btrfs_dev_replace_start(struct btrfs_root *root, 28int btrfs_dev_replace_by_ioctl(struct btrfs_root *root,
29 struct btrfs_ioctl_dev_replace_args *args); 29 struct btrfs_ioctl_dev_replace_args *args);
30int btrfs_dev_replace_start(struct btrfs_root *root, char *tgtdev_name,
31 u64 srcdevid, char *srcdev_name, int read_src);
30void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, 32void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
31 struct btrfs_ioctl_dev_replace_args *args); 33 struct btrfs_ioctl_dev_replace_args *args);
32int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info, 34int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4e47849d7427..91d123938cef 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1640,7 +1640,7 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
1640{ 1640{
1641 int ret; 1641 int ret;
1642 1642
1643 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1643 ret = radix_tree_preload(GFP_NOFS);
1644 if (ret) 1644 if (ret)
1645 return ret; 1645 return ret;
1646 1646
@@ -2417,7 +2417,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
2417 /* returns with log_tree_root freed on success */ 2417 /* returns with log_tree_root freed on success */
2418 ret = btrfs_recover_log_trees(log_tree_root); 2418 ret = btrfs_recover_log_trees(log_tree_root);
2419 if (ret) { 2419 if (ret) {
2420 btrfs_std_error(tree_root->fs_info, ret, 2420 btrfs_handle_fs_error(tree_root->fs_info, ret,
2421 "Failed to recover log tree"); 2421 "Failed to recover log tree");
2422 free_extent_buffer(log_tree_root->node); 2422 free_extent_buffer(log_tree_root->node);
2423 kfree(log_tree_root); 2423 kfree(log_tree_root);
@@ -2517,6 +2517,7 @@ int open_ctree(struct super_block *sb,
2517 int num_backups_tried = 0; 2517 int num_backups_tried = 0;
2518 int backup_index = 0; 2518 int backup_index = 0;
2519 int max_active; 2519 int max_active;
2520 bool cleaner_mutex_locked = false;
2520 2521
2521 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL); 2522 tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
2522 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL); 2523 chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info, GFP_KERNEL);
@@ -2713,7 +2714,7 @@ int open_ctree(struct super_block *sb,
2713 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k). 2714 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
2714 */ 2715 */
2715 if (btrfs_check_super_csum(bh->b_data)) { 2716 if (btrfs_check_super_csum(bh->b_data)) {
2716 printk(KERN_ERR "BTRFS: superblock checksum mismatch\n"); 2717 btrfs_err(fs_info, "superblock checksum mismatch");
2717 err = -EINVAL; 2718 err = -EINVAL;
2718 brelse(bh); 2719 brelse(bh);
2719 goto fail_alloc; 2720 goto fail_alloc;
@@ -2733,7 +2734,7 @@ int open_ctree(struct super_block *sb,
2733 2734
2734 ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2735 ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
2735 if (ret) { 2736 if (ret) {
2736 printk(KERN_ERR "BTRFS: superblock contains fatal errors\n"); 2737 btrfs_err(fs_info, "superblock contains fatal errors");
2737 err = -EINVAL; 2738 err = -EINVAL;
2738 goto fail_alloc; 2739 goto fail_alloc;
2739 } 2740 }
@@ -2768,9 +2769,9 @@ int open_ctree(struct super_block *sb,
2768 features = btrfs_super_incompat_flags(disk_super) & 2769 features = btrfs_super_incompat_flags(disk_super) &
2769 ~BTRFS_FEATURE_INCOMPAT_SUPP; 2770 ~BTRFS_FEATURE_INCOMPAT_SUPP;
2770 if (features) { 2771 if (features) {
2771 printk(KERN_ERR "BTRFS: couldn't mount because of " 2772 btrfs_err(fs_info,
2772 "unsupported optional features (%Lx).\n", 2773 "cannot mount because of unsupported optional features (%llx)",
2773 features); 2774 features);
2774 err = -EINVAL; 2775 err = -EINVAL;
2775 goto fail_alloc; 2776 goto fail_alloc;
2776 } 2777 }
@@ -2781,7 +2782,7 @@ int open_ctree(struct super_block *sb,
2781 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 2782 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
2782 2783
2783 if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) 2784 if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
2784 printk(KERN_INFO "BTRFS: has skinny extents\n"); 2785 btrfs_info(fs_info, "has skinny extents");
2785 2786
2786 /* 2787 /*
2787 * flag our filesystem as having big metadata blocks if 2788 * flag our filesystem as having big metadata blocks if
@@ -2789,7 +2790,8 @@ int open_ctree(struct super_block *sb,
2789 */ 2790 */
2790 if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { 2791 if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
2791 if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) 2792 if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
2792 printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); 2793 btrfs_info(fs_info,
2794 "flagging fs with big metadata feature");
2793 features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; 2795 features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
2794 } 2796 }
2795 2797
@@ -2805,9 +2807,9 @@ int open_ctree(struct super_block *sb,
2805 */ 2807 */
2806 if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && 2808 if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2807 (sectorsize != nodesize)) { 2809 (sectorsize != nodesize)) {
2808 printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes " 2810 btrfs_err(fs_info,
2809 "are not allowed for mixed block groups on %s\n", 2811"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
2810 sb->s_id); 2812 nodesize, sectorsize);
2811 goto fail_alloc; 2813 goto fail_alloc;
2812 } 2814 }
2813 2815
@@ -2820,8 +2822,8 @@ int open_ctree(struct super_block *sb,
2820 features = btrfs_super_compat_ro_flags(disk_super) & 2822 features = btrfs_super_compat_ro_flags(disk_super) &
2821 ~BTRFS_FEATURE_COMPAT_RO_SUPP; 2823 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
2822 if (!(sb->s_flags & MS_RDONLY) && features) { 2824 if (!(sb->s_flags & MS_RDONLY) && features) {
2823 printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " 2825 btrfs_err(fs_info,
2824 "unsupported option features (%Lx).\n", 2826 "cannot mount read-write because of unsupported optional features (%llx)",
2825 features); 2827 features);
2826 err = -EINVAL; 2828 err = -EINVAL;
2827 goto fail_alloc; 2829 goto fail_alloc;
@@ -2850,8 +2852,7 @@ int open_ctree(struct super_block *sb,
2850 ret = btrfs_read_sys_array(tree_root); 2852 ret = btrfs_read_sys_array(tree_root);
2851 mutex_unlock(&fs_info->chunk_mutex); 2853 mutex_unlock(&fs_info->chunk_mutex);
2852 if (ret) { 2854 if (ret) {
2853 printk(KERN_ERR "BTRFS: failed to read the system " 2855 btrfs_err(fs_info, "failed to read the system array: %d", ret);
2854 "array on %s\n", sb->s_id);
2855 goto fail_sb_buffer; 2856 goto fail_sb_buffer;
2856 } 2857 }
2857 2858
@@ -2865,8 +2866,7 @@ int open_ctree(struct super_block *sb,
2865 generation); 2866 generation);
2866 if (IS_ERR(chunk_root->node) || 2867 if (IS_ERR(chunk_root->node) ||
2867 !extent_buffer_uptodate(chunk_root->node)) { 2868 !extent_buffer_uptodate(chunk_root->node)) {
2868 printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", 2869 btrfs_err(fs_info, "failed to read chunk root");
2869 sb->s_id);
2870 if (!IS_ERR(chunk_root->node)) 2870 if (!IS_ERR(chunk_root->node))
2871 free_extent_buffer(chunk_root->node); 2871 free_extent_buffer(chunk_root->node);
2872 chunk_root->node = NULL; 2872 chunk_root->node = NULL;
@@ -2880,8 +2880,7 @@ int open_ctree(struct super_block *sb,
2880 2880
2881 ret = btrfs_read_chunk_tree(chunk_root); 2881 ret = btrfs_read_chunk_tree(chunk_root);
2882 if (ret) { 2882 if (ret) {
2883 printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n", 2883 btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
2884 sb->s_id);
2885 goto fail_tree_roots; 2884 goto fail_tree_roots;
2886 } 2885 }
2887 2886
@@ -2892,8 +2891,7 @@ int open_ctree(struct super_block *sb,
2892 btrfs_close_extra_devices(fs_devices, 0); 2891 btrfs_close_extra_devices(fs_devices, 0);
2893 2892
2894 if (!fs_devices->latest_bdev) { 2893 if (!fs_devices->latest_bdev) {
2895 printk(KERN_ERR "BTRFS: failed to read devices on %s\n", 2894 btrfs_err(fs_info, "failed to read devices");
2896 sb->s_id);
2897 goto fail_tree_roots; 2895 goto fail_tree_roots;
2898 } 2896 }
2899 2897
@@ -2905,8 +2903,7 @@ retry_root_backup:
2905 generation); 2903 generation);
2906 if (IS_ERR(tree_root->node) || 2904 if (IS_ERR(tree_root->node) ||
2907 !extent_buffer_uptodate(tree_root->node)) { 2905 !extent_buffer_uptodate(tree_root->node)) {
2908 printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", 2906 btrfs_warn(fs_info, "failed to read tree root");
2909 sb->s_id);
2910 if (!IS_ERR(tree_root->node)) 2907 if (!IS_ERR(tree_root->node))
2911 free_extent_buffer(tree_root->node); 2908 free_extent_buffer(tree_root->node);
2912 tree_root->node = NULL; 2909 tree_root->node = NULL;
@@ -2938,20 +2935,19 @@ retry_root_backup:
2938 2935
2939 ret = btrfs_recover_balance(fs_info); 2936 ret = btrfs_recover_balance(fs_info);
2940 if (ret) { 2937 if (ret) {
2941 printk(KERN_ERR "BTRFS: failed to recover balance\n"); 2938 btrfs_err(fs_info, "failed to recover balance: %d", ret);
2942 goto fail_block_groups; 2939 goto fail_block_groups;
2943 } 2940 }
2944 2941
2945 ret = btrfs_init_dev_stats(fs_info); 2942 ret = btrfs_init_dev_stats(fs_info);
2946 if (ret) { 2943 if (ret) {
2947 printk(KERN_ERR "BTRFS: failed to init dev_stats: %d\n", 2944 btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
2948 ret);
2949 goto fail_block_groups; 2945 goto fail_block_groups;
2950 } 2946 }
2951 2947
2952 ret = btrfs_init_dev_replace(fs_info); 2948 ret = btrfs_init_dev_replace(fs_info);
2953 if (ret) { 2949 if (ret) {
2954 pr_err("BTRFS: failed to init dev_replace: %d\n", ret); 2950 btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
2955 goto fail_block_groups; 2951 goto fail_block_groups;
2956 } 2952 }
2957 2953
@@ -2959,31 +2955,33 @@ retry_root_backup:
2959 2955
2960 ret = btrfs_sysfs_add_fsid(fs_devices, NULL); 2956 ret = btrfs_sysfs_add_fsid(fs_devices, NULL);
2961 if (ret) { 2957 if (ret) {
2962 pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); 2958 btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
2959 ret);
2963 goto fail_block_groups; 2960 goto fail_block_groups;
2964 } 2961 }
2965 2962
2966 ret = btrfs_sysfs_add_device(fs_devices); 2963 ret = btrfs_sysfs_add_device(fs_devices);
2967 if (ret) { 2964 if (ret) {
2968 pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); 2965 btrfs_err(fs_info, "failed to init sysfs device interface: %d",
2966 ret);
2969 goto fail_fsdev_sysfs; 2967 goto fail_fsdev_sysfs;
2970 } 2968 }
2971 2969
2972 ret = btrfs_sysfs_add_mounted(fs_info); 2970 ret = btrfs_sysfs_add_mounted(fs_info);
2973 if (ret) { 2971 if (ret) {
2974 pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); 2972 btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
2975 goto fail_fsdev_sysfs; 2973 goto fail_fsdev_sysfs;
2976 } 2974 }
2977 2975
2978 ret = btrfs_init_space_info(fs_info); 2976 ret = btrfs_init_space_info(fs_info);
2979 if (ret) { 2977 if (ret) {
2980 printk(KERN_ERR "BTRFS: Failed to initial space info: %d\n", ret); 2978 btrfs_err(fs_info, "failed to initialize space info: %d", ret);
2981 goto fail_sysfs; 2979 goto fail_sysfs;
2982 } 2980 }
2983 2981
2984 ret = btrfs_read_block_groups(fs_info->extent_root); 2982 ret = btrfs_read_block_groups(fs_info->extent_root);
2985 if (ret) { 2983 if (ret) {
2986 printk(KERN_ERR "BTRFS: Failed to read block groups: %d\n", ret); 2984 btrfs_err(fs_info, "failed to read block groups: %d", ret);
2987 goto fail_sysfs; 2985 goto fail_sysfs;
2988 } 2986 }
2989 fs_info->num_tolerated_disk_barrier_failures = 2987 fs_info->num_tolerated_disk_barrier_failures =
@@ -2991,12 +2989,20 @@ retry_root_backup:
2991 if (fs_info->fs_devices->missing_devices > 2989 if (fs_info->fs_devices->missing_devices >
2992 fs_info->num_tolerated_disk_barrier_failures && 2990 fs_info->num_tolerated_disk_barrier_failures &&
2993 !(sb->s_flags & MS_RDONLY)) { 2991 !(sb->s_flags & MS_RDONLY)) {
2994 pr_warn("BTRFS: missing devices(%llu) exceeds the limit(%d), writeable mount is not allowed\n", 2992 btrfs_warn(fs_info,
2993"missing devices (%llu) exceeds the limit (%d), writeable mount is not allowed",
2995 fs_info->fs_devices->missing_devices, 2994 fs_info->fs_devices->missing_devices,
2996 fs_info->num_tolerated_disk_barrier_failures); 2995 fs_info->num_tolerated_disk_barrier_failures);
2997 goto fail_sysfs; 2996 goto fail_sysfs;
2998 } 2997 }
2999 2998
2999 /*
3000 * Hold the cleaner_mutex thread here so that we don't block
3001 * for a long time on btrfs_recover_relocation. cleaner_kthread
3002 * will wait for us to finish mounting the filesystem.
3003 */
3004 mutex_lock(&fs_info->cleaner_mutex);
3005 cleaner_mutex_locked = true;
3000 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 3006 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
3001 "btrfs-cleaner"); 3007 "btrfs-cleaner");
3002 if (IS_ERR(fs_info->cleaner_kthread)) 3008 if (IS_ERR(fs_info->cleaner_kthread))
@@ -3011,8 +3017,7 @@ retry_root_backup:
3011 if (!btrfs_test_opt(tree_root, SSD) && 3017 if (!btrfs_test_opt(tree_root, SSD) &&
3012 !btrfs_test_opt(tree_root, NOSSD) && 3018 !btrfs_test_opt(tree_root, NOSSD) &&
3013 !fs_info->fs_devices->rotating) { 3019 !fs_info->fs_devices->rotating) {
3014 printk(KERN_INFO "BTRFS: detected SSD devices, enabling SSD " 3020 btrfs_info(fs_info, "detected SSD devices, enabling SSD mode");
3015 "mode\n");
3016 btrfs_set_opt(fs_info->mount_opt, SSD); 3021 btrfs_set_opt(fs_info->mount_opt, SSD);
3017 } 3022 }
3018 3023
@@ -3030,8 +3035,9 @@ retry_root_backup:
3030 1 : 0, 3035 1 : 0,
3031 fs_info->check_integrity_print_mask); 3036 fs_info->check_integrity_print_mask);
3032 if (ret) 3037 if (ret)
3033 printk(KERN_WARNING "BTRFS: failed to initialize" 3038 btrfs_warn(fs_info,
3034 " integrity check module %s\n", sb->s_id); 3039 "failed to initialize integrity check module: %d",
3040 ret);
3035 } 3041 }
3036#endif 3042#endif
3037 ret = btrfs_read_qgroup_config(fs_info); 3043 ret = btrfs_read_qgroup_config(fs_info);
@@ -3056,17 +3062,17 @@ retry_root_backup:
3056 ret = btrfs_cleanup_fs_roots(fs_info); 3062 ret = btrfs_cleanup_fs_roots(fs_info);
3057 if (ret) 3063 if (ret)
3058 goto fail_qgroup; 3064 goto fail_qgroup;
3059 3065 /* We locked cleaner_mutex before creating cleaner_kthread. */
3060 mutex_lock(&fs_info->cleaner_mutex);
3061 ret = btrfs_recover_relocation(tree_root); 3066 ret = btrfs_recover_relocation(tree_root);
3062 mutex_unlock(&fs_info->cleaner_mutex);
3063 if (ret < 0) { 3067 if (ret < 0) {
3064 printk(KERN_WARNING 3068 btrfs_warn(fs_info, "failed to recover relocation: %d",
3065 "BTRFS: failed to recover relocation\n"); 3069 ret);
3066 err = -EINVAL; 3070 err = -EINVAL;
3067 goto fail_qgroup; 3071 goto fail_qgroup;
3068 } 3072 }
3069 } 3073 }
3074 mutex_unlock(&fs_info->cleaner_mutex);
3075 cleaner_mutex_locked = false;
3070 3076
3071 location.objectid = BTRFS_FS_TREE_OBJECTID; 3077 location.objectid = BTRFS_FS_TREE_OBJECTID;
3072 location.type = BTRFS_ROOT_ITEM_KEY; 3078 location.type = BTRFS_ROOT_ITEM_KEY;
@@ -3083,11 +3089,11 @@ retry_root_backup:
3083 3089
3084 if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && 3090 if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
3085 !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { 3091 !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3086 pr_info("BTRFS: creating free space tree\n"); 3092 btrfs_info(fs_info, "creating free space tree");
3087 ret = btrfs_create_free_space_tree(fs_info); 3093 ret = btrfs_create_free_space_tree(fs_info);
3088 if (ret) { 3094 if (ret) {
3089 pr_warn("BTRFS: failed to create free space tree %d\n", 3095 btrfs_warn(fs_info,
3090 ret); 3096 "failed to create free space tree: %d", ret);
3091 close_ctree(tree_root); 3097 close_ctree(tree_root);
3092 return ret; 3098 return ret;
3093 } 3099 }
@@ -3104,14 +3110,14 @@ retry_root_backup:
3104 3110
3105 ret = btrfs_resume_balance_async(fs_info); 3111 ret = btrfs_resume_balance_async(fs_info);
3106 if (ret) { 3112 if (ret) {
3107 printk(KERN_WARNING "BTRFS: failed to resume balance\n"); 3113 btrfs_warn(fs_info, "failed to resume balance: %d", ret);
3108 close_ctree(tree_root); 3114 close_ctree(tree_root);
3109 return ret; 3115 return ret;
3110 } 3116 }
3111 3117
3112 ret = btrfs_resume_dev_replace_async(fs_info); 3118 ret = btrfs_resume_dev_replace_async(fs_info);
3113 if (ret) { 3119 if (ret) {
3114 pr_warn("BTRFS: failed to resume dev_replace\n"); 3120 btrfs_warn(fs_info, "failed to resume device replace: %d", ret);
3115 close_ctree(tree_root); 3121 close_ctree(tree_root);
3116 return ret; 3122 return ret;
3117 } 3123 }
@@ -3120,33 +3126,33 @@ retry_root_backup:
3120 3126
3121 if (btrfs_test_opt(tree_root, CLEAR_CACHE) && 3127 if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
3122 btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { 3128 btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
3123 pr_info("BTRFS: clearing free space tree\n"); 3129 btrfs_info(fs_info, "clearing free space tree");
3124 ret = btrfs_clear_free_space_tree(fs_info); 3130 ret = btrfs_clear_free_space_tree(fs_info);
3125 if (ret) { 3131 if (ret) {
3126 pr_warn("BTRFS: failed to clear free space tree %d\n", 3132 btrfs_warn(fs_info,
3127 ret); 3133 "failed to clear free space tree: %d", ret);
3128 close_ctree(tree_root); 3134 close_ctree(tree_root);
3129 return ret; 3135 return ret;
3130 } 3136 }
3131 } 3137 }
3132 3138
3133 if (!fs_info->uuid_root) { 3139 if (!fs_info->uuid_root) {
3134 pr_info("BTRFS: creating UUID tree\n"); 3140 btrfs_info(fs_info, "creating UUID tree");
3135 ret = btrfs_create_uuid_tree(fs_info); 3141 ret = btrfs_create_uuid_tree(fs_info);
3136 if (ret) { 3142 if (ret) {
3137 pr_warn("BTRFS: failed to create the UUID tree %d\n", 3143 btrfs_warn(fs_info,
3138 ret); 3144 "failed to create the UUID tree: %d", ret);
3139 close_ctree(tree_root); 3145 close_ctree(tree_root);
3140 return ret; 3146 return ret;
3141 } 3147 }
3142 } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) || 3148 } else if (btrfs_test_opt(tree_root, RESCAN_UUID_TREE) ||
3143 fs_info->generation != 3149 fs_info->generation !=
3144 btrfs_super_uuid_tree_generation(disk_super)) { 3150 btrfs_super_uuid_tree_generation(disk_super)) {
3145 pr_info("BTRFS: checking UUID tree\n"); 3151 btrfs_info(fs_info, "checking UUID tree");
3146 ret = btrfs_check_uuid_tree(fs_info); 3152 ret = btrfs_check_uuid_tree(fs_info);
3147 if (ret) { 3153 if (ret) {
3148 pr_warn("BTRFS: failed to check the UUID tree %d\n", 3154 btrfs_warn(fs_info,
3149 ret); 3155 "failed to check the UUID tree: %d", ret);
3150 close_ctree(tree_root); 3156 close_ctree(tree_root);
3151 return ret; 3157 return ret;
3152 } 3158 }
@@ -3180,6 +3186,10 @@ fail_cleaner:
3180 filemap_write_and_wait(fs_info->btree_inode->i_mapping); 3186 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
3181 3187
3182fail_sysfs: 3188fail_sysfs:
3189 if (cleaner_mutex_locked) {
3190 mutex_unlock(&fs_info->cleaner_mutex);
3191 cleaner_mutex_locked = false;
3192 }
3183 btrfs_sysfs_remove_mounted(fs_info); 3193 btrfs_sysfs_remove_mounted(fs_info);
3184 3194
3185fail_fsdev_sysfs: 3195fail_fsdev_sysfs:
@@ -3646,7 +3656,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
3646 if (ret) { 3656 if (ret) {
3647 mutex_unlock( 3657 mutex_unlock(
3648 &root->fs_info->fs_devices->device_list_mutex); 3658 &root->fs_info->fs_devices->device_list_mutex);
3649 btrfs_std_error(root->fs_info, ret, 3659 btrfs_handle_fs_error(root->fs_info, ret,
3650 "errors while submitting device barriers."); 3660 "errors while submitting device barriers.");
3651 return ret; 3661 return ret;
3652 } 3662 }
@@ -3686,7 +3696,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
3686 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3696 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3687 3697
3688 /* FUA is masked off if unsupported and can't be the reason */ 3698 /* FUA is masked off if unsupported and can't be the reason */
3689 btrfs_std_error(root->fs_info, -EIO, 3699 btrfs_handle_fs_error(root->fs_info, -EIO,
3690 "%d errors while writing supers", total_errors); 3700 "%d errors while writing supers", total_errors);
3691 return -EIO; 3701 return -EIO;
3692 } 3702 }
@@ -3704,7 +3714,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
3704 } 3714 }
3705 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 3715 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3706 if (total_errors > max_errors) { 3716 if (total_errors > max_errors) {
3707 btrfs_std_error(root->fs_info, -EIO, 3717 btrfs_handle_fs_error(root->fs_info, -EIO,
3708 "%d errors while writing supers", total_errors); 3718 "%d errors while writing supers", total_errors);
3709 return -EIO; 3719 return -EIO;
3710 } 3720 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 84e060eb0de8..9424864fd01a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3824,6 +3824,59 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3824 return readonly; 3824 return readonly;
3825} 3825}
3826 3826
3827bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3828{
3829 struct btrfs_block_group_cache *bg;
3830 bool ret = true;
3831
3832 bg = btrfs_lookup_block_group(fs_info, bytenr);
3833 if (!bg)
3834 return false;
3835
3836 spin_lock(&bg->lock);
3837 if (bg->ro)
3838 ret = false;
3839 else
3840 atomic_inc(&bg->nocow_writers);
3841 spin_unlock(&bg->lock);
3842
3843 /* no put on block group, done by btrfs_dec_nocow_writers */
3844 if (!ret)
3845 btrfs_put_block_group(bg);
3846
3847 return ret;
3848
3849}
3850
3851void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
3852{
3853 struct btrfs_block_group_cache *bg;
3854
3855 bg = btrfs_lookup_block_group(fs_info, bytenr);
3856 ASSERT(bg);
3857 if (atomic_dec_and_test(&bg->nocow_writers))
3858 wake_up_atomic_t(&bg->nocow_writers);
3859 /*
3860 * Once for our lookup and once for the lookup done by a previous call
3861 * to btrfs_inc_nocow_writers()
3862 */
3863 btrfs_put_block_group(bg);
3864 btrfs_put_block_group(bg);
3865}
3866
3867static int btrfs_wait_nocow_writers_atomic_t(atomic_t *a)
3868{
3869 schedule();
3870 return 0;
3871}
3872
3873void btrfs_wait_nocow_writers(struct btrfs_block_group_cache *bg)
3874{
3875 wait_on_atomic_t(&bg->nocow_writers,
3876 btrfs_wait_nocow_writers_atomic_t,
3877 TASK_UNINTERRUPTIBLE);
3878}
3879
3827static const char *alloc_name(u64 flags) 3880static const char *alloc_name(u64 flags)
3828{ 3881{
3829 switch (flags) { 3882 switch (flags) {
@@ -4141,7 +4194,7 @@ commit_trans:
4141 4194
4142 if (need_commit > 0) { 4195 if (need_commit > 0) {
4143 btrfs_start_delalloc_roots(fs_info, 0, -1); 4196 btrfs_start_delalloc_roots(fs_info, 0, -1);
4144 btrfs_wait_ordered_roots(fs_info, -1); 4197 btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
4145 } 4198 }
4146 4199
4147 trans = btrfs_join_transaction(root); 4200 trans = btrfs_join_transaction(root);
@@ -4583,7 +4636,8 @@ static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
4583 */ 4636 */
4584 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items); 4637 btrfs_start_delalloc_roots(root->fs_info, 0, nr_items);
4585 if (!current->journal_info) 4638 if (!current->journal_info)
4586 btrfs_wait_ordered_roots(root->fs_info, nr_items); 4639 btrfs_wait_ordered_roots(root->fs_info, nr_items,
4640 0, (u64)-1);
4587 } 4641 }
4588} 4642}
4589 4643
@@ -4620,7 +4674,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4620 4674
4621 /* Calc the number of the pages we need flush for space reservation */ 4675 /* Calc the number of the pages we need flush for space reservation */
4622 items = calc_reclaim_items_nr(root, to_reclaim); 4676 items = calc_reclaim_items_nr(root, to_reclaim);
4623 to_reclaim = items * EXTENT_SIZE_PER_ITEM; 4677 to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
4624 4678
4625 trans = (struct btrfs_trans_handle *)current->journal_info; 4679 trans = (struct btrfs_trans_handle *)current->journal_info;
4626 block_rsv = &root->fs_info->delalloc_block_rsv; 4680 block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -4632,7 +4686,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
4632 if (trans) 4686 if (trans)
4633 return; 4687 return;
4634 if (wait_ordered) 4688 if (wait_ordered)
4635 btrfs_wait_ordered_roots(root->fs_info, items); 4689 btrfs_wait_ordered_roots(root->fs_info, items,
4690 0, (u64)-1);
4636 return; 4691 return;
4637 } 4692 }
4638 4693
@@ -4671,7 +4726,8 @@ skip_async:
4671 4726
4672 loops++; 4727 loops++;
4673 if (wait_ordered && !trans) { 4728 if (wait_ordered && !trans) {
4674 btrfs_wait_ordered_roots(root->fs_info, items); 4729 btrfs_wait_ordered_roots(root->fs_info, items,
4730 0, (u64)-1);
4675 } else { 4731 } else {
4676 time_left = schedule_timeout_killable(1); 4732 time_left = schedule_timeout_killable(1);
4677 if (time_left) 4733 if (time_left)
@@ -6172,6 +6228,57 @@ int btrfs_exclude_logged_extents(struct btrfs_root *log,
6172 return 0; 6228 return 0;
6173} 6229}
6174 6230
6231static void
6232btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
6233{
6234 atomic_inc(&bg->reservations);
6235}
6236
6237void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
6238 const u64 start)
6239{
6240 struct btrfs_block_group_cache *bg;
6241
6242 bg = btrfs_lookup_block_group(fs_info, start);
6243 ASSERT(bg);
6244 if (atomic_dec_and_test(&bg->reservations))
6245 wake_up_atomic_t(&bg->reservations);
6246 btrfs_put_block_group(bg);
6247}
6248
6249static int btrfs_wait_bg_reservations_atomic_t(atomic_t *a)
6250{
6251 schedule();
6252 return 0;
6253}
6254
6255void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg)
6256{
6257 struct btrfs_space_info *space_info = bg->space_info;
6258
6259 ASSERT(bg->ro);
6260
6261 if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
6262 return;
6263
6264 /*
6265 * Our block group is read only but before we set it to read only,
6266 * some task might have had allocated an extent from it already, but it
6267 * has not yet created a respective ordered extent (and added it to a
6268 * root's list of ordered extents).
6269 * Therefore wait for any task currently allocating extents, since the
6270 * block group's reservations counter is incremented while a read lock
6271 * on the groups' semaphore is held and decremented after releasing
6272 * the read access on that semaphore and creating the ordered extent.
6273 */
6274 down_write(&space_info->groups_sem);
6275 up_write(&space_info->groups_sem);
6276
6277 wait_on_atomic_t(&bg->reservations,
6278 btrfs_wait_bg_reservations_atomic_t,
6279 TASK_UNINTERRUPTIBLE);
6280}
6281
6175/** 6282/**
6176 * btrfs_update_reserved_bytes - update the block_group and space info counters 6283 * btrfs_update_reserved_bytes - update the block_group and space info counters
6177 * @cache: The cache we are manipulating 6284 * @cache: The cache we are manipulating
@@ -7025,36 +7132,35 @@ btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
7025 int delalloc) 7132 int delalloc)
7026{ 7133{
7027 struct btrfs_block_group_cache *used_bg = NULL; 7134 struct btrfs_block_group_cache *used_bg = NULL;
7028 bool locked = false; 7135
7029again:
7030 spin_lock(&cluster->refill_lock); 7136 spin_lock(&cluster->refill_lock);
7031 if (locked) { 7137 while (1) {
7032 if (used_bg == cluster->block_group) 7138 used_bg = cluster->block_group;
7139 if (!used_bg)
7140 return NULL;
7141
7142 if (used_bg == block_group)
7033 return used_bg; 7143 return used_bg;
7034 7144
7035 up_read(&used_bg->data_rwsem); 7145 btrfs_get_block_group(used_bg);
7036 btrfs_put_block_group(used_bg);
7037 }
7038 7146
7039 used_bg = cluster->block_group; 7147 if (!delalloc)
7040 if (!used_bg) 7148 return used_bg;
7041 return NULL;
7042 7149
7043 if (used_bg == block_group) 7150 if (down_read_trylock(&used_bg->data_rwsem))
7044 return used_bg; 7151 return used_bg;
7045 7152
7046 btrfs_get_block_group(used_bg); 7153 spin_unlock(&cluster->refill_lock);
7047 7154
7048 if (!delalloc) 7155 down_read(&used_bg->data_rwsem);
7049 return used_bg;
7050 7156
7051 if (down_read_trylock(&used_bg->data_rwsem)) 7157 spin_lock(&cluster->refill_lock);
7052 return used_bg; 7158 if (used_bg == cluster->block_group)
7159 return used_bg;
7053 7160
7054 spin_unlock(&cluster->refill_lock); 7161 up_read(&used_bg->data_rwsem);
7055 down_read(&used_bg->data_rwsem); 7162 btrfs_put_block_group(used_bg);
7056 locked = true; 7163 }
7057 goto again;
7058} 7164}
7059 7165
7060static inline void 7166static inline void
@@ -7431,6 +7537,7 @@ checks:
7431 btrfs_add_free_space(block_group, offset, num_bytes); 7537 btrfs_add_free_space(block_group, offset, num_bytes);
7432 goto loop; 7538 goto loop;
7433 } 7539 }
7540 btrfs_inc_block_group_reservations(block_group);
7434 7541
7435 /* we are all good, lets return */ 7542 /* we are all good, lets return */
7436 ins->objectid = search_start; 7543 ins->objectid = search_start;
@@ -7612,8 +7719,10 @@ again:
7612 WARN_ON(num_bytes < root->sectorsize); 7719 WARN_ON(num_bytes < root->sectorsize);
7613 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins, 7720 ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
7614 flags, delalloc); 7721 flags, delalloc);
7615 7722 if (!ret && !is_data) {
7616 if (ret == -ENOSPC) { 7723 btrfs_dec_block_group_reservations(root->fs_info,
7724 ins->objectid);
7725 } else if (ret == -ENOSPC) {
7617 if (!final_tried && ins->offset) { 7726 if (!final_tried && ins->offset) {
7618 num_bytes = min(num_bytes >> 1, ins->offset); 7727 num_bytes = min(num_bytes >> 1, ins->offset);
7619 num_bytes = round_down(num_bytes, root->sectorsize); 7728 num_bytes = round_down(num_bytes, root->sectorsize);
@@ -9058,7 +9167,7 @@ out:
9058 if (!for_reloc && root_dropped == false) 9167 if (!for_reloc && root_dropped == false)
9059 btrfs_add_dead_root(root); 9168 btrfs_add_dead_root(root);
9060 if (err && err != -EAGAIN) 9169 if (err && err != -EAGAIN)
9061 btrfs_std_error(root->fs_info, err, NULL); 9170 btrfs_handle_fs_error(root->fs_info, err, NULL);
9062 return err; 9171 return err;
9063} 9172}
9064 9173
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d247fc0eea19..2f83448d34fe 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3200,14 +3200,10 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
3200 return ret; 3200 return ret;
3201} 3201}
3202 3202
3203static noinline void update_nr_written(struct page *page, 3203static void update_nr_written(struct page *page, struct writeback_control *wbc,
3204 struct writeback_control *wbc, 3204 unsigned long nr_written)
3205 unsigned long nr_written)
3206{ 3205{
3207 wbc->nr_to_write -= nr_written; 3206 wbc->nr_to_write -= nr_written;
3208 if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
3209 wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
3210 page->mapping->writeback_index = page->index + nr_written;
3211} 3207}
3212 3208
3213/* 3209/*
@@ -3368,6 +3364,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3368 3364
3369 while (cur <= end) { 3365 while (cur <= end) {
3370 u64 em_end; 3366 u64 em_end;
3367 unsigned long max_nr;
3368
3371 if (cur >= i_size) { 3369 if (cur >= i_size) {
3372 if (tree->ops && tree->ops->writepage_end_io_hook) 3370 if (tree->ops && tree->ops->writepage_end_io_hook)
3373 tree->ops->writepage_end_io_hook(page, cur, 3371 tree->ops->writepage_end_io_hook(page, cur,
@@ -3423,32 +3421,23 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
3423 continue; 3421 continue;
3424 } 3422 }
3425 3423
3426 if (tree->ops && tree->ops->writepage_io_hook) { 3424 max_nr = (i_size >> PAGE_SHIFT) + 1;
3427 ret = tree->ops->writepage_io_hook(page, cur, 3425
3428 cur + iosize - 1); 3426 set_range_writeback(tree, cur, cur + iosize - 1);
3429 } else { 3427 if (!PageWriteback(page)) {
3430 ret = 0; 3428 btrfs_err(BTRFS_I(inode)->root->fs_info,
3429 "page %lu not writeback, cur %llu end %llu",
3430 page->index, cur, end);
3431 } 3431 }
3432 if (ret) {
3433 SetPageError(page);
3434 } else {
3435 unsigned long max_nr = (i_size >> PAGE_SHIFT) + 1;
3436 3432
3437 set_range_writeback(tree, cur, cur + iosize - 1); 3433 ret = submit_extent_page(write_flags, tree, wbc, page,
3438 if (!PageWriteback(page)) { 3434 sector, iosize, pg_offset,
3439 btrfs_err(BTRFS_I(inode)->root->fs_info, 3435 bdev, &epd->bio, max_nr,
3440 "page %lu not writeback, cur %llu end %llu", 3436 end_bio_extent_writepage,
3441 page->index, cur, end); 3437 0, 0, 0, false);
3442 } 3438 if (ret)
3439 SetPageError(page);
3443 3440
3444 ret = submit_extent_page(write_flags, tree, wbc, page,
3445 sector, iosize, pg_offset,
3446 bdev, &epd->bio, max_nr,
3447 end_bio_extent_writepage,
3448 0, 0, 0, false);
3449 if (ret)
3450 SetPageError(page);
3451 }
3452 cur = cur + iosize; 3441 cur = cur + iosize;
3453 pg_offset += iosize; 3442 pg_offset += iosize;
3454 nr++; 3443 nr++;
@@ -3920,12 +3909,13 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
3920 struct inode *inode = mapping->host; 3909 struct inode *inode = mapping->host;
3921 int ret = 0; 3910 int ret = 0;
3922 int done = 0; 3911 int done = 0;
3923 int err = 0;
3924 int nr_to_write_done = 0; 3912 int nr_to_write_done = 0;
3925 struct pagevec pvec; 3913 struct pagevec pvec;
3926 int nr_pages; 3914 int nr_pages;
3927 pgoff_t index; 3915 pgoff_t index;
3928 pgoff_t end; /* Inclusive */ 3916 pgoff_t end; /* Inclusive */
3917 pgoff_t done_index;
3918 int range_whole = 0;
3929 int scanned = 0; 3919 int scanned = 0;
3930 int tag; 3920 int tag;
3931 3921
@@ -3948,6 +3938,8 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
3948 } else { 3938 } else {
3949 index = wbc->range_start >> PAGE_SHIFT; 3939 index = wbc->range_start >> PAGE_SHIFT;
3950 end = wbc->range_end >> PAGE_SHIFT; 3940 end = wbc->range_end >> PAGE_SHIFT;
3941 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
3942 range_whole = 1;
3951 scanned = 1; 3943 scanned = 1;
3952 } 3944 }
3953 if (wbc->sync_mode == WB_SYNC_ALL) 3945 if (wbc->sync_mode == WB_SYNC_ALL)
@@ -3957,6 +3949,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
3957retry: 3949retry:
3958 if (wbc->sync_mode == WB_SYNC_ALL) 3950 if (wbc->sync_mode == WB_SYNC_ALL)
3959 tag_pages_for_writeback(mapping, index, end); 3951 tag_pages_for_writeback(mapping, index, end);
3952 done_index = index;
3960 while (!done && !nr_to_write_done && (index <= end) && 3953 while (!done && !nr_to_write_done && (index <= end) &&
3961 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 3954 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3962 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 3955 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
@@ -3966,6 +3959,7 @@ retry:
3966 for (i = 0; i < nr_pages; i++) { 3959 for (i = 0; i < nr_pages; i++) {
3967 struct page *page = pvec.pages[i]; 3960 struct page *page = pvec.pages[i];
3968 3961
3962 done_index = page->index;
3969 /* 3963 /*
3970 * At this point we hold neither mapping->tree_lock nor 3964 * At this point we hold neither mapping->tree_lock nor
3971 * lock on the page itself: the page may be truncated or 3965 * lock on the page itself: the page may be truncated or
@@ -4007,8 +4001,20 @@ retry:
4007 unlock_page(page); 4001 unlock_page(page);
4008 ret = 0; 4002 ret = 0;
4009 } 4003 }
4010 if (!err && ret < 0) 4004 if (ret < 0) {
4011 err = ret; 4005 /*
4006 * done_index is set past this page,
4007 * so media errors will not choke
4008 * background writeout for the entire
4009 * file. This has consequences for
4010 * range_cyclic semantics (ie. it may
4011 * not be suitable for data integrity
4012 * writeout).
4013 */
4014 done_index = page->index + 1;
4015 done = 1;
4016 break;
4017 }
4012 4018
4013 /* 4019 /*
4014 * the filesystem may choose to bump up nr_to_write. 4020 * the filesystem may choose to bump up nr_to_write.
@@ -4020,7 +4026,7 @@ retry:
4020 pagevec_release(&pvec); 4026 pagevec_release(&pvec);
4021 cond_resched(); 4027 cond_resched();
4022 } 4028 }
4023 if (!scanned && !done && !err) { 4029 if (!scanned && !done) {
4024 /* 4030 /*
4025 * We hit the last page and there is more work to be done: wrap 4031 * We hit the last page and there is more work to be done: wrap
4026 * back to the start of the file 4032 * back to the start of the file
@@ -4029,8 +4035,12 @@ retry:
4029 index = 0; 4035 index = 0;
4030 goto retry; 4036 goto retry;
4031 } 4037 }
4038
4039 if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
4040 mapping->writeback_index = done_index;
4041
4032 btrfs_add_delayed_iput(inode); 4042 btrfs_add_delayed_iput(inode);
4033 return err; 4043 return ret;
4034} 4044}
4035 4045
4036static void flush_epd_write_bio(struct extent_page_data *epd) 4046static void flush_epd_write_bio(struct extent_page_data *epd)
@@ -4822,7 +4832,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
4822 return NULL; 4832 return NULL;
4823 eb->fs_info = fs_info; 4833 eb->fs_info = fs_info;
4824again: 4834again:
4825 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 4835 ret = radix_tree_preload(GFP_NOFS);
4826 if (ret) 4836 if (ret)
4827 goto free_eb; 4837 goto free_eb;
4828 spin_lock(&fs_info->buffer_lock); 4838 spin_lock(&fs_info->buffer_lock);
@@ -4923,7 +4933,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4923 if (uptodate) 4933 if (uptodate)
4924 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 4934 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4925again: 4935again:
4926 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 4936 ret = radix_tree_preload(GFP_NOFS);
4927 if (ret) 4937 if (ret)
4928 goto free_eb; 4938 goto free_eb;
4929 4939
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b5e0ade90e88..981f402bf754 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -71,7 +71,6 @@ struct extent_io_ops {
71 u64 start, u64 end, int *page_started, 71 u64 start, u64 end, int *page_started,
72 unsigned long *nr_written); 72 unsigned long *nr_written);
73 int (*writepage_start_hook)(struct page *page, u64 start, u64 end); 73 int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
74 int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
75 extent_submit_bio_hook_t *submit_bio_hook; 74 extent_submit_bio_hook_t *submit_bio_hook;
76 int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, 75 int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
77 size_t size, struct bio *bio, 76 size_t size, struct bio *bio,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index ea9f10bb089c..c98805c35bab 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1696,7 +1696,9 @@ again:
1696 btrfs_end_write_no_snapshoting(root); 1696 btrfs_end_write_no_snapshoting(root);
1697 btrfs_delalloc_release_metadata(inode, release_bytes); 1697 btrfs_delalloc_release_metadata(inode, release_bytes);
1698 } else { 1698 } else {
1699 btrfs_delalloc_release_space(inode, pos, release_bytes); 1699 btrfs_delalloc_release_space(inode,
1700 round_down(pos, root->sectorsize),
1701 release_bytes);
1700 } 1702 }
1701 } 1703 }
1702 1704
@@ -2952,7 +2954,7 @@ const struct file_operations btrfs_file_operations = {
2952 .fallocate = btrfs_fallocate, 2954 .fallocate = btrfs_fallocate,
2953 .unlocked_ioctl = btrfs_ioctl, 2955 .unlocked_ioctl = btrfs_ioctl,
2954#ifdef CONFIG_COMPAT 2956#ifdef CONFIG_COMPAT
2955 .compat_ioctl = btrfs_ioctl, 2957 .compat_ioctl = btrfs_compat_ioctl,
2956#endif 2958#endif
2957 .copy_file_range = btrfs_copy_file_range, 2959 .copy_file_range = btrfs_copy_file_range,
2958 .clone_file_range = btrfs_clone_file_range, 2960 .clone_file_range = btrfs_clone_file_range,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index be4d22a5022f..b8acc07ac6c2 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -157,7 +157,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
157 */ 157 */
158 if (!btrfs_find_name_in_ext_backref(path, ref_objectid, 158 if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
159 name, name_len, &extref)) { 159 name, name_len, &extref)) {
160 btrfs_std_error(root->fs_info, -ENOENT, NULL); 160 btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
161 ret = -EROFS; 161 ret = -EROFS;
162 goto out; 162 goto out;
163 } 163 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6b7fe291a174..91419ef79b00 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -824,6 +824,7 @@ retry:
824 async_extent->ram_size - 1, 0); 824 async_extent->ram_size - 1, 0);
825 goto out_free_reserve; 825 goto out_free_reserve;
826 } 826 }
827 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
827 828
828 /* 829 /*
829 * clear dirty, set writeback and unlock the pages. 830 * clear dirty, set writeback and unlock the pages.
@@ -861,6 +862,7 @@ retry:
861 } 862 }
862 return; 863 return;
863out_free_reserve: 864out_free_reserve:
865 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
864 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 866 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
865out_free: 867out_free:
866 extent_clear_unlock_delalloc(inode, async_extent->start, 868 extent_clear_unlock_delalloc(inode, async_extent->start,
@@ -1038,6 +1040,8 @@ static noinline int cow_file_range(struct inode *inode,
1038 goto out_drop_extent_cache; 1040 goto out_drop_extent_cache;
1039 } 1041 }
1040 1042
1043 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
1044
1041 if (disk_num_bytes < cur_alloc_size) 1045 if (disk_num_bytes < cur_alloc_size)
1042 break; 1046 break;
1043 1047
@@ -1066,6 +1070,7 @@ out:
1066out_drop_extent_cache: 1070out_drop_extent_cache:
1067 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); 1071 btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
1068out_reserve: 1072out_reserve:
1073 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
1069 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 1074 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
1070out_unlock: 1075out_unlock:
1071 extent_clear_unlock_delalloc(inode, start, end, locked_page, 1076 extent_clear_unlock_delalloc(inode, start, end, locked_page,
@@ -1377,6 +1382,9 @@ next_slot:
1377 */ 1382 */
1378 if (csum_exist_in_range(root, disk_bytenr, num_bytes)) 1383 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1379 goto out_check; 1384 goto out_check;
1385 if (!btrfs_inc_nocow_writers(root->fs_info,
1386 disk_bytenr))
1387 goto out_check;
1380 nocow = 1; 1388 nocow = 1;
1381 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { 1389 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1382 extent_end = found_key.offset + 1390 extent_end = found_key.offset +
@@ -1391,6 +1399,9 @@ out_check:
1391 path->slots[0]++; 1399 path->slots[0]++;
1392 if (!nolock && nocow) 1400 if (!nolock && nocow)
1393 btrfs_end_write_no_snapshoting(root); 1401 btrfs_end_write_no_snapshoting(root);
1402 if (nocow)
1403 btrfs_dec_nocow_writers(root->fs_info,
1404 disk_bytenr);
1394 goto next_slot; 1405 goto next_slot;
1395 } 1406 }
1396 if (!nocow) { 1407 if (!nocow) {
@@ -1411,6 +1422,9 @@ out_check:
1411 if (ret) { 1422 if (ret) {
1412 if (!nolock && nocow) 1423 if (!nolock && nocow)
1413 btrfs_end_write_no_snapshoting(root); 1424 btrfs_end_write_no_snapshoting(root);
1425 if (nocow)
1426 btrfs_dec_nocow_writers(root->fs_info,
1427 disk_bytenr);
1414 goto error; 1428 goto error;
1415 } 1429 }
1416 cow_start = (u64)-1; 1430 cow_start = (u64)-1;
@@ -1453,6 +1467,8 @@ out_check:
1453 1467
1454 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, 1468 ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1455 num_bytes, num_bytes, type); 1469 num_bytes, num_bytes, type);
1470 if (nocow)
1471 btrfs_dec_nocow_writers(root->fs_info, disk_bytenr);
1456 BUG_ON(ret); /* -ENOMEM */ 1472 BUG_ON(ret); /* -ENOMEM */
1457 1473
1458 if (root->root_key.objectid == 1474 if (root->root_key.objectid ==
@@ -7129,6 +7145,43 @@ out:
7129 return em; 7145 return em;
7130} 7146}
7131 7147
7148static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
7149 const u64 start,
7150 const u64 len,
7151 const u64 orig_start,
7152 const u64 block_start,
7153 const u64 block_len,
7154 const u64 orig_block_len,
7155 const u64 ram_bytes,
7156 const int type)
7157{
7158 struct extent_map *em = NULL;
7159 int ret;
7160
7161 down_read(&BTRFS_I(inode)->dio_sem);
7162 if (type != BTRFS_ORDERED_NOCOW) {
7163 em = create_pinned_em(inode, start, len, orig_start,
7164 block_start, block_len, orig_block_len,
7165 ram_bytes, type);
7166 if (IS_ERR(em))
7167 goto out;
7168 }
7169 ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
7170 len, block_len, type);
7171 if (ret) {
7172 if (em) {
7173 free_extent_map(em);
7174 btrfs_drop_extent_cache(inode, start,
7175 start + len - 1, 0);
7176 }
7177 em = ERR_PTR(ret);
7178 }
7179 out:
7180 up_read(&BTRFS_I(inode)->dio_sem);
7181
7182 return em;
7183}
7184
7132static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 7185static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7133 u64 start, u64 len) 7186 u64 start, u64 len)
7134{ 7187{
@@ -7144,41 +7197,13 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7144 if (ret) 7197 if (ret)
7145 return ERR_PTR(ret); 7198 return ERR_PTR(ret);
7146 7199
7147 /* 7200 em = btrfs_create_dio_extent(inode, start, ins.offset, start,
7148 * Create the ordered extent before the extent map. This is to avoid 7201 ins.objectid, ins.offset, ins.offset,
7149 * races with the fast fsync path that would lead to it logging file 7202 ins.offset, 0);
7150 * extent items that point to disk extents that were not yet written to. 7203 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
7151 * The fast fsync path collects ordered extents into a local list and 7204 if (IS_ERR(em))
7152 * then collects all the new extent maps, so we must create the ordered
7153 * extent first and make sure the fast fsync path collects any new
7154 * ordered extents after collecting new extent maps as well.
7155 * The fsync path simply can not rely on inode_dio_wait() because it
7156 * causes deadlock with AIO.
7157 */
7158 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
7159 ins.offset, ins.offset, 0);
7160 if (ret) {
7161 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); 7205 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
7162 return ERR_PTR(ret);
7163 }
7164
7165 em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
7166 ins.offset, ins.offset, ins.offset, 0);
7167 if (IS_ERR(em)) {
7168 struct btrfs_ordered_extent *oe;
7169 7206
7170 btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
7171 oe = btrfs_lookup_ordered_extent(inode, start);
7172 ASSERT(oe);
7173 if (WARN_ON(!oe))
7174 return em;
7175 set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
7176 set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
7177 btrfs_remove_ordered_extent(inode, oe);
7178 /* Once for our lookup and once for the ordered extents tree. */
7179 btrfs_put_ordered_extent(oe);
7180 btrfs_put_ordered_extent(oe);
7181 }
7182 return em; 7207 return em;
7183} 7208}
7184 7209
@@ -7650,24 +7675,21 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7650 block_start = em->block_start + (start - em->start); 7675 block_start = em->block_start + (start - em->start);
7651 7676
7652 if (can_nocow_extent(inode, start, &len, &orig_start, 7677 if (can_nocow_extent(inode, start, &len, &orig_start,
7653 &orig_block_len, &ram_bytes) == 1) { 7678 &orig_block_len, &ram_bytes) == 1 &&
7679 btrfs_inc_nocow_writers(root->fs_info, block_start)) {
7680 struct extent_map *em2;
7681
7682 em2 = btrfs_create_dio_extent(inode, start, len,
7683 orig_start, block_start,
7684 len, orig_block_len,
7685 ram_bytes, type);
7686 btrfs_dec_nocow_writers(root->fs_info, block_start);
7654 if (type == BTRFS_ORDERED_PREALLOC) { 7687 if (type == BTRFS_ORDERED_PREALLOC) {
7655 free_extent_map(em); 7688 free_extent_map(em);
7656 em = create_pinned_em(inode, start, len, 7689 em = em2;
7657 orig_start,
7658 block_start, len,
7659 orig_block_len,
7660 ram_bytes, type);
7661 if (IS_ERR(em)) {
7662 ret = PTR_ERR(em);
7663 goto unlock_err;
7664 }
7665 } 7690 }
7666 7691 if (em2 && IS_ERR(em2)) {
7667 ret = btrfs_add_ordered_extent_dio(inode, start, 7692 ret = PTR_ERR(em2);
7668 block_start, len, len, type);
7669 if (ret) {
7670 free_extent_map(em);
7671 goto unlock_err; 7693 goto unlock_err;
7672 } 7694 }
7673 goto unlock; 7695 goto unlock;
@@ -9230,6 +9252,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
9230 INIT_LIST_HEAD(&ei->delalloc_inodes); 9252 INIT_LIST_HEAD(&ei->delalloc_inodes);
9231 INIT_LIST_HEAD(&ei->delayed_iput); 9253 INIT_LIST_HEAD(&ei->delayed_iput);
9232 RB_CLEAR_NODE(&ei->rb_node); 9254 RB_CLEAR_NODE(&ei->rb_node);
9255 init_rwsem(&ei->dio_sem);
9233 9256
9234 return inode; 9257 return inode;
9235} 9258}
@@ -9387,10 +9410,281 @@ static int btrfs_getattr(struct vfsmount *mnt,
9387 return 0; 9410 return 0;
9388} 9411}
9389 9412
9413static int btrfs_rename_exchange(struct inode *old_dir,
9414 struct dentry *old_dentry,
9415 struct inode *new_dir,
9416 struct dentry *new_dentry)
9417{
9418 struct btrfs_trans_handle *trans;
9419 struct btrfs_root *root = BTRFS_I(old_dir)->root;
9420 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9421 struct inode *new_inode = new_dentry->d_inode;
9422 struct inode *old_inode = old_dentry->d_inode;
9423 struct timespec ctime = CURRENT_TIME;
9424 struct dentry *parent;
9425 u64 old_ino = btrfs_ino(old_inode);
9426 u64 new_ino = btrfs_ino(new_inode);
9427 u64 old_idx = 0;
9428 u64 new_idx = 0;
9429 u64 root_objectid;
9430 int ret;
9431 bool root_log_pinned = false;
9432 bool dest_log_pinned = false;
9433
9434 /* we only allow rename subvolume link between subvolumes */
9435 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9436 return -EXDEV;
9437
9438 /* close the race window with snapshot create/destroy ioctl */
9439 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9440 down_read(&root->fs_info->subvol_sem);
9441 if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9442 down_read(&dest->fs_info->subvol_sem);
9443
9444 /*
9445 * We want to reserve the absolute worst case amount of items. So if
9446 * both inodes are subvols and we need to unlink them then that would
9447 * require 4 item modifications, but if they are both normal inodes it
9448 * would require 5 item modifications, so we'll assume their normal
9449 * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items
9450 * should cover the worst case number of items we'll modify.
9451 */
9452 trans = btrfs_start_transaction(root, 12);
9453 if (IS_ERR(trans)) {
9454 ret = PTR_ERR(trans);
9455 goto out_notrans;
9456 }
9457
9458 /*
9459 * We need to find a free sequence number both in the source and
9460 * in the destination directory for the exchange.
9461 */
9462 ret = btrfs_set_inode_index(new_dir, &old_idx);
9463 if (ret)
9464 goto out_fail;
9465 ret = btrfs_set_inode_index(old_dir, &new_idx);
9466 if (ret)
9467 goto out_fail;
9468
9469 BTRFS_I(old_inode)->dir_index = 0ULL;
9470 BTRFS_I(new_inode)->dir_index = 0ULL;
9471
9472 /* Reference for the source. */
9473 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9474 /* force full log commit if subvolume involved. */
9475 btrfs_set_log_full_commit(root->fs_info, trans);
9476 } else {
9477 btrfs_pin_log_trans(root);
9478 root_log_pinned = true;
9479 ret = btrfs_insert_inode_ref(trans, dest,
9480 new_dentry->d_name.name,
9481 new_dentry->d_name.len,
9482 old_ino,
9483 btrfs_ino(new_dir), old_idx);
9484 if (ret)
9485 goto out_fail;
9486 }
9487
9488 /* And now for the dest. */
9489 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9490 /* force full log commit if subvolume involved. */
9491 btrfs_set_log_full_commit(dest->fs_info, trans);
9492 } else {
9493 btrfs_pin_log_trans(dest);
9494 dest_log_pinned = true;
9495 ret = btrfs_insert_inode_ref(trans, root,
9496 old_dentry->d_name.name,
9497 old_dentry->d_name.len,
9498 new_ino,
9499 btrfs_ino(old_dir), new_idx);
9500 if (ret)
9501 goto out_fail;
9502 }
9503
9504 /* Update inode version and ctime/mtime. */
9505 inode_inc_iversion(old_dir);
9506 inode_inc_iversion(new_dir);
9507 inode_inc_iversion(old_inode);
9508 inode_inc_iversion(new_inode);
9509 old_dir->i_ctime = old_dir->i_mtime = ctime;
9510 new_dir->i_ctime = new_dir->i_mtime = ctime;
9511 old_inode->i_ctime = ctime;
9512 new_inode->i_ctime = ctime;
9513
9514 if (old_dentry->d_parent != new_dentry->d_parent) {
9515 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
9516 btrfs_record_unlink_dir(trans, new_dir, new_inode, 1);
9517 }
9518
9519 /* src is a subvolume */
9520 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9521 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9522 ret = btrfs_unlink_subvol(trans, root, old_dir,
9523 root_objectid,
9524 old_dentry->d_name.name,
9525 old_dentry->d_name.len);
9526 } else { /* src is an inode */
9527 ret = __btrfs_unlink_inode(trans, root, old_dir,
9528 old_dentry->d_inode,
9529 old_dentry->d_name.name,
9530 old_dentry->d_name.len);
9531 if (!ret)
9532 ret = btrfs_update_inode(trans, root, old_inode);
9533 }
9534 if (ret) {
9535 btrfs_abort_transaction(trans, root, ret);
9536 goto out_fail;
9537 }
9538
9539 /* dest is a subvolume */
9540 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9541 root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
9542 ret = btrfs_unlink_subvol(trans, dest, new_dir,
9543 root_objectid,
9544 new_dentry->d_name.name,
9545 new_dentry->d_name.len);
9546 } else { /* dest is an inode */
9547 ret = __btrfs_unlink_inode(trans, dest, new_dir,
9548 new_dentry->d_inode,
9549 new_dentry->d_name.name,
9550 new_dentry->d_name.len);
9551 if (!ret)
9552 ret = btrfs_update_inode(trans, dest, new_inode);
9553 }
9554 if (ret) {
9555 btrfs_abort_transaction(trans, root, ret);
9556 goto out_fail;
9557 }
9558
9559 ret = btrfs_add_link(trans, new_dir, old_inode,
9560 new_dentry->d_name.name,
9561 new_dentry->d_name.len, 0, old_idx);
9562 if (ret) {
9563 btrfs_abort_transaction(trans, root, ret);
9564 goto out_fail;
9565 }
9566
9567 ret = btrfs_add_link(trans, old_dir, new_inode,
9568 old_dentry->d_name.name,
9569 old_dentry->d_name.len, 0, new_idx);
9570 if (ret) {
9571 btrfs_abort_transaction(trans, root, ret);
9572 goto out_fail;
9573 }
9574
9575 if (old_inode->i_nlink == 1)
9576 BTRFS_I(old_inode)->dir_index = old_idx;
9577 if (new_inode->i_nlink == 1)
9578 BTRFS_I(new_inode)->dir_index = new_idx;
9579
9580 if (root_log_pinned) {
9581 parent = new_dentry->d_parent;
9582 btrfs_log_new_name(trans, old_inode, old_dir, parent);
9583 btrfs_end_log_trans(root);
9584 root_log_pinned = false;
9585 }
9586 if (dest_log_pinned) {
9587 parent = old_dentry->d_parent;
9588 btrfs_log_new_name(trans, new_inode, new_dir, parent);
9589 btrfs_end_log_trans(dest);
9590 dest_log_pinned = false;
9591 }
9592out_fail:
9593 /*
9594 * If we have pinned a log and an error happened, we unpin tasks
9595 * trying to sync the log and force them to fallback to a transaction
9596 * commit if the log currently contains any of the inodes involved in
9597 * this rename operation (to ensure we do not persist a log with an
9598 * inconsistent state for any of these inodes or leading to any
9599 * inconsistencies when replayed). If the transaction was aborted, the
9600 * abortion reason is propagated to userspace when attempting to commit
9601 * the transaction. If the log does not contain any of these inodes, we
9602 * allow the tasks to sync it.
9603 */
9604 if (ret && (root_log_pinned || dest_log_pinned)) {
9605 if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
9606 btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
9607 btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
9608 (new_inode &&
9609 btrfs_inode_in_log(new_inode, root->fs_info->generation)))
9610 btrfs_set_log_full_commit(root->fs_info, trans);
9611
9612 if (root_log_pinned) {
9613 btrfs_end_log_trans(root);
9614 root_log_pinned = false;
9615 }
9616 if (dest_log_pinned) {
9617 btrfs_end_log_trans(dest);
9618 dest_log_pinned = false;
9619 }
9620 }
9621 ret = btrfs_end_transaction(trans, root);
9622out_notrans:
9623 if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9624 up_read(&dest->fs_info->subvol_sem);
9625 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9626 up_read(&root->fs_info->subvol_sem);
9627
9628 return ret;
9629}
9630
9631static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
9632 struct btrfs_root *root,
9633 struct inode *dir,
9634 struct dentry *dentry)
9635{
9636 int ret;
9637 struct inode *inode;
9638 u64 objectid;
9639 u64 index;
9640
9641 ret = btrfs_find_free_ino(root, &objectid);
9642 if (ret)
9643 return ret;
9644
9645 inode = btrfs_new_inode(trans, root, dir,
9646 dentry->d_name.name,
9647 dentry->d_name.len,
9648 btrfs_ino(dir),
9649 objectid,
9650 S_IFCHR | WHITEOUT_MODE,
9651 &index);
9652
9653 if (IS_ERR(inode)) {
9654 ret = PTR_ERR(inode);
9655 return ret;
9656 }
9657
9658 inode->i_op = &btrfs_special_inode_operations;
9659 init_special_inode(inode, inode->i_mode,
9660 WHITEOUT_DEV);
9661
9662 ret = btrfs_init_inode_security(trans, inode, dir,
9663 &dentry->d_name);
9664 if (ret)
9665 goto out;
9666
9667 ret = btrfs_add_nondir(trans, dir, dentry,
9668 inode, 0, index);
9669 if (ret)
9670 goto out;
9671
9672 ret = btrfs_update_inode(trans, root, inode);
9673out:
9674 unlock_new_inode(inode);
9675 if (ret)
9676 inode_dec_link_count(inode);
9677 iput(inode);
9678
9679 return ret;
9680}
9681
9390static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 9682static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9391 struct inode *new_dir, struct dentry *new_dentry) 9683 struct inode *new_dir, struct dentry *new_dentry,
9684 unsigned int flags)
9392{ 9685{
9393 struct btrfs_trans_handle *trans; 9686 struct btrfs_trans_handle *trans;
9687 unsigned int trans_num_items;
9394 struct btrfs_root *root = BTRFS_I(old_dir)->root; 9688 struct btrfs_root *root = BTRFS_I(old_dir)->root;
9395 struct btrfs_root *dest = BTRFS_I(new_dir)->root; 9689 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9396 struct inode *new_inode = d_inode(new_dentry); 9690 struct inode *new_inode = d_inode(new_dentry);
@@ -9399,6 +9693,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9399 u64 root_objectid; 9693 u64 root_objectid;
9400 int ret; 9694 int ret;
9401 u64 old_ino = btrfs_ino(old_inode); 9695 u64 old_ino = btrfs_ino(old_inode);
9696 bool log_pinned = false;
9402 9697
9403 if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 9698 if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9404 return -EPERM; 9699 return -EPERM;
@@ -9449,15 +9744,21 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9449 * We want to reserve the absolute worst case amount of items. So if 9744 * We want to reserve the absolute worst case amount of items. So if
9450 * both inodes are subvols and we need to unlink them then that would 9745 * both inodes are subvols and we need to unlink them then that would
9451 * require 4 item modifications, but if they are both normal inodes it 9746 * require 4 item modifications, but if they are both normal inodes it
9452 * would require 5 item modifications, so we'll assume their normal 9747 * would require 5 item modifications, so we'll assume they are normal
9453 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items 9748 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
9454 * should cover the worst case number of items we'll modify. 9749 * should cover the worst case number of items we'll modify.
9750 * If our rename has the whiteout flag, we need more 5 units for the
9751 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
9752 * when selinux is enabled).
9455 */ 9753 */
9456 trans = btrfs_start_transaction(root, 11); 9754 trans_num_items = 11;
9755 if (flags & RENAME_WHITEOUT)
9756 trans_num_items += 5;
9757 trans = btrfs_start_transaction(root, trans_num_items);
9457 if (IS_ERR(trans)) { 9758 if (IS_ERR(trans)) {
9458 ret = PTR_ERR(trans); 9759 ret = PTR_ERR(trans);
9459 goto out_notrans; 9760 goto out_notrans;
9460 } 9761 }
9461 9762
9462 if (dest != root) 9763 if (dest != root)
9463 btrfs_record_root_in_trans(trans, dest); 9764 btrfs_record_root_in_trans(trans, dest);
@@ -9471,6 +9772,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9471 /* force full log commit if subvolume involved. */ 9772 /* force full log commit if subvolume involved. */
9472 btrfs_set_log_full_commit(root->fs_info, trans); 9773 btrfs_set_log_full_commit(root->fs_info, trans);
9473 } else { 9774 } else {
9775 btrfs_pin_log_trans(root);
9776 log_pinned = true;
9474 ret = btrfs_insert_inode_ref(trans, dest, 9777 ret = btrfs_insert_inode_ref(trans, dest,
9475 new_dentry->d_name.name, 9778 new_dentry->d_name.name,
9476 new_dentry->d_name.len, 9779 new_dentry->d_name.len,
@@ -9478,14 +9781,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9478 btrfs_ino(new_dir), index); 9781 btrfs_ino(new_dir), index);
9479 if (ret) 9782 if (ret)
9480 goto out_fail; 9783 goto out_fail;
9481 /*
9482 * this is an ugly little race, but the rename is required
9483 * to make sure that if we crash, the inode is either at the
9484 * old name or the new one. pinning the log transaction lets
9485 * us make sure we don't allow a log commit to come in after
9486 * we unlink the name but before we add the new name back in.
9487 */
9488 btrfs_pin_log_trans(root);
9489 } 9784 }
9490 9785
9491 inode_inc_iversion(old_dir); 9786 inode_inc_iversion(old_dir);
@@ -9552,12 +9847,46 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9552 if (old_inode->i_nlink == 1) 9847 if (old_inode->i_nlink == 1)
9553 BTRFS_I(old_inode)->dir_index = index; 9848 BTRFS_I(old_inode)->dir_index = index;
9554 9849
9555 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { 9850 if (log_pinned) {
9556 struct dentry *parent = new_dentry->d_parent; 9851 struct dentry *parent = new_dentry->d_parent;
9852
9557 btrfs_log_new_name(trans, old_inode, old_dir, parent); 9853 btrfs_log_new_name(trans, old_inode, old_dir, parent);
9558 btrfs_end_log_trans(root); 9854 btrfs_end_log_trans(root);
9855 log_pinned = false;
9856 }
9857
9858 if (flags & RENAME_WHITEOUT) {
9859 ret = btrfs_whiteout_for_rename(trans, root, old_dir,
9860 old_dentry);
9861
9862 if (ret) {
9863 btrfs_abort_transaction(trans, root, ret);
9864 goto out_fail;
9865 }
9559 } 9866 }
9560out_fail: 9867out_fail:
9868 /*
9869 * If we have pinned the log and an error happened, we unpin tasks
9870 * trying to sync the log and force them to fallback to a transaction
9871 * commit if the log currently contains any of the inodes involved in
9872 * this rename operation (to ensure we do not persist a log with an
9873 * inconsistent state for any of these inodes or leading to any
9874 * inconsistencies when replayed). If the transaction was aborted, the
9875 * abortion reason is propagated to userspace when attempting to commit
9876 * the transaction. If the log does not contain any of these inodes, we
9877 * allow the tasks to sync it.
9878 */
9879 if (ret && log_pinned) {
9880 if (btrfs_inode_in_log(old_dir, root->fs_info->generation) ||
9881 btrfs_inode_in_log(new_dir, root->fs_info->generation) ||
9882 btrfs_inode_in_log(old_inode, root->fs_info->generation) ||
9883 (new_inode &&
9884 btrfs_inode_in_log(new_inode, root->fs_info->generation)))
9885 btrfs_set_log_full_commit(root->fs_info, trans);
9886
9887 btrfs_end_log_trans(root);
9888 log_pinned = false;
9889 }
9561 btrfs_end_transaction(trans, root); 9890 btrfs_end_transaction(trans, root);
9562out_notrans: 9891out_notrans:
9563 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) 9892 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9570,10 +9899,14 @@ static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
9570 struct inode *new_dir, struct dentry *new_dentry, 9899 struct inode *new_dir, struct dentry *new_dentry,
9571 unsigned int flags) 9900 unsigned int flags)
9572{ 9901{
9573 if (flags & ~RENAME_NOREPLACE) 9902 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
9574 return -EINVAL; 9903 return -EINVAL;
9575 9904
9576 return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry); 9905 if (flags & RENAME_EXCHANGE)
9906 return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
9907 new_dentry);
9908
9909 return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
9577} 9910}
9578 9911
9579static void btrfs_run_delalloc_work(struct btrfs_work *work) 9912static void btrfs_run_delalloc_work(struct btrfs_work *work)
@@ -9942,6 +10275,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9942 btrfs_end_transaction(trans, root); 10275 btrfs_end_transaction(trans, root);
9943 break; 10276 break;
9944 } 10277 }
10278 btrfs_dec_block_group_reservations(root->fs_info, ins.objectid);
9945 10279
9946 last_alloc = ins.offset; 10280 last_alloc = ins.offset;
9947 ret = insert_reserved_file_extent(trans, inode, 10281 ret = insert_reserved_file_extent(trans, inode,
@@ -10184,7 +10518,7 @@ static const struct file_operations btrfs_dir_file_operations = {
10184 .iterate = btrfs_real_readdir, 10518 .iterate = btrfs_real_readdir,
10185 .unlocked_ioctl = btrfs_ioctl, 10519 .unlocked_ioctl = btrfs_ioctl,
10186#ifdef CONFIG_COMPAT 10520#ifdef CONFIG_COMPAT
10187 .compat_ioctl = btrfs_ioctl, 10521 .compat_ioctl = btrfs_compat_ioctl,
10188#endif 10522#endif
10189 .release = btrfs_release_file, 10523 .release = btrfs_release_file,
10190 .fsync = btrfs_sync_file, 10524 .fsync = btrfs_sync_file,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 0b8ba717175b..4e700694b741 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -125,10 +125,10 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
125 if (flags & BTRFS_INODE_NODATACOW) 125 if (flags & BTRFS_INODE_NODATACOW)
126 iflags |= FS_NOCOW_FL; 126 iflags |= FS_NOCOW_FL;
127 127
128 if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS)) 128 if (flags & BTRFS_INODE_NOCOMPRESS)
129 iflags |= FS_COMPR_FL;
130 else if (flags & BTRFS_INODE_NOCOMPRESS)
131 iflags |= FS_NOCOMP_FL; 129 iflags |= FS_NOCOMP_FL;
130 else if (flags & BTRFS_INODE_COMPRESS)
131 iflags |= FS_COMPR_FL;
132 132
133 return iflags; 133 return iflags;
134} 134}
@@ -439,7 +439,7 @@ static noinline int create_subvol(struct inode *dir,
439{ 439{
440 struct btrfs_trans_handle *trans; 440 struct btrfs_trans_handle *trans;
441 struct btrfs_key key; 441 struct btrfs_key key;
442 struct btrfs_root_item root_item; 442 struct btrfs_root_item *root_item;
443 struct btrfs_inode_item *inode_item; 443 struct btrfs_inode_item *inode_item;
444 struct extent_buffer *leaf; 444 struct extent_buffer *leaf;
445 struct btrfs_root *root = BTRFS_I(dir)->root; 445 struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -455,16 +455,22 @@ static noinline int create_subvol(struct inode *dir,
455 u64 qgroup_reserved; 455 u64 qgroup_reserved;
456 uuid_le new_uuid; 456 uuid_le new_uuid;
457 457
458 root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
459 if (!root_item)
460 return -ENOMEM;
461
458 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); 462 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
459 if (ret) 463 if (ret)
460 return ret; 464 goto fail_free;
461 465
462 /* 466 /*
463 * Don't create subvolume whose level is not zero. Or qgroup will be 467 * Don't create subvolume whose level is not zero. Or qgroup will be
464 * screwed up since it assume subvolme qgroup's level to be 0. 468 * screwed up since it assume subvolme qgroup's level to be 0.
465 */ 469 */
466 if (btrfs_qgroup_level(objectid)) 470 if (btrfs_qgroup_level(objectid)) {
467 return -ENOSPC; 471 ret = -ENOSPC;
472 goto fail_free;
473 }
468 474
469 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); 475 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
470 /* 476 /*
@@ -474,14 +480,14 @@ static noinline int create_subvol(struct inode *dir,
474 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 480 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
475 8, &qgroup_reserved, false); 481 8, &qgroup_reserved, false);
476 if (ret) 482 if (ret)
477 return ret; 483 goto fail_free;
478 484
479 trans = btrfs_start_transaction(root, 0); 485 trans = btrfs_start_transaction(root, 0);
480 if (IS_ERR(trans)) { 486 if (IS_ERR(trans)) {
481 ret = PTR_ERR(trans); 487 ret = PTR_ERR(trans);
482 btrfs_subvolume_release_metadata(root, &block_rsv, 488 btrfs_subvolume_release_metadata(root, &block_rsv,
483 qgroup_reserved); 489 qgroup_reserved);
484 return ret; 490 goto fail_free;
485 } 491 }
486 trans->block_rsv = &block_rsv; 492 trans->block_rsv = &block_rsv;
487 trans->bytes_reserved = block_rsv.size; 493 trans->bytes_reserved = block_rsv.size;
@@ -509,47 +515,45 @@ static noinline int create_subvol(struct inode *dir,
509 BTRFS_UUID_SIZE); 515 BTRFS_UUID_SIZE);
510 btrfs_mark_buffer_dirty(leaf); 516 btrfs_mark_buffer_dirty(leaf);
511 517
512 memset(&root_item, 0, sizeof(root_item)); 518 inode_item = &root_item->inode;
513
514 inode_item = &root_item.inode;
515 btrfs_set_stack_inode_generation(inode_item, 1); 519 btrfs_set_stack_inode_generation(inode_item, 1);
516 btrfs_set_stack_inode_size(inode_item, 3); 520 btrfs_set_stack_inode_size(inode_item, 3);
517 btrfs_set_stack_inode_nlink(inode_item, 1); 521 btrfs_set_stack_inode_nlink(inode_item, 1);
518 btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); 522 btrfs_set_stack_inode_nbytes(inode_item, root->nodesize);
519 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); 523 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
520 524
521 btrfs_set_root_flags(&root_item, 0); 525 btrfs_set_root_flags(root_item, 0);
522 btrfs_set_root_limit(&root_item, 0); 526 btrfs_set_root_limit(root_item, 0);
523 btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT); 527 btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
524 528
525 btrfs_set_root_bytenr(&root_item, leaf->start); 529 btrfs_set_root_bytenr(root_item, leaf->start);
526 btrfs_set_root_generation(&root_item, trans->transid); 530 btrfs_set_root_generation(root_item, trans->transid);
527 btrfs_set_root_level(&root_item, 0); 531 btrfs_set_root_level(root_item, 0);
528 btrfs_set_root_refs(&root_item, 1); 532 btrfs_set_root_refs(root_item, 1);
529 btrfs_set_root_used(&root_item, leaf->len); 533 btrfs_set_root_used(root_item, leaf->len);
530 btrfs_set_root_last_snapshot(&root_item, 0); 534 btrfs_set_root_last_snapshot(root_item, 0);
531 535
532 btrfs_set_root_generation_v2(&root_item, 536 btrfs_set_root_generation_v2(root_item,
533 btrfs_root_generation(&root_item)); 537 btrfs_root_generation(root_item));
534 uuid_le_gen(&new_uuid); 538 uuid_le_gen(&new_uuid);
535 memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE); 539 memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
536 btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec); 540 btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
537 btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec); 541 btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
538 root_item.ctime = root_item.otime; 542 root_item->ctime = root_item->otime;
539 btrfs_set_root_ctransid(&root_item, trans->transid); 543 btrfs_set_root_ctransid(root_item, trans->transid);
540 btrfs_set_root_otransid(&root_item, trans->transid); 544 btrfs_set_root_otransid(root_item, trans->transid);
541 545
542 btrfs_tree_unlock(leaf); 546 btrfs_tree_unlock(leaf);
543 free_extent_buffer(leaf); 547 free_extent_buffer(leaf);
544 leaf = NULL; 548 leaf = NULL;
545 549
546 btrfs_set_root_dirid(&root_item, new_dirid); 550 btrfs_set_root_dirid(root_item, new_dirid);
547 551
548 key.objectid = objectid; 552 key.objectid = objectid;
549 key.offset = 0; 553 key.offset = 0;
550 key.type = BTRFS_ROOT_ITEM_KEY; 554 key.type = BTRFS_ROOT_ITEM_KEY;
551 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 555 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
552 &root_item); 556 root_item);
553 if (ret) 557 if (ret)
554 goto fail; 558 goto fail;
555 559
@@ -601,12 +605,13 @@ static noinline int create_subvol(struct inode *dir,
601 BUG_ON(ret); 605 BUG_ON(ret);
602 606
603 ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root, 607 ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
604 root_item.uuid, BTRFS_UUID_KEY_SUBVOL, 608 root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
605 objectid); 609 objectid);
606 if (ret) 610 if (ret)
607 btrfs_abort_transaction(trans, root, ret); 611 btrfs_abort_transaction(trans, root, ret);
608 612
609fail: 613fail:
614 kfree(root_item);
610 trans->block_rsv = NULL; 615 trans->block_rsv = NULL;
611 trans->bytes_reserved = 0; 616 trans->bytes_reserved = 0;
612 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); 617 btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
@@ -629,6 +634,10 @@ fail:
629 d_instantiate(dentry, inode); 634 d_instantiate(dentry, inode);
630 } 635 }
631 return ret; 636 return ret;
637
638fail_free:
639 kfree(root_item);
640 return ret;
632} 641}
633 642
634static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root) 643static void btrfs_wait_for_no_snapshoting_writes(struct btrfs_root *root)
@@ -681,7 +690,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
681 if (ret) 690 if (ret)
682 goto dec_and_free; 691 goto dec_and_free;
683 692
684 btrfs_wait_ordered_extents(root, -1); 693 btrfs_wait_ordered_extents(root, -1, 0, (u64)-1);
685 694
686 btrfs_init_block_rsv(&pending_snapshot->block_rsv, 695 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
687 BTRFS_BLOCK_RSV_TEMP); 696 BTRFS_BLOCK_RSV_TEMP);
@@ -2671,10 +2680,10 @@ out:
2671 return ret; 2680 return ret;
2672} 2681}
2673 2682
2674static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 2683static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
2675{ 2684{
2676 struct btrfs_root *root = BTRFS_I(file_inode(file))->root; 2685 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
2677 struct btrfs_ioctl_vol_args *vol_args; 2686 struct btrfs_ioctl_vol_args_v2 *vol_args;
2678 int ret; 2687 int ret;
2679 2688
2680 if (!capable(CAP_SYS_ADMIN)) 2689 if (!capable(CAP_SYS_ADMIN))
@@ -2690,7 +2699,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2690 goto err_drop; 2699 goto err_drop;
2691 } 2700 }
2692 2701
2693 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 2702 /* Check for compatibility reject unknown flags */
2703 if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED)
2704 return -EOPNOTSUPP;
2694 2705
2695 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 2706 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2696 1)) { 2707 1)) {
@@ -2699,13 +2710,23 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2699 } 2710 }
2700 2711
2701 mutex_lock(&root->fs_info->volume_mutex); 2712 mutex_lock(&root->fs_info->volume_mutex);
2702 ret = btrfs_rm_device(root, vol_args->name); 2713 if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
2714 ret = btrfs_rm_device(root, NULL, vol_args->devid);
2715 } else {
2716 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
2717 ret = btrfs_rm_device(root, vol_args->name, 0);
2718 }
2703 mutex_unlock(&root->fs_info->volume_mutex); 2719 mutex_unlock(&root->fs_info->volume_mutex);
2704 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); 2720 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2705 2721
2706 if (!ret) 2722 if (!ret) {
2707 btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); 2723 if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
2708 2724 btrfs_info(root->fs_info, "device deleted: id %llu",
2725 vol_args->devid);
2726 else
2727 btrfs_info(root->fs_info, "device deleted: %s",
2728 vol_args->name);
2729 }
2709out: 2730out:
2710 kfree(vol_args); 2731 kfree(vol_args);
2711err_drop: 2732err_drop:
@@ -2713,6 +2734,47 @@ err_drop:
2713 return ret; 2734 return ret;
2714} 2735}
2715 2736
2737static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2738{
2739 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
2740 struct btrfs_ioctl_vol_args *vol_args;
2741 int ret;
2742
2743 if (!capable(CAP_SYS_ADMIN))
2744 return -EPERM;
2745
2746 ret = mnt_want_write_file(file);
2747 if (ret)
2748 return ret;
2749
2750 if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
2751 1)) {
2752 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
2753 goto out_drop_write;
2754 }
2755
2756 vol_args = memdup_user(arg, sizeof(*vol_args));
2757 if (IS_ERR(vol_args)) {
2758 ret = PTR_ERR(vol_args);
2759 goto out;
2760 }
2761
2762 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
2763 mutex_lock(&root->fs_info->volume_mutex);
2764 ret = btrfs_rm_device(root, vol_args->name, 0);
2765 mutex_unlock(&root->fs_info->volume_mutex);
2766
2767 if (!ret)
2768 btrfs_info(root->fs_info, "disk deleted %s",vol_args->name);
2769 kfree(vol_args);
2770out:
2771 atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
2772out_drop_write:
2773 mnt_drop_write_file(file);
2774
2775 return ret;
2776}
2777
2716static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) 2778static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
2717{ 2779{
2718 struct btrfs_ioctl_fs_info_args *fi_args; 2780 struct btrfs_ioctl_fs_info_args *fi_args;
@@ -3472,13 +3534,16 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
3472 u64 last_dest_end = destoff; 3534 u64 last_dest_end = destoff;
3473 3535
3474 ret = -ENOMEM; 3536 ret = -ENOMEM;
3475 buf = vmalloc(root->nodesize); 3537 buf = kmalloc(root->nodesize, GFP_KERNEL | __GFP_NOWARN);
3476 if (!buf) 3538 if (!buf) {
3477 return ret; 3539 buf = vmalloc(root->nodesize);
3540 if (!buf)
3541 return ret;
3542 }
3478 3543
3479 path = btrfs_alloc_path(); 3544 path = btrfs_alloc_path();
3480 if (!path) { 3545 if (!path) {
3481 vfree(buf); 3546 kvfree(buf);
3482 return ret; 3547 return ret;
3483 } 3548 }
3484 3549
@@ -3779,7 +3844,7 @@ process_slot:
3779 3844
3780out: 3845out:
3781 btrfs_free_path(path); 3846 btrfs_free_path(path);
3782 vfree(buf); 3847 kvfree(buf);
3783 return ret; 3848 return ret;
3784} 3849}
3785 3850
@@ -4380,7 +4445,7 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
4380 1)) { 4445 1)) {
4381 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; 4446 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
4382 } else { 4447 } else {
4383 ret = btrfs_dev_replace_start(root, p); 4448 ret = btrfs_dev_replace_by_ioctl(root, p);
4384 atomic_set( 4449 atomic_set(
4385 &root->fs_info->mutually_exclusive_operation_running, 4450 &root->fs_info->mutually_exclusive_operation_running,
4386 0); 4451 0);
@@ -4851,8 +4916,8 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
4851 /* update qgroup status and info */ 4916 /* update qgroup status and info */
4852 err = btrfs_run_qgroups(trans, root->fs_info); 4917 err = btrfs_run_qgroups(trans, root->fs_info);
4853 if (err < 0) 4918 if (err < 0)
4854 btrfs_std_error(root->fs_info, ret, 4919 btrfs_handle_fs_error(root->fs_info, err,
4855 "failed to update qgroup status and info\n"); 4920 "failed to update qgroup status and info");
4856 err = btrfs_end_transaction(trans, root); 4921 err = btrfs_end_transaction(trans, root);
4857 if (err && !ret) 4922 if (err && !ret)
4858 ret = err; 4923 ret = err;
@@ -5398,9 +5463,15 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
5398 if (ret) 5463 if (ret)
5399 return ret; 5464 return ret;
5400 5465
5466 ret = mnt_want_write_file(file);
5467 if (ret)
5468 return ret;
5469
5401 trans = btrfs_start_transaction(root, 0); 5470 trans = btrfs_start_transaction(root, 0);
5402 if (IS_ERR(trans)) 5471 if (IS_ERR(trans)) {
5403 return PTR_ERR(trans); 5472 ret = PTR_ERR(trans);
5473 goto out_drop_write;
5474 }
5404 5475
5405 spin_lock(&root->fs_info->super_lock); 5476 spin_lock(&root->fs_info->super_lock);
5406 newflags = btrfs_super_compat_flags(super_block); 5477 newflags = btrfs_super_compat_flags(super_block);
@@ -5419,7 +5490,11 @@ static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
5419 btrfs_set_super_incompat_flags(super_block, newflags); 5490 btrfs_set_super_incompat_flags(super_block, newflags);
5420 spin_unlock(&root->fs_info->super_lock); 5491 spin_unlock(&root->fs_info->super_lock);
5421 5492
5422 return btrfs_commit_transaction(trans, root); 5493 ret = btrfs_commit_transaction(trans, root);
5494out_drop_write:
5495 mnt_drop_write_file(file);
5496
5497 return ret;
5423} 5498}
5424 5499
5425long btrfs_ioctl(struct file *file, unsigned int 5500long btrfs_ioctl(struct file *file, unsigned int
@@ -5463,6 +5538,8 @@ long btrfs_ioctl(struct file *file, unsigned int
5463 return btrfs_ioctl_add_dev(root, argp); 5538 return btrfs_ioctl_add_dev(root, argp);
5464 case BTRFS_IOC_RM_DEV: 5539 case BTRFS_IOC_RM_DEV:
5465 return btrfs_ioctl_rm_dev(file, argp); 5540 return btrfs_ioctl_rm_dev(file, argp);
5541 case BTRFS_IOC_RM_DEV_V2:
5542 return btrfs_ioctl_rm_dev_v2(file, argp);
5466 case BTRFS_IOC_FS_INFO: 5543 case BTRFS_IOC_FS_INFO:
5467 return btrfs_ioctl_fs_info(root, argp); 5544 return btrfs_ioctl_fs_info(root, argp);
5468 case BTRFS_IOC_DEV_INFO: 5545 case BTRFS_IOC_DEV_INFO:
@@ -5556,3 +5633,24 @@ long btrfs_ioctl(struct file *file, unsigned int
5556 5633
5557 return -ENOTTY; 5634 return -ENOTTY;
5558} 5635}
5636
5637#ifdef CONFIG_COMPAT
5638long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5639{
5640 switch (cmd) {
5641 case FS_IOC32_GETFLAGS:
5642 cmd = FS_IOC_GETFLAGS;
5643 break;
5644 case FS_IOC32_SETFLAGS:
5645 cmd = FS_IOC_SETFLAGS;
5646 break;
5647 case FS_IOC32_GETVERSION:
5648 cmd = FS_IOC_GETVERSION;
5649 break;
5650 default:
5651 return -ENOIOCTLCMD;
5652 }
5653
5654 return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
5655}
5656#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 0de7da5a610d..559170464d7c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -661,14 +661,15 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
661 * wait for all the ordered extents in a root. This is done when balancing 661 * wait for all the ordered extents in a root. This is done when balancing
662 * space between drives. 662 * space between drives.
663 */ 663 */
664int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr) 664int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
665 const u64 range_start, const u64 range_len)
665{ 666{
666 struct list_head splice, works; 667 LIST_HEAD(splice);
668 LIST_HEAD(skipped);
669 LIST_HEAD(works);
667 struct btrfs_ordered_extent *ordered, *next; 670 struct btrfs_ordered_extent *ordered, *next;
668 int count = 0; 671 int count = 0;
669 672 const u64 range_end = range_start + range_len;
670 INIT_LIST_HEAD(&splice);
671 INIT_LIST_HEAD(&works);
672 673
673 mutex_lock(&root->ordered_extent_mutex); 674 mutex_lock(&root->ordered_extent_mutex);
674 spin_lock(&root->ordered_extent_lock); 675 spin_lock(&root->ordered_extent_lock);
@@ -676,6 +677,14 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
676 while (!list_empty(&splice) && nr) { 677 while (!list_empty(&splice) && nr) {
677 ordered = list_first_entry(&splice, struct btrfs_ordered_extent, 678 ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
678 root_extent_list); 679 root_extent_list);
680
681 if (range_end <= ordered->start ||
682 ordered->start + ordered->disk_len <= range_start) {
683 list_move_tail(&ordered->root_extent_list, &skipped);
684 cond_resched_lock(&root->ordered_extent_lock);
685 continue;
686 }
687
679 list_move_tail(&ordered->root_extent_list, 688 list_move_tail(&ordered->root_extent_list,
680 &root->ordered_extents); 689 &root->ordered_extents);
681 atomic_inc(&ordered->refs); 690 atomic_inc(&ordered->refs);
@@ -694,6 +703,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
694 nr--; 703 nr--;
695 count++; 704 count++;
696 } 705 }
706 list_splice_tail(&skipped, &root->ordered_extents);
697 list_splice_tail(&splice, &root->ordered_extents); 707 list_splice_tail(&splice, &root->ordered_extents);
698 spin_unlock(&root->ordered_extent_lock); 708 spin_unlock(&root->ordered_extent_lock);
699 709
@@ -708,7 +718,8 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr)
708 return count; 718 return count;
709} 719}
710 720
711void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr) 721void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
722 const u64 range_start, const u64 range_len)
712{ 723{
713 struct btrfs_root *root; 724 struct btrfs_root *root;
714 struct list_head splice; 725 struct list_head splice;
@@ -728,7 +739,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr)
728 &fs_info->ordered_roots); 739 &fs_info->ordered_roots);
729 spin_unlock(&fs_info->ordered_root_lock); 740 spin_unlock(&fs_info->ordered_root_lock);
730 741
731 done = btrfs_wait_ordered_extents(root, nr); 742 done = btrfs_wait_ordered_extents(root, nr,
743 range_start, range_len);
732 btrfs_put_fs_root(root); 744 btrfs_put_fs_root(root);
733 745
734 spin_lock(&fs_info->ordered_root_lock); 746 spin_lock(&fs_info->ordered_root_lock);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 23c96059cef2..8ef12623d65c 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -197,8 +197,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
197 struct btrfs_ordered_extent *ordered); 197 struct btrfs_ordered_extent *ordered);
198int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, 198int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
199 u32 *sum, int len); 199 u32 *sum, int len);
200int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr); 200int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr,
201void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr); 201 const u64 range_start, const u64 range_len);
202void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, int nr,
203 const u64 range_start, const u64 range_len);
202void btrfs_get_logged_extents(struct inode *inode, 204void btrfs_get_logged_extents(struct inode *inode,
203 struct list_head *logged_list, 205 struct list_head *logged_list,
204 const loff_t start, 206 const loff_t start,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 08ef890deca6..1cfd35cfac76 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2418,7 +2418,7 @@ again:
2418 } 2418 }
2419out: 2419out:
2420 if (ret) { 2420 if (ret) {
2421 btrfs_std_error(root->fs_info, ret, NULL); 2421 btrfs_handle_fs_error(root->fs_info, ret, NULL);
2422 if (!list_empty(&reloc_roots)) 2422 if (!list_empty(&reloc_roots))
2423 free_reloc_roots(&reloc_roots); 2423 free_reloc_roots(&reloc_roots);
2424 2424
@@ -4254,12 +4254,11 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
4254 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu", 4254 btrfs_info(extent_root->fs_info, "relocating block group %llu flags %llu",
4255 rc->block_group->key.objectid, rc->block_group->flags); 4255 rc->block_group->key.objectid, rc->block_group->flags);
4256 4256
4257 ret = btrfs_start_delalloc_roots(fs_info, 0, -1); 4257 btrfs_wait_block_group_reservations(rc->block_group);
4258 if (ret < 0) { 4258 btrfs_wait_nocow_writers(rc->block_group);
4259 err = ret; 4259 btrfs_wait_ordered_roots(fs_info, -1,
4260 goto out; 4260 rc->block_group->key.objectid,
4261 } 4261 rc->block_group->key.offset);
4262 btrfs_wait_ordered_roots(fs_info, -1);
4263 4262
4264 while (1) { 4263 while (1) {
4265 mutex_lock(&fs_info->cleaner_mutex); 4264 mutex_lock(&fs_info->cleaner_mutex);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 9fcd6dfc3266..b2b14e7115f1 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -284,7 +284,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
284 trans = btrfs_join_transaction(tree_root); 284 trans = btrfs_join_transaction(tree_root);
285 if (IS_ERR(trans)) { 285 if (IS_ERR(trans)) {
286 err = PTR_ERR(trans); 286 err = PTR_ERR(trans);
287 btrfs_std_error(tree_root->fs_info, err, 287 btrfs_handle_fs_error(tree_root->fs_info, err,
288 "Failed to start trans to delete " 288 "Failed to start trans to delete "
289 "orphan item"); 289 "orphan item");
290 break; 290 break;
@@ -293,7 +293,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
293 root_key.objectid); 293 root_key.objectid);
294 btrfs_end_transaction(trans, tree_root); 294 btrfs_end_transaction(trans, tree_root);
295 if (err) { 295 if (err) {
296 btrfs_std_error(tree_root->fs_info, err, 296 btrfs_handle_fs_error(tree_root->fs_info, err,
297 "Failed to delete root orphan " 297 "Failed to delete root orphan "
298 "item"); 298 "item");
299 break; 299 break;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 4678f03e878e..fa35cdc46494 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1350,7 +1350,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1350 recover->bbio = bbio; 1350 recover->bbio = bbio;
1351 recover->map_length = mapped_length; 1351 recover->map_length = mapped_length;
1352 1352
1353 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); 1353 BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1354 1354
1355 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS); 1355 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1356 1356
@@ -2127,6 +2127,8 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
2127 if (bio->bi_error) 2127 if (bio->bi_error)
2128 sblock->no_io_error_seen = 0; 2128 sblock->no_io_error_seen = 0;
2129 2129
2130 bio_put(bio);
2131
2130 btrfs_queue_work(fs_info->scrub_workers, &sblock->work); 2132 btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2131} 2133}
2132 2134
@@ -2860,7 +2862,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2860 int extent_mirror_num; 2862 int extent_mirror_num;
2861 int stop_loop = 0; 2863 int stop_loop = 0;
2862 2864
2863 nsectors = map->stripe_len / root->sectorsize; 2865 nsectors = div_u64(map->stripe_len, root->sectorsize);
2864 bitmap_len = scrub_calc_parity_bitmap_len(nsectors); 2866 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2865 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, 2867 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2866 GFP_NOFS); 2868 GFP_NOFS);
@@ -3070,7 +3072,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3070 int slot; 3072 int slot;
3071 u64 nstripes; 3073 u64 nstripes;
3072 struct extent_buffer *l; 3074 struct extent_buffer *l;
3073 struct btrfs_key key;
3074 u64 physical; 3075 u64 physical;
3075 u64 logical; 3076 u64 logical;
3076 u64 logic_end; 3077 u64 logic_end;
@@ -3079,7 +3080,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3079 int mirror_num; 3080 int mirror_num;
3080 struct reada_control *reada1; 3081 struct reada_control *reada1;
3081 struct reada_control *reada2; 3082 struct reada_control *reada2;
3082 struct btrfs_key key_start; 3083 struct btrfs_key key;
3083 struct btrfs_key key_end; 3084 struct btrfs_key key_end;
3084 u64 increment = map->stripe_len; 3085 u64 increment = map->stripe_len;
3085 u64 offset; 3086 u64 offset;
@@ -3158,21 +3159,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3158 scrub_blocked_if_needed(fs_info); 3159 scrub_blocked_if_needed(fs_info);
3159 3160
3160 /* FIXME it might be better to start readahead at commit root */ 3161 /* FIXME it might be better to start readahead at commit root */
3161 key_start.objectid = logical; 3162 key.objectid = logical;
3162 key_start.type = BTRFS_EXTENT_ITEM_KEY; 3163 key.type = BTRFS_EXTENT_ITEM_KEY;
3163 key_start.offset = (u64)0; 3164 key.offset = (u64)0;
3164 key_end.objectid = logic_end; 3165 key_end.objectid = logic_end;
3165 key_end.type = BTRFS_METADATA_ITEM_KEY; 3166 key_end.type = BTRFS_METADATA_ITEM_KEY;
3166 key_end.offset = (u64)-1; 3167 key_end.offset = (u64)-1;
3167 reada1 = btrfs_reada_add(root, &key_start, &key_end); 3168 reada1 = btrfs_reada_add(root, &key, &key_end);
3168 3169
3169 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 3170 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3170 key_start.type = BTRFS_EXTENT_CSUM_KEY; 3171 key.type = BTRFS_EXTENT_CSUM_KEY;
3171 key_start.offset = logical; 3172 key.offset = logical;
3172 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 3173 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3173 key_end.type = BTRFS_EXTENT_CSUM_KEY; 3174 key_end.type = BTRFS_EXTENT_CSUM_KEY;
3174 key_end.offset = logic_end; 3175 key_end.offset = logic_end;
3175 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); 3176 reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3176 3177
3177 if (!IS_ERR(reada1)) 3178 if (!IS_ERR(reada1))
3178 btrfs_reada_wait(reada1); 3179 btrfs_reada_wait(reada1);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 8d358c547c59..6a8c86074aa4 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5939,6 +5939,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5939 u32 i; 5939 u32 i;
5940 u64 *clone_sources_tmp = NULL; 5940 u64 *clone_sources_tmp = NULL;
5941 int clone_sources_to_rollback = 0; 5941 int clone_sources_to_rollback = 0;
5942 unsigned alloc_size;
5942 int sort_clone_roots = 0; 5943 int sort_clone_roots = 0;
5943 int index; 5944 int index;
5944 5945
@@ -5978,6 +5979,12 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
5978 goto out; 5979 goto out;
5979 } 5980 }
5980 5981
5982 if (arg->clone_sources_count >
5983 ULLONG_MAX / sizeof(*arg->clone_sources)) {
5984 ret = -EINVAL;
5985 goto out;
5986 }
5987
5981 if (!access_ok(VERIFY_READ, arg->clone_sources, 5988 if (!access_ok(VERIFY_READ, arg->clone_sources,
5982 sizeof(*arg->clone_sources) * 5989 sizeof(*arg->clone_sources) *
5983 arg->clone_sources_count)) { 5990 arg->clone_sources_count)) {
@@ -6022,40 +6029,53 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
6022 sctx->clone_roots_cnt = arg->clone_sources_count; 6029 sctx->clone_roots_cnt = arg->clone_sources_count;
6023 6030
6024 sctx->send_max_size = BTRFS_SEND_BUF_SIZE; 6031 sctx->send_max_size = BTRFS_SEND_BUF_SIZE;
6025 sctx->send_buf = vmalloc(sctx->send_max_size); 6032 sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN);
6026 if (!sctx->send_buf) { 6033 if (!sctx->send_buf) {
6027 ret = -ENOMEM; 6034 sctx->send_buf = vmalloc(sctx->send_max_size);
6028 goto out; 6035 if (!sctx->send_buf) {
6036 ret = -ENOMEM;
6037 goto out;
6038 }
6029 } 6039 }
6030 6040
6031 sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); 6041 sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN);
6032 if (!sctx->read_buf) { 6042 if (!sctx->read_buf) {
6033 ret = -ENOMEM; 6043 sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE);
6034 goto out; 6044 if (!sctx->read_buf) {
6045 ret = -ENOMEM;
6046 goto out;
6047 }
6035 } 6048 }
6036 6049
6037 sctx->pending_dir_moves = RB_ROOT; 6050 sctx->pending_dir_moves = RB_ROOT;
6038 sctx->waiting_dir_moves = RB_ROOT; 6051 sctx->waiting_dir_moves = RB_ROOT;
6039 sctx->orphan_dirs = RB_ROOT; 6052 sctx->orphan_dirs = RB_ROOT;
6040 6053
6041 sctx->clone_roots = vzalloc(sizeof(struct clone_root) * 6054 alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1);
6042 (arg->clone_sources_count + 1)); 6055
6056 sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
6043 if (!sctx->clone_roots) { 6057 if (!sctx->clone_roots) {
6044 ret = -ENOMEM; 6058 sctx->clone_roots = vzalloc(alloc_size);
6045 goto out; 6059 if (!sctx->clone_roots) {
6060 ret = -ENOMEM;
6061 goto out;
6062 }
6046 } 6063 }
6047 6064
6065 alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources);
6066
6048 if (arg->clone_sources_count) { 6067 if (arg->clone_sources_count) {
6049 clone_sources_tmp = vmalloc(arg->clone_sources_count * 6068 clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN);
6050 sizeof(*arg->clone_sources));
6051 if (!clone_sources_tmp) { 6069 if (!clone_sources_tmp) {
6052 ret = -ENOMEM; 6070 clone_sources_tmp = vmalloc(alloc_size);
6053 goto out; 6071 if (!clone_sources_tmp) {
6072 ret = -ENOMEM;
6073 goto out;
6074 }
6054 } 6075 }
6055 6076
6056 ret = copy_from_user(clone_sources_tmp, arg->clone_sources, 6077 ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
6057 arg->clone_sources_count * 6078 alloc_size);
6058 sizeof(*arg->clone_sources));
6059 if (ret) { 6079 if (ret) {
6060 ret = -EFAULT; 6080 ret = -EFAULT;
6061 goto out; 6081 goto out;
@@ -6089,7 +6109,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_)
6089 sctx->clone_roots[i].root = clone_root; 6109 sctx->clone_roots[i].root = clone_root;
6090 clone_sources_to_rollback = i + 1; 6110 clone_sources_to_rollback = i + 1;
6091 } 6111 }
6092 vfree(clone_sources_tmp); 6112 kvfree(clone_sources_tmp);
6093 clone_sources_tmp = NULL; 6113 clone_sources_tmp = NULL;
6094 } 6114 }
6095 6115
@@ -6207,15 +6227,15 @@ out:
6207 btrfs_root_dec_send_in_progress(sctx->parent_root); 6227 btrfs_root_dec_send_in_progress(sctx->parent_root);
6208 6228
6209 kfree(arg); 6229 kfree(arg);
6210 vfree(clone_sources_tmp); 6230 kvfree(clone_sources_tmp);
6211 6231
6212 if (sctx) { 6232 if (sctx) {
6213 if (sctx->send_filp) 6233 if (sctx->send_filp)
6214 fput(sctx->send_filp); 6234 fput(sctx->send_filp);
6215 6235
6216 vfree(sctx->clone_roots); 6236 kvfree(sctx->clone_roots);
6217 vfree(sctx->send_buf); 6237 kvfree(sctx->send_buf);
6218 vfree(sctx->read_buf); 6238 kvfree(sctx->read_buf);
6219 6239
6220 name_cache_free(sctx); 6240 name_cache_free(sctx);
6221 6241
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 00b8f37cc306..bf71071ab6f6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -97,15 +97,6 @@ const char *btrfs_decode_error(int errno)
97 return errstr; 97 return errstr;
98} 98}
99 99
100static void save_error_info(struct btrfs_fs_info *fs_info)
101{
102 /*
103 * today we only save the error info into ram. Long term we'll
104 * also send it down to the disk
105 */
106 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
107}
108
109/* btrfs handle error by forcing the filesystem readonly */ 100/* btrfs handle error by forcing the filesystem readonly */
110static void btrfs_handle_error(struct btrfs_fs_info *fs_info) 101static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
111{ 102{
@@ -131,11 +122,11 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
131} 122}
132 123
133/* 124/*
134 * __btrfs_std_error decodes expected errors from the caller and 125 * __btrfs_handle_fs_error decodes expected errors from the caller and
135 * invokes the approciate error response. 126 * invokes the approciate error response.
136 */ 127 */
137__cold 128__cold
138void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, 129void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
139 unsigned int line, int errno, const char *fmt, ...) 130 unsigned int line, int errno, const char *fmt, ...)
140{ 131{
141 struct super_block *sb = fs_info->sb; 132 struct super_block *sb = fs_info->sb;
@@ -170,8 +161,13 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
170 } 161 }
171#endif 162#endif
172 163
164 /*
165 * Today we only save the error info to memory. Long term we'll
166 * also send it down to the disk
167 */
168 set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
169
173 /* Don't go through full error handling during mount */ 170 /* Don't go through full error handling during mount */
174 save_error_info(fs_info);
175 if (sb->s_flags & MS_BORN) 171 if (sb->s_flags & MS_BORN)
176 btrfs_handle_error(fs_info); 172 btrfs_handle_error(fs_info);
177} 173}
@@ -252,7 +248,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
252 /* Wake up anybody who may be waiting on this transaction */ 248 /* Wake up anybody who may be waiting on this transaction */
253 wake_up(&root->fs_info->transaction_wait); 249 wake_up(&root->fs_info->transaction_wait);
254 wake_up(&root->fs_info->transaction_blocked_wait); 250 wake_up(&root->fs_info->transaction_blocked_wait);
255 __btrfs_std_error(root->fs_info, function, line, errno, NULL); 251 __btrfs_handle_fs_error(root->fs_info, function, line, errno, NULL);
256} 252}
257/* 253/*
258 * __btrfs_panic decodes unexpected, fatal errors from the caller, 254 * __btrfs_panic decodes unexpected, fatal errors from the caller,
@@ -1160,7 +1156,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
1160 return 0; 1156 return 0;
1161 } 1157 }
1162 1158
1163 btrfs_wait_ordered_roots(fs_info, -1); 1159 btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
1164 1160
1165 trans = btrfs_attach_transaction_barrier(root); 1161 trans = btrfs_attach_transaction_barrier(root);
1166 if (IS_ERR(trans)) { 1162 if (IS_ERR(trans)) {
@@ -1488,10 +1484,10 @@ static int setup_security_options(struct btrfs_fs_info *fs_info,
1488 memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts)); 1484 memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts));
1489 } else { 1485 } else {
1490 /* 1486 /*
1491 * Since SELinux(the only one supports security_mnt_opts) does 1487 * Since SELinux (the only one supporting security_mnt_opts)
1492 * NOT support changing context during remount/mount same sb, 1488 * does NOT support changing context during remount/mount of
1493 * This must be the same or part of the same security options, 1489 * the same sb, this must be the same or part of the same
1494 * just free it. 1490 * security options, just free it.
1495 */ 1491 */
1496 security_free_mnt_opts(sec_opts); 1492 security_free_mnt_opts(sec_opts);
1497 } 1493 }
@@ -1669,8 +1665,8 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
1669 unsigned long old_opts) 1665 unsigned long old_opts)
1670{ 1666{
1671 /* 1667 /*
1672 * We need cleanup all defragable inodes if the autodefragment is 1668 * We need to cleanup all defragable inodes if the autodefragment is
1673 * close or the fs is R/O. 1669 * close or the filesystem is read only.
1674 */ 1670 */
1675 if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && 1671 if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
1676 (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || 1672 (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
@@ -2051,9 +2047,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
2051 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; 2047 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
2052 int ret; 2048 int ret;
2053 u64 thresh = 0; 2049 u64 thresh = 0;
2050 int mixed = 0;
2054 2051
2055 /* 2052 /*
2056 * holding chunk_muext to avoid allocating new chunks, holding 2053 * holding chunk_mutex to avoid allocating new chunks, holding
2057 * device_list_mutex to avoid the device being removed 2054 * device_list_mutex to avoid the device being removed
2058 */ 2055 */
2059 rcu_read_lock(); 2056 rcu_read_lock();
@@ -2076,8 +2073,17 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
2076 } 2073 }
2077 } 2074 }
2078 } 2075 }
2079 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 2076
2080 total_free_meta += found->disk_total - found->disk_used; 2077 /*
2078 * Metadata in mixed block goup profiles are accounted in data
2079 */
2080 if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) {
2081 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
2082 mixed = 1;
2083 else
2084 total_free_meta += found->disk_total -
2085 found->disk_used;
2086 }
2081 2087
2082 total_used += found->disk_used; 2088 total_used += found->disk_used;
2083 } 2089 }
@@ -2090,7 +2096,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
2090 2096
2091 /* Account global block reserve as used, it's in logical size already */ 2097 /* Account global block reserve as used, it's in logical size already */
2092 spin_lock(&block_rsv->lock); 2098 spin_lock(&block_rsv->lock);
2093 buf->f_bfree -= block_rsv->size >> bits; 2099 /* Mixed block groups accounting is not byte-accurate, avoid overflow */
2100 if (buf->f_bfree >= block_rsv->size >> bits)
2101 buf->f_bfree -= block_rsv->size >> bits;
2102 else
2103 buf->f_bfree = 0;
2094 spin_unlock(&block_rsv->lock); 2104 spin_unlock(&block_rsv->lock);
2095 2105
2096 buf->f_bavail = div_u64(total_free_data, factor); 2106 buf->f_bavail = div_u64(total_free_data, factor);
@@ -2115,7 +2125,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
2115 */ 2125 */
2116 thresh = 4 * 1024 * 1024; 2126 thresh = 4 * 1024 * 1024;
2117 2127
2118 if (total_free_meta - thresh < block_rsv->size) 2128 if (!mixed && total_free_meta - thresh < block_rsv->size)
2119 buf->f_bavail = 0; 2129 buf->f_bavail = 0;
2120 2130
2121 buf->f_type = BTRFS_SUPER_MAGIC; 2131 buf->f_type = BTRFS_SUPER_MAGIC;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 539e7b5e3f86..4879656bda3c 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -120,6 +120,9 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
120 if (!fs_info) 120 if (!fs_info)
121 return -EPERM; 121 return -EPERM;
122 122
123 if (fs_info->sb->s_flags & MS_RDONLY)
124 return -EROFS;
125
123 ret = kstrtoul(skip_spaces(buf), 0, &val); 126 ret = kstrtoul(skip_spaces(buf), 0, &val);
124 if (ret) 127 if (ret)
125 return ret; 128 return ret;
@@ -364,7 +367,13 @@ static ssize_t btrfs_label_show(struct kobject *kobj,
364{ 367{
365 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 368 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
366 char *label = fs_info->super_copy->label; 369 char *label = fs_info->super_copy->label;
367 return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); 370 ssize_t ret;
371
372 spin_lock(&fs_info->super_lock);
373 ret = snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label);
374 spin_unlock(&fs_info->super_lock);
375
376 return ret;
368} 377}
369 378
370static ssize_t btrfs_label_store(struct kobject *kobj, 379static ssize_t btrfs_label_store(struct kobject *kobj,
@@ -374,6 +383,9 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
374 struct btrfs_fs_info *fs_info = to_fs_info(kobj); 383 struct btrfs_fs_info *fs_info = to_fs_info(kobj);
375 size_t p_len; 384 size_t p_len;
376 385
386 if (!fs_info)
387 return -EPERM;
388
377 if (fs_info->sb->s_flags & MS_RDONLY) 389 if (fs_info->sb->s_flags & MS_RDONLY)
378 return -EROFS; 390 return -EROFS;
379 391
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 43885e51b882..5b0b758a3f79 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -311,10 +311,11 @@ loop:
311 * when the transaction commits 311 * when the transaction commits
312 */ 312 */
313static int record_root_in_trans(struct btrfs_trans_handle *trans, 313static int record_root_in_trans(struct btrfs_trans_handle *trans,
314 struct btrfs_root *root) 314 struct btrfs_root *root,
315 int force)
315{ 316{
316 if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) && 317 if ((test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
317 root->last_trans < trans->transid) { 318 root->last_trans < trans->transid) || force) {
318 WARN_ON(root == root->fs_info->extent_root); 319 WARN_ON(root == root->fs_info->extent_root);
319 WARN_ON(root->commit_root != root->node); 320 WARN_ON(root->commit_root != root->node);
320 321
@@ -331,7 +332,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
331 smp_wmb(); 332 smp_wmb();
332 333
333 spin_lock(&root->fs_info->fs_roots_radix_lock); 334 spin_lock(&root->fs_info->fs_roots_radix_lock);
334 if (root->last_trans == trans->transid) { 335 if (root->last_trans == trans->transid && !force) {
335 spin_unlock(&root->fs_info->fs_roots_radix_lock); 336 spin_unlock(&root->fs_info->fs_roots_radix_lock);
336 return 0; 337 return 0;
337 } 338 }
@@ -402,7 +403,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
402 return 0; 403 return 0;
403 404
404 mutex_lock(&root->fs_info->reloc_mutex); 405 mutex_lock(&root->fs_info->reloc_mutex);
405 record_root_in_trans(trans, root); 406 record_root_in_trans(trans, root, 0);
406 mutex_unlock(&root->fs_info->reloc_mutex); 407 mutex_unlock(&root->fs_info->reloc_mutex);
407 408
408 return 0; 409 return 0;
@@ -1310,6 +1311,97 @@ int btrfs_defrag_root(struct btrfs_root *root)
1310 return ret; 1311 return ret;
1311} 1312}
1312 1313
1314/* Bisesctability fixup, remove in 4.8 */
1315#ifndef btrfs_std_error
1316#define btrfs_std_error btrfs_handle_fs_error
1317#endif
1318
1319/*
1320 * Do all special snapshot related qgroup dirty hack.
1321 *
1322 * Will do all needed qgroup inherit and dirty hack like switch commit
1323 * roots inside one transaction and write all btree into disk, to make
1324 * qgroup works.
1325 */
1326static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *src,
1328 struct btrfs_root *parent,
1329 struct btrfs_qgroup_inherit *inherit,
1330 u64 dst_objectid)
1331{
1332 struct btrfs_fs_info *fs_info = src->fs_info;
1333 int ret;
1334
1335 /*
1336 * Save some performance in the case that qgroups are not
1337 * enabled. If this check races with the ioctl, rescan will
1338 * kick in anyway.
1339 */
1340 mutex_lock(&fs_info->qgroup_ioctl_lock);
1341 if (!fs_info->quota_enabled) {
1342 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1343 return 0;
1344 }
1345 mutex_unlock(&fs_info->qgroup_ioctl_lock);
1346
1347 /*
1348 * We are going to commit transaction, see btrfs_commit_transaction()
1349 * comment for reason locking tree_log_mutex
1350 */
1351 mutex_lock(&fs_info->tree_log_mutex);
1352
1353 ret = commit_fs_roots(trans, src);
1354 if (ret)
1355 goto out;
1356 ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
1357 if (ret < 0)
1358 goto out;
1359 ret = btrfs_qgroup_account_extents(trans, fs_info);
1360 if (ret < 0)
1361 goto out;
1362
1363 /* Now qgroup are all updated, we can inherit it to new qgroups */
1364 ret = btrfs_qgroup_inherit(trans, fs_info,
1365 src->root_key.objectid, dst_objectid,
1366 inherit);
1367 if (ret < 0)
1368 goto out;
1369
1370 /*
1371 * Now we do a simplified commit transaction, which will:
1372 * 1) commit all subvolume and extent tree
1373 * To ensure all subvolume and extent tree have a valid
1374 * commit_root to accounting later insert_dir_item()
1375 * 2) write all btree blocks onto disk
1376 * This is to make sure later btree modification will be cowed
1377 * Or commit_root can be populated and cause wrong qgroup numbers
1378 * In this simplified commit, we don't really care about other trees
1379 * like chunk and root tree, as they won't affect qgroup.
1380 * And we don't write super to avoid half committed status.
1381 */
1382 ret = commit_cowonly_roots(trans, src);
1383 if (ret)
1384 goto out;
1385 switch_commit_roots(trans->transaction, fs_info);
1386 ret = btrfs_write_and_wait_transaction(trans, src);
1387 if (ret)
1388 btrfs_std_error(fs_info, ret,
1389 "Error while writing out transaction for qgroup");
1390
1391out:
1392 mutex_unlock(&fs_info->tree_log_mutex);
1393
1394 /*
1395 * Force parent root to be updated, as we recorded it before so its
1396 * last_trans == cur_transid.
1397 * Or it won't be committed again onto disk after later
1398 * insert_dir_item()
1399 */
1400 if (!ret)
1401 record_root_in_trans(trans, parent, 1);
1402 return ret;
1403}
1404
1313/* 1405/*
1314 * new snapshots need to be created at a very specific time in the 1406 * new snapshots need to be created at a very specific time in the
1315 * transaction commit. This does the actual creation. 1407 * transaction commit. This does the actual creation.
@@ -1383,7 +1475,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1383 dentry = pending->dentry; 1475 dentry = pending->dentry;
1384 parent_inode = pending->dir; 1476 parent_inode = pending->dir;
1385 parent_root = BTRFS_I(parent_inode)->root; 1477 parent_root = BTRFS_I(parent_inode)->root;
1386 record_root_in_trans(trans, parent_root); 1478 record_root_in_trans(trans, parent_root, 0);
1387 1479
1388 cur_time = current_fs_time(parent_inode->i_sb); 1480 cur_time = current_fs_time(parent_inode->i_sb);
1389 1481
@@ -1420,7 +1512,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1420 goto fail; 1512 goto fail;
1421 } 1513 }
1422 1514
1423 record_root_in_trans(trans, root); 1515 record_root_in_trans(trans, root, 0);
1424 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 1516 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
1425 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 1517 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
1426 btrfs_check_and_init_root_item(new_root_item); 1518 btrfs_check_and_init_root_item(new_root_item);
@@ -1516,6 +1608,17 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1516 goto fail; 1608 goto fail;
1517 } 1609 }
1518 1610
1611 /*
1612 * Do special qgroup accounting for snapshot, as we do some qgroup
1613 * snapshot hack to do fast snapshot.
1614 * To co-operate with that hack, we do hack again.
1615 * Or snapshot will be greatly slowed down by a subtree qgroup rescan
1616 */
1617 ret = qgroup_account_snapshot(trans, root, parent_root,
1618 pending->inherit, objectid);
1619 if (ret < 0)
1620 goto fail;
1621
1519 ret = btrfs_insert_dir_item(trans, parent_root, 1622 ret = btrfs_insert_dir_item(trans, parent_root,
1520 dentry->d_name.name, dentry->d_name.len, 1623 dentry->d_name.name, dentry->d_name.len,
1521 parent_inode, &key, 1624 parent_inode, &key,
@@ -1559,23 +1662,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1559 goto fail; 1662 goto fail;
1560 } 1663 }
1561 1664
1562 /*
1563 * account qgroup counters before qgroup_inherit()
1564 */
1565 ret = btrfs_qgroup_prepare_account_extents(trans, fs_info);
1566 if (ret)
1567 goto fail;
1568 ret = btrfs_qgroup_account_extents(trans, fs_info);
1569 if (ret)
1570 goto fail;
1571 ret = btrfs_qgroup_inherit(trans, fs_info,
1572 root->root_key.objectid,
1573 objectid, pending->inherit);
1574 if (ret) {
1575 btrfs_abort_transaction(trans, root, ret);
1576 goto fail;
1577 }
1578
1579fail: 1665fail:
1580 pending->error = ret; 1666 pending->error = ret;
1581dir_item_existed: 1667dir_item_existed:
@@ -1821,7 +1907,7 @@ static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
1821static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info) 1907static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
1822{ 1908{
1823 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT)) 1909 if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
1824 btrfs_wait_ordered_roots(fs_info, -1); 1910 btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
1825} 1911}
1826 1912
1827static inline void 1913static inline void
@@ -2145,7 +2231,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
2145 2231
2146 ret = btrfs_write_and_wait_transaction(trans, root); 2232 ret = btrfs_write_and_wait_transaction(trans, root);
2147 if (ret) { 2233 if (ret) {
2148 btrfs_std_error(root->fs_info, ret, 2234 btrfs_handle_fs_error(root->fs_info, ret,
2149 "Error while writing out transaction"); 2235 "Error while writing out transaction");
2150 mutex_unlock(&root->fs_info->tree_log_mutex); 2236 mutex_unlock(&root->fs_info->tree_log_mutex);
2151 goto scrub_continue; 2237 goto scrub_continue;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index e692eea87af6..8aaca5c6af94 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4141,6 +4141,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4141 4141
4142 INIT_LIST_HEAD(&extents); 4142 INIT_LIST_HEAD(&extents);
4143 4143
4144 down_write(&BTRFS_I(inode)->dio_sem);
4144 write_lock(&tree->lock); 4145 write_lock(&tree->lock);
4145 test_gen = root->fs_info->last_trans_committed; 4146 test_gen = root->fs_info->last_trans_committed;
4146 4147
@@ -4169,13 +4170,20 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4169 } 4170 }
4170 4171
4171 list_sort(NULL, &extents, extent_cmp); 4172 list_sort(NULL, &extents, extent_cmp);
4173 btrfs_get_logged_extents(inode, logged_list, start, end);
4172 /* 4174 /*
4173 * Collect any new ordered extents within the range. This is to 4175 * Some ordered extents started by fsync might have completed
4174 * prevent logging file extent items without waiting for the disk 4176 * before we could collect them into the list logged_list, which
4175 * location they point to being written. We do this only to deal 4177 * means they're gone, not in our logged_list nor in the inode's
4176 * with races against concurrent lockless direct IO writes. 4178 * ordered tree. We want the application/user space to know an
4179 * error happened while attempting to persist file data so that
4180 * it can take proper action. If such error happened, we leave
4181 * without writing to the log tree and the fsync must report the
4182 * file data write error and not commit the current transaction.
4177 */ 4183 */
4178 btrfs_get_logged_extents(inode, logged_list, start, end); 4184 ret = btrfs_inode_check_errors(inode);
4185 if (ret)
4186 ctx->io_err = ret;
4179process: 4187process:
4180 while (!list_empty(&extents)) { 4188 while (!list_empty(&extents)) {
4181 em = list_entry(extents.next, struct extent_map, list); 4189 em = list_entry(extents.next, struct extent_map, list);
@@ -4202,6 +4210,7 @@ process:
4202 } 4210 }
4203 WARN_ON(!list_empty(&extents)); 4211 WARN_ON(!list_empty(&extents));
4204 write_unlock(&tree->lock); 4212 write_unlock(&tree->lock);
4213 up_write(&BTRFS_I(inode)->dio_sem);
4205 4214
4206 btrfs_release_path(path); 4215 btrfs_release_path(path);
4207 return ret; 4216 return ret;
@@ -4623,23 +4632,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
4623 mutex_lock(&BTRFS_I(inode)->log_mutex); 4632 mutex_lock(&BTRFS_I(inode)->log_mutex);
4624 4633
4625 /* 4634 /*
4626 * Collect ordered extents only if we are logging data. This is to
4627 * ensure a subsequent request to log this inode in LOG_INODE_ALL mode
4628 * will process the ordered extents if they still exists at the time,
4629 * because when we collect them we test and set for the flag
4630 * BTRFS_ORDERED_LOGGED to prevent multiple log requests to process the
4631 * same ordered extents. The consequence for the LOG_INODE_ALL log mode
4632 * not processing the ordered extents is that we end up logging the
4633 * corresponding file extent items, based on the extent maps in the
4634 * inode's extent_map_tree's modified_list, without logging the
4635 * respective checksums (since the may still be only attached to the
4636 * ordered extents and have not been inserted in the csum tree by
4637 * btrfs_finish_ordered_io() yet).
4638 */
4639 if (inode_only == LOG_INODE_ALL)
4640 btrfs_get_logged_extents(inode, &logged_list, start, end);
4641
4642 /*
4643 * a brute force approach to making sure we get the most uptodate 4635 * a brute force approach to making sure we get the most uptodate
4644 * copies of everything. 4636 * copies of everything.
4645 */ 4637 */
@@ -4846,21 +4838,6 @@ log_extents:
4846 goto out_unlock; 4838 goto out_unlock;
4847 } 4839 }
4848 if (fast_search) { 4840 if (fast_search) {
4849 /*
4850 * Some ordered extents started by fsync might have completed
4851 * before we collected the ordered extents in logged_list, which
4852 * means they're gone, not in our logged_list nor in the inode's
4853 * ordered tree. We want the application/user space to know an
4854 * error happened while attempting to persist file data so that
4855 * it can take proper action. If such error happened, we leave
4856 * without writing to the log tree and the fsync must report the
4857 * file data write error and not commit the current transaction.
4858 */
4859 err = btrfs_inode_check_errors(inode);
4860 if (err) {
4861 ctx->io_err = err;
4862 goto out_unlock;
4863 }
4864 ret = btrfs_log_changed_extents(trans, root, inode, dst_path, 4841 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
4865 &logged_list, ctx, start, end); 4842 &logged_list, ctx, start, end);
4866 if (ret) { 4843 if (ret) {
@@ -5158,7 +5135,7 @@ process_leaf:
5158 } 5135 }
5159 5136
5160 ctx->log_new_dentries = false; 5137 ctx->log_new_dentries = false;
5161 if (type == BTRFS_FT_DIR) 5138 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
5162 log_mode = LOG_INODE_ALL; 5139 log_mode = LOG_INODE_ALL;
5163 btrfs_release_path(path); 5140 btrfs_release_path(path);
5164 ret = btrfs_log_inode(trans, root, di_inode, 5141 ret = btrfs_log_inode(trans, root, di_inode,
@@ -5278,11 +5255,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5278 if (IS_ERR(dir_inode)) 5255 if (IS_ERR(dir_inode))
5279 continue; 5256 continue;
5280 5257
5258 if (ctx)
5259 ctx->log_new_dentries = false;
5281 ret = btrfs_log_inode(trans, root, dir_inode, 5260 ret = btrfs_log_inode(trans, root, dir_inode,
5282 LOG_INODE_ALL, 0, LLONG_MAX, ctx); 5261 LOG_INODE_ALL, 0, LLONG_MAX, ctx);
5283 if (!ret && 5262 if (!ret &&
5284 btrfs_must_commit_transaction(trans, dir_inode)) 5263 btrfs_must_commit_transaction(trans, dir_inode))
5285 ret = 1; 5264 ret = 1;
5265 if (!ret && ctx && ctx->log_new_dentries)
5266 ret = log_new_dir_dentries(trans, root,
5267 dir_inode, ctx);
5286 iput(dir_inode); 5268 iput(dir_inode);
5287 if (ret) 5269 if (ret)
5288 goto out; 5270 goto out;
@@ -5519,7 +5501,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
5519 5501
5520 ret = walk_log_tree(trans, log_root_tree, &wc); 5502 ret = walk_log_tree(trans, log_root_tree, &wc);
5521 if (ret) { 5503 if (ret) {
5522 btrfs_std_error(fs_info, ret, "Failed to pin buffers while " 5504 btrfs_handle_fs_error(fs_info, ret, "Failed to pin buffers while "
5523 "recovering log root tree."); 5505 "recovering log root tree.");
5524 goto error; 5506 goto error;
5525 } 5507 }
@@ -5533,7 +5515,7 @@ again:
5533 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); 5515 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
5534 5516
5535 if (ret < 0) { 5517 if (ret < 0) {
5536 btrfs_std_error(fs_info, ret, 5518 btrfs_handle_fs_error(fs_info, ret,
5537 "Couldn't find tree log root."); 5519 "Couldn't find tree log root.");
5538 goto error; 5520 goto error;
5539 } 5521 }
@@ -5551,7 +5533,7 @@ again:
5551 log = btrfs_read_fs_root(log_root_tree, &found_key); 5533 log = btrfs_read_fs_root(log_root_tree, &found_key);
5552 if (IS_ERR(log)) { 5534 if (IS_ERR(log)) {
5553 ret = PTR_ERR(log); 5535 ret = PTR_ERR(log);
5554 btrfs_std_error(fs_info, ret, 5536 btrfs_handle_fs_error(fs_info, ret,
5555 "Couldn't read tree log root."); 5537 "Couldn't read tree log root.");
5556 goto error; 5538 goto error;
5557 } 5539 }
@@ -5566,7 +5548,7 @@ again:
5566 free_extent_buffer(log->node); 5548 free_extent_buffer(log->node);
5567 free_extent_buffer(log->commit_root); 5549 free_extent_buffer(log->commit_root);
5568 kfree(log); 5550 kfree(log);
5569 btrfs_std_error(fs_info, ret, "Couldn't read target root " 5551 btrfs_handle_fs_error(fs_info, ret, "Couldn't read target root "
5570 "for tree log recovery."); 5552 "for tree log recovery.");
5571 goto error; 5553 goto error;
5572 } 5554 }
@@ -5652,11 +5634,9 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
5652 * into the file. When the file is logged we check it and 5634 * into the file. When the file is logged we check it and
5653 * don't log the parents if the file is fully on disk. 5635 * don't log the parents if the file is fully on disk.
5654 */ 5636 */
5655 if (S_ISREG(inode->i_mode)) { 5637 mutex_lock(&BTRFS_I(inode)->log_mutex);
5656 mutex_lock(&BTRFS_I(inode)->log_mutex); 5638 BTRFS_I(inode)->last_unlink_trans = trans->transid;
5657 BTRFS_I(inode)->last_unlink_trans = trans->transid; 5639 mutex_unlock(&BTRFS_I(inode)->log_mutex);
5658 mutex_unlock(&BTRFS_I(inode)->log_mutex);
5659 }
5660 5640
5661 /* 5641 /*
5662 * if this directory was already logged any new 5642 * if this directory was already logged any new
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bfb80da3e6eb..2b88127bba5b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -118,6 +118,21 @@ const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
118 [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6, 118 [BTRFS_RAID_RAID6] = BTRFS_BLOCK_GROUP_RAID6,
119}; 119};
120 120
121/*
122 * Table to convert BTRFS_RAID_* to the error code if minimum number of devices
123 * condition is not met. Zero means there's no corresponding
124 * BTRFS_ERROR_DEV_*_NOT_MET value.
125 */
126const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
127 [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
128 [BTRFS_RAID_RAID1] = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
129 [BTRFS_RAID_DUP] = 0,
130 [BTRFS_RAID_RAID0] = 0,
131 [BTRFS_RAID_SINGLE] = 0,
132 [BTRFS_RAID_RAID5] = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
133 [BTRFS_RAID_RAID6] = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
134};
135
121static int init_first_rw_device(struct btrfs_trans_handle *trans, 136static int init_first_rw_device(struct btrfs_trans_handle *trans,
122 struct btrfs_root *root, 137 struct btrfs_root *root,
123 struct btrfs_device *device); 138 struct btrfs_device *device);
@@ -699,7 +714,8 @@ static noinline int device_list_add(const char *path,
699 * if there is new btrfs on an already registered device, 714 * if there is new btrfs on an already registered device,
700 * then remove the stale device entry. 715 * then remove the stale device entry.
701 */ 716 */
702 btrfs_free_stale_device(device); 717 if (ret > 0)
718 btrfs_free_stale_device(device);
703 719
704 *fs_devices_ret = fs_devices; 720 *fs_devices_ret = fs_devices;
705 721
@@ -988,6 +1004,56 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
988 return ret; 1004 return ret;
989} 1005}
990 1006
1007void btrfs_release_disk_super(struct page *page)
1008{
1009 kunmap(page);
1010 put_page(page);
1011}
1012
1013int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
1014 struct page **page, struct btrfs_super_block **disk_super)
1015{
1016 void *p;
1017 pgoff_t index;
1018
1019 /* make sure our super fits in the device */
1020 if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1021 return 1;
1022
1023 /* make sure our super fits in the page */
1024 if (sizeof(**disk_super) > PAGE_SIZE)
1025 return 1;
1026
1027 /* make sure our super doesn't straddle pages on disk */
1028 index = bytenr >> PAGE_SHIFT;
1029 if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
1030 return 1;
1031
1032 /* pull in the page with our super */
1033 *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
1034 index, GFP_KERNEL);
1035
1036 if (IS_ERR_OR_NULL(*page))
1037 return 1;
1038
1039 p = kmap(*page);
1040
1041 /* align our pointer to the offset of the super block */
1042 *disk_super = p + (bytenr & ~PAGE_MASK);
1043
1044 if (btrfs_super_bytenr(*disk_super) != bytenr ||
1045 btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
1046 btrfs_release_disk_super(*page);
1047 return 1;
1048 }
1049
1050 if ((*disk_super)->label[0] &&
1051 (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
1052 (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
1053
1054 return 0;
1055}
1056
991/* 1057/*
992 * Look for a btrfs signature on a device. This may be called out of the mount path 1058 * Look for a btrfs signature on a device. This may be called out of the mount path
993 * and we are not allowed to call set_blocksize during the scan. The superblock 1059 * and we are not allowed to call set_blocksize during the scan. The superblock
@@ -999,13 +1065,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
999 struct btrfs_super_block *disk_super; 1065 struct btrfs_super_block *disk_super;
1000 struct block_device *bdev; 1066 struct block_device *bdev;
1001 struct page *page; 1067 struct page *page;
1002 void *p;
1003 int ret = -EINVAL; 1068 int ret = -EINVAL;
1004 u64 devid; 1069 u64 devid;
1005 u64 transid; 1070 u64 transid;
1006 u64 total_devices; 1071 u64 total_devices;
1007 u64 bytenr; 1072 u64 bytenr;
1008 pgoff_t index;
1009 1073
1010 /* 1074 /*
1011 * we would like to check all the supers, but that would make 1075 * we would like to check all the supers, but that would make
@@ -1018,41 +1082,14 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
1018 mutex_lock(&uuid_mutex); 1082 mutex_lock(&uuid_mutex);
1019 1083
1020 bdev = blkdev_get_by_path(path, flags, holder); 1084 bdev = blkdev_get_by_path(path, flags, holder);
1021
1022 if (IS_ERR(bdev)) { 1085 if (IS_ERR(bdev)) {
1023 ret = PTR_ERR(bdev); 1086 ret = PTR_ERR(bdev);
1024 goto error; 1087 goto error;
1025 } 1088 }
1026 1089
1027 /* make sure our super fits in the device */ 1090 if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
1028 if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1029 goto error_bdev_put;
1030
1031 /* make sure our super fits in the page */
1032 if (sizeof(*disk_super) > PAGE_SIZE)
1033 goto error_bdev_put;
1034
1035 /* make sure our super doesn't straddle pages on disk */
1036 index = bytenr >> PAGE_SHIFT;
1037 if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1038 goto error_bdev_put;
1039
1040 /* pull in the page with our super */
1041 page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
1042 index, GFP_NOFS);
1043
1044 if (IS_ERR_OR_NULL(page))
1045 goto error_bdev_put; 1091 goto error_bdev_put;
1046 1092
1047 p = kmap(page);
1048
1049 /* align our pointer to the offset of the super block */
1050 disk_super = p + (bytenr & ~PAGE_MASK);
1051
1052 if (btrfs_super_bytenr(disk_super) != bytenr ||
1053 btrfs_super_magic(disk_super) != BTRFS_MAGIC)
1054 goto error_unmap;
1055
1056 devid = btrfs_stack_device_id(&disk_super->dev_item); 1093 devid = btrfs_stack_device_id(&disk_super->dev_item);
1057 transid = btrfs_super_generation(disk_super); 1094 transid = btrfs_super_generation(disk_super);
1058 total_devices = btrfs_super_num_devices(disk_super); 1095 total_devices = btrfs_super_num_devices(disk_super);
@@ -1060,8 +1097,6 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
1060 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 1097 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
1061 if (ret > 0) { 1098 if (ret > 0) {
1062 if (disk_super->label[0]) { 1099 if (disk_super->label[0]) {
1063 if (disk_super->label[BTRFS_LABEL_SIZE - 1])
1064 disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
1065 printk(KERN_INFO "BTRFS: device label %s ", disk_super->label); 1100 printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
1066 } else { 1101 } else {
1067 printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid); 1102 printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
@@ -1073,9 +1108,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
1073 if (!ret && fs_devices_ret) 1108 if (!ret && fs_devices_ret)
1074 (*fs_devices_ret)->total_devices = total_devices; 1109 (*fs_devices_ret)->total_devices = total_devices;
1075 1110
1076error_unmap: 1111 btrfs_release_disk_super(page);
1077 kunmap(page);
1078 put_page(page);
1079 1112
1080error_bdev_put: 1113error_bdev_put:
1081 blkdev_put(bdev, flags); 1114 blkdev_put(bdev, flags);
@@ -1454,7 +1487,7 @@ again:
1454 extent = btrfs_item_ptr(leaf, path->slots[0], 1487 extent = btrfs_item_ptr(leaf, path->slots[0],
1455 struct btrfs_dev_extent); 1488 struct btrfs_dev_extent);
1456 } else { 1489 } else {
1457 btrfs_std_error(root->fs_info, ret, "Slot search failed"); 1490 btrfs_handle_fs_error(root->fs_info, ret, "Slot search failed");
1458 goto out; 1491 goto out;
1459 } 1492 }
1460 1493
@@ -1462,7 +1495,7 @@ again:
1462 1495
1463 ret = btrfs_del_item(trans, root, path); 1496 ret = btrfs_del_item(trans, root, path);
1464 if (ret) { 1497 if (ret) {
1465 btrfs_std_error(root->fs_info, ret, 1498 btrfs_handle_fs_error(root->fs_info, ret,
1466 "Failed to remove dev extent item"); 1499 "Failed to remove dev extent item");
1467 } else { 1500 } else {
1468 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags); 1501 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
@@ -1688,32 +1721,92 @@ out:
1688 return ret; 1721 return ret;
1689} 1722}
1690 1723
1691int btrfs_rm_device(struct btrfs_root *root, char *device_path) 1724/*
1725 * Verify that @num_devices satisfies the RAID profile constraints in the whole
1726 * filesystem. It's up to the caller to adjust that number regarding eg. device
1727 * replace.
1728 */
1729static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1730 u64 num_devices)
1731{
1732 u64 all_avail;
1733 unsigned seq;
1734 int i;
1735
1736 do {
1737 seq = read_seqbegin(&fs_info->profiles_lock);
1738
1739 all_avail = fs_info->avail_data_alloc_bits |
1740 fs_info->avail_system_alloc_bits |
1741 fs_info->avail_metadata_alloc_bits;
1742 } while (read_seqretry(&fs_info->profiles_lock, seq));
1743
1744 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1745 if (!(all_avail & btrfs_raid_group[i]))
1746 continue;
1747
1748 if (num_devices < btrfs_raid_array[i].devs_min) {
1749 int ret = btrfs_raid_mindev_error[i];
1750
1751 if (ret)
1752 return ret;
1753 }
1754 }
1755
1756 return 0;
1757}
1758
1759struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs,
1760 struct btrfs_device *device)
1692{ 1761{
1693 struct btrfs_device *device;
1694 struct btrfs_device *next_device; 1762 struct btrfs_device *next_device;
1695 struct block_device *bdev; 1763
1696 struct buffer_head *bh = NULL; 1764 list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1697 struct btrfs_super_block *disk_super; 1765 if (next_device != device &&
1766 !next_device->missing && next_device->bdev)
1767 return next_device;
1768 }
1769
1770 return NULL;
1771}
1772
1773/*
1774 * Helper function to check if the given device is part of s_bdev / latest_bdev
1775 * and replace it with the provided or the next active device, in the context
1776 * where this function called, there should be always be another device (or
1777 * this_dev) which is active.
1778 */
1779void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1780 struct btrfs_device *device, struct btrfs_device *this_dev)
1781{
1782 struct btrfs_device *next_device;
1783
1784 if (this_dev)
1785 next_device = this_dev;
1786 else
1787 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1788 device);
1789 ASSERT(next_device);
1790
1791 if (fs_info->sb->s_bdev &&
1792 (fs_info->sb->s_bdev == device->bdev))
1793 fs_info->sb->s_bdev = next_device->bdev;
1794
1795 if (fs_info->fs_devices->latest_bdev == device->bdev)
1796 fs_info->fs_devices->latest_bdev = next_device->bdev;
1797}
1798
1799int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid)
1800{
1801 struct btrfs_device *device;
1698 struct btrfs_fs_devices *cur_devices; 1802 struct btrfs_fs_devices *cur_devices;
1699 u64 all_avail;
1700 u64 devid;
1701 u64 num_devices; 1803 u64 num_devices;
1702 u8 *dev_uuid;
1703 unsigned seq;
1704 int ret = 0; 1804 int ret = 0;
1705 bool clear_super = false; 1805 bool clear_super = false;
1806 char *dev_name = NULL;
1706 1807
1707 mutex_lock(&uuid_mutex); 1808 mutex_lock(&uuid_mutex);
1708 1809
1709 do {
1710 seq = read_seqbegin(&root->fs_info->profiles_lock);
1711
1712 all_avail = root->fs_info->avail_data_alloc_bits |
1713 root->fs_info->avail_system_alloc_bits |
1714 root->fs_info->avail_metadata_alloc_bits;
1715 } while (read_seqretry(&root->fs_info->profiles_lock, seq));
1716
1717 num_devices = root->fs_info->fs_devices->num_devices; 1810 num_devices = root->fs_info->fs_devices->num_devices;
1718 btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0); 1811 btrfs_dev_replace_lock(&root->fs_info->dev_replace, 0);
1719 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) { 1812 if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
@@ -1722,78 +1815,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1722 } 1815 }
1723 btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0); 1816 btrfs_dev_replace_unlock(&root->fs_info->dev_replace, 0);
1724 1817
1725 if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) { 1818 ret = btrfs_check_raid_min_devices(root->fs_info, num_devices - 1);
1726 ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET; 1819 if (ret)
1727 goto out;
1728 }
1729
1730 if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1731 ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1732 goto out; 1820 goto out;
1733 }
1734 1821
1735 if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && 1822 ret = btrfs_find_device_by_devspec(root, devid, device_path,
1736 root->fs_info->fs_devices->rw_devices <= 2) { 1823 &device);
1737 ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET; 1824 if (ret)
1738 goto out;
1739 }
1740 if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1741 root->fs_info->fs_devices->rw_devices <= 3) {
1742 ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1743 goto out; 1825 goto out;
1744 }
1745
1746 if (strcmp(device_path, "missing") == 0) {
1747 struct list_head *devices;
1748 struct btrfs_device *tmp;
1749
1750 device = NULL;
1751 devices = &root->fs_info->fs_devices->devices;
1752 /*
1753 * It is safe to read the devices since the volume_mutex
1754 * is held.
1755 */
1756 list_for_each_entry(tmp, devices, dev_list) {
1757 if (tmp->in_fs_metadata &&
1758 !tmp->is_tgtdev_for_dev_replace &&
1759 !tmp->bdev) {
1760 device = tmp;
1761 break;
1762 }
1763 }
1764 bdev = NULL;
1765 bh = NULL;
1766 disk_super = NULL;
1767 if (!device) {
1768 ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1769 goto out;
1770 }
1771 } else {
1772 ret = btrfs_get_bdev_and_sb(device_path,
1773 FMODE_WRITE | FMODE_EXCL,
1774 root->fs_info->bdev_holder, 0,
1775 &bdev, &bh);
1776 if (ret)
1777 goto out;
1778 disk_super = (struct btrfs_super_block *)bh->b_data;
1779 devid = btrfs_stack_device_id(&disk_super->dev_item);
1780 dev_uuid = disk_super->dev_item.uuid;
1781 device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1782 disk_super->fsid);
1783 if (!device) {
1784 ret = -ENOENT;
1785 goto error_brelse;
1786 }
1787 }
1788 1826
1789 if (device->is_tgtdev_for_dev_replace) { 1827 if (device->is_tgtdev_for_dev_replace) {
1790 ret = BTRFS_ERROR_DEV_TGT_REPLACE; 1828 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1791 goto error_brelse; 1829 goto out;
1792 } 1830 }
1793 1831
1794 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { 1832 if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1795 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE; 1833 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1796 goto error_brelse; 1834 goto out;
1797 } 1835 }
1798 1836
1799 if (device->writeable) { 1837 if (device->writeable) {
@@ -1801,6 +1839,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1801 list_del_init(&device->dev_alloc_list); 1839 list_del_init(&device->dev_alloc_list);
1802 device->fs_devices->rw_devices--; 1840 device->fs_devices->rw_devices--;
1803 unlock_chunks(root); 1841 unlock_chunks(root);
1842 dev_name = kstrdup(device->name->str, GFP_KERNEL);
1843 if (!dev_name) {
1844 ret = -ENOMEM;
1845 goto error_undo;
1846 }
1804 clear_super = true; 1847 clear_super = true;
1805 } 1848 }
1806 1849
@@ -1842,12 +1885,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1842 if (device->missing) 1885 if (device->missing)
1843 device->fs_devices->missing_devices--; 1886 device->fs_devices->missing_devices--;
1844 1887
1845 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1888 btrfs_assign_next_active_device(root->fs_info, device, NULL);
1846 struct btrfs_device, dev_list);
1847 if (device->bdev == root->fs_info->sb->s_bdev)
1848 root->fs_info->sb->s_bdev = next_device->bdev;
1849 if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1850 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1851 1889
1852 if (device->bdev) { 1890 if (device->bdev) {
1853 device->fs_devices->open_devices--; 1891 device->fs_devices->open_devices--;
@@ -1883,63 +1921,23 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1883 * at this point, the device is zero sized. We want to 1921 * at this point, the device is zero sized. We want to
1884 * remove it from the devices list and zero out the old super 1922 * remove it from the devices list and zero out the old super
1885 */ 1923 */
1886 if (clear_super && disk_super) { 1924 if (clear_super) {
1887 u64 bytenr; 1925 struct block_device *bdev;
1888 int i; 1926
1889 1927 bdev = blkdev_get_by_path(dev_name, FMODE_READ | FMODE_EXCL,
1890 /* make sure this device isn't detected as part of 1928 root->fs_info->bdev_holder);
1891 * the FS anymore 1929 if (!IS_ERR(bdev)) {
1892 */ 1930 btrfs_scratch_superblocks(bdev, dev_name);
1893 memset(&disk_super->magic, 0, sizeof(disk_super->magic)); 1931 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1894 set_buffer_dirty(bh);
1895 sync_dirty_buffer(bh);
1896
1897 /* clear the mirror copies of super block on the disk
1898 * being removed, 0th copy is been taken care above and
1899 * the below would take of the rest
1900 */
1901 for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1902 bytenr = btrfs_sb_offset(i);
1903 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1904 i_size_read(bdev->bd_inode))
1905 break;
1906
1907 brelse(bh);
1908 bh = __bread(bdev, bytenr / 4096,
1909 BTRFS_SUPER_INFO_SIZE);
1910 if (!bh)
1911 continue;
1912
1913 disk_super = (struct btrfs_super_block *)bh->b_data;
1914
1915 if (btrfs_super_bytenr(disk_super) != bytenr ||
1916 btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1917 continue;
1918 }
1919 memset(&disk_super->magic, 0,
1920 sizeof(disk_super->magic));
1921 set_buffer_dirty(bh);
1922 sync_dirty_buffer(bh);
1923 } 1932 }
1924 } 1933 }
1925 1934
1926 ret = 0;
1927
1928 if (bdev) {
1929 /* Notify udev that device has changed */
1930 btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1931
1932 /* Update ctime/mtime for device path for libblkid */
1933 update_dev_time(device_path);
1934 }
1935
1936error_brelse:
1937 brelse(bh);
1938 if (bdev)
1939 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1940out: 1935out:
1936 kfree(dev_name);
1937
1941 mutex_unlock(&uuid_mutex); 1938 mutex_unlock(&uuid_mutex);
1942 return ret; 1939 return ret;
1940
1943error_undo: 1941error_undo:
1944 if (device->writeable) { 1942 if (device->writeable) {
1945 lock_chunks(root); 1943 lock_chunks(root);
@@ -1948,7 +1946,7 @@ error_undo:
1948 device->fs_devices->rw_devices++; 1946 device->fs_devices->rw_devices++;
1949 unlock_chunks(root); 1947 unlock_chunks(root);
1950 } 1948 }
1951 goto error_brelse; 1949 goto out;
1952} 1950}
1953 1951
1954void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info, 1952void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
@@ -1972,11 +1970,8 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1972 if (srcdev->missing) 1970 if (srcdev->missing)
1973 fs_devices->missing_devices--; 1971 fs_devices->missing_devices--;
1974 1972
1975 if (srcdev->writeable) { 1973 if (srcdev->writeable)
1976 fs_devices->rw_devices--; 1974 fs_devices->rw_devices--;
1977 /* zero out the old super if it is writable */
1978 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
1979 }
1980 1975
1981 if (srcdev->bdev) 1976 if (srcdev->bdev)
1982 fs_devices->open_devices--; 1977 fs_devices->open_devices--;
@@ -1987,6 +1982,10 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
1987{ 1982{
1988 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; 1983 struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
1989 1984
1985 if (srcdev->writeable) {
1986 /* zero out the old super if it is writable */
1987 btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
1988 }
1990 call_rcu(&srcdev->rcu, free_device); 1989 call_rcu(&srcdev->rcu, free_device);
1991 1990
1992 /* 1991 /*
@@ -2016,32 +2015,33 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2016void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, 2015void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2017 struct btrfs_device *tgtdev) 2016 struct btrfs_device *tgtdev)
2018{ 2017{
2019 struct btrfs_device *next_device;
2020
2021 mutex_lock(&uuid_mutex); 2018 mutex_lock(&uuid_mutex);
2022 WARN_ON(!tgtdev); 2019 WARN_ON(!tgtdev);
2023 mutex_lock(&fs_info->fs_devices->device_list_mutex); 2020 mutex_lock(&fs_info->fs_devices->device_list_mutex);
2024 2021
2025 btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev); 2022 btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
2026 2023
2027 if (tgtdev->bdev) { 2024 if (tgtdev->bdev)
2028 btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2029 fs_info->fs_devices->open_devices--; 2025 fs_info->fs_devices->open_devices--;
2030 } 2026
2031 fs_info->fs_devices->num_devices--; 2027 fs_info->fs_devices->num_devices--;
2032 2028
2033 next_device = list_entry(fs_info->fs_devices->devices.next, 2029 btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2034 struct btrfs_device, dev_list);
2035 if (tgtdev->bdev == fs_info->sb->s_bdev)
2036 fs_info->sb->s_bdev = next_device->bdev;
2037 if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
2038 fs_info->fs_devices->latest_bdev = next_device->bdev;
2039 list_del_rcu(&tgtdev->dev_list);
2040 2030
2041 call_rcu(&tgtdev->rcu, free_device); 2031 list_del_rcu(&tgtdev->dev_list);
2042 2032
2043 mutex_unlock(&fs_info->fs_devices->device_list_mutex); 2033 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2044 mutex_unlock(&uuid_mutex); 2034 mutex_unlock(&uuid_mutex);
2035
2036 /*
2037 * The update_dev_time() with in btrfs_scratch_superblocks()
2038 * may lead to a call to btrfs_show_devname() which will try
2039 * to hold device_list_mutex. And here this device
2040 * is already out of device list, so we don't have to hold
2041 * the device_list_mutex lock.
2042 */
2043 btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2044 call_rcu(&tgtdev->rcu, free_device);
2045} 2045}
2046 2046
2047static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, 2047static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
@@ -2102,6 +2102,31 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
2102} 2102}
2103 2103
2104/* 2104/*
2105 * Lookup a device given by device id, or the path if the id is 0.
2106 */
2107int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
2108 char *devpath,
2109 struct btrfs_device **device)
2110{
2111 int ret;
2112
2113 if (devid) {
2114 ret = 0;
2115 *device = btrfs_find_device(root->fs_info, devid, NULL,
2116 NULL);
2117 if (!*device)
2118 ret = -ENOENT;
2119 } else {
2120 if (!devpath || !devpath[0])
2121 return -EINVAL;
2122
2123 ret = btrfs_find_device_missing_or_by_path(root, devpath,
2124 device);
2125 }
2126 return ret;
2127}
2128
2129/*
2105 * does all the dirty work required for changing file system's UUID. 2130 * does all the dirty work required for changing file system's UUID.
2106 */ 2131 */
2107static int btrfs_prepare_sprout(struct btrfs_root *root) 2132static int btrfs_prepare_sprout(struct btrfs_root *root)
@@ -2418,7 +2443,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2418 2443
2419 ret = btrfs_relocate_sys_chunks(root); 2444 ret = btrfs_relocate_sys_chunks(root);
2420 if (ret < 0) 2445 if (ret < 0)
2421 btrfs_std_error(root->fs_info, ret, 2446 btrfs_handle_fs_error(root->fs_info, ret,
2422 "Failed to relocate sys chunks after " 2447 "Failed to relocate sys chunks after "
2423 "device initialization. This can be fixed " 2448 "device initialization. This can be fixed "
2424 "using the \"btrfs balance\" command."); 2449 "using the \"btrfs balance\" command.");
@@ -2663,7 +2688,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2663 if (ret < 0) 2688 if (ret < 0)
2664 goto out; 2689 goto out;
2665 else if (ret > 0) { /* Logic error or corruption */ 2690 else if (ret > 0) { /* Logic error or corruption */
2666 btrfs_std_error(root->fs_info, -ENOENT, 2691 btrfs_handle_fs_error(root->fs_info, -ENOENT,
2667 "Failed lookup while freeing chunk."); 2692 "Failed lookup while freeing chunk.");
2668 ret = -ENOENT; 2693 ret = -ENOENT;
2669 goto out; 2694 goto out;
@@ -2671,7 +2696,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2671 2696
2672 ret = btrfs_del_item(trans, root, path); 2697 ret = btrfs_del_item(trans, root, path);
2673 if (ret < 0) 2698 if (ret < 0)
2674 btrfs_std_error(root->fs_info, ret, 2699 btrfs_handle_fs_error(root->fs_info, ret,
2675 "Failed to delete chunk item."); 2700 "Failed to delete chunk item.");
2676out: 2701out:
2677 btrfs_free_path(path); 2702 btrfs_free_path(path);
@@ -2857,7 +2882,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, u64 chunk_offset)
2857 chunk_offset); 2882 chunk_offset);
2858 if (IS_ERR(trans)) { 2883 if (IS_ERR(trans)) {
2859 ret = PTR_ERR(trans); 2884 ret = PTR_ERR(trans);
2860 btrfs_std_error(root->fs_info, ret, NULL); 2885 btrfs_handle_fs_error(root->fs_info, ret, NULL);
2861 return ret; 2886 return ret;
2862 } 2887 }
2863 2888
@@ -3402,6 +3427,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3402 u32 count_meta = 0; 3427 u32 count_meta = 0;
3403 u32 count_sys = 0; 3428 u32 count_sys = 0;
3404 int chunk_reserved = 0; 3429 int chunk_reserved = 0;
3430 u64 bytes_used = 0;
3405 3431
3406 /* step one make some room on all the devices */ 3432 /* step one make some room on all the devices */
3407 devices = &fs_info->fs_devices->devices; 3433 devices = &fs_info->fs_devices->devices;
@@ -3540,7 +3566,13 @@ again:
3540 goto loop; 3566 goto loop;
3541 } 3567 }
3542 3568
3543 if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) && !chunk_reserved) { 3569 ASSERT(fs_info->data_sinfo);
3570 spin_lock(&fs_info->data_sinfo->lock);
3571 bytes_used = fs_info->data_sinfo->bytes_used;
3572 spin_unlock(&fs_info->data_sinfo->lock);
3573
3574 if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3575 !chunk_reserved && !bytes_used) {
3544 trans = btrfs_start_transaction(chunk_root, 0); 3576 trans = btrfs_start_transaction(chunk_root, 0);
3545 if (IS_ERR(trans)) { 3577 if (IS_ERR(trans)) {
3546 mutex_unlock(&fs_info->delete_unused_bgs_mutex); 3578 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
@@ -3632,7 +3664,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
3632 unset_balance_control(fs_info); 3664 unset_balance_control(fs_info);
3633 ret = del_balance_item(fs_info->tree_root); 3665 ret = del_balance_item(fs_info->tree_root);
3634 if (ret) 3666 if (ret)
3635 btrfs_std_error(fs_info, ret, NULL); 3667 btrfs_handle_fs_error(fs_info, ret, NULL);
3636 3668
3637 atomic_set(&fs_info->mutually_exclusive_operation_running, 0); 3669 atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3638} 3670}
@@ -3693,10 +3725,8 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
3693 num_devices--; 3725 num_devices--;
3694 } 3726 }
3695 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); 3727 btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3696 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE; 3728 allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
3697 if (num_devices == 1) 3729 if (num_devices > 1)
3698 allowed |= BTRFS_BLOCK_GROUP_DUP;
3699 else if (num_devices > 1)
3700 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); 3730 allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3701 if (num_devices > 2) 3731 if (num_devices > 2)
3702 allowed |= BTRFS_BLOCK_GROUP_RAID5; 3732 allowed |= BTRFS_BLOCK_GROUP_RAID5;
@@ -5278,7 +5308,15 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5278 stripe_nr = div64_u64(stripe_nr, stripe_len); 5308 stripe_nr = div64_u64(stripe_nr, stripe_len);
5279 5309
5280 stripe_offset = stripe_nr * stripe_len; 5310 stripe_offset = stripe_nr * stripe_len;
5281 BUG_ON(offset < stripe_offset); 5311 if (offset < stripe_offset) {
5312 btrfs_crit(fs_info, "stripe math has gone wrong, "
5313 "stripe_offset=%llu, offset=%llu, start=%llu, "
5314 "logical=%llu, stripe_len=%llu",
5315 stripe_offset, offset, em->start, logical,
5316 stripe_len);
5317 free_extent_map(em);
5318 return -EINVAL;
5319 }
5282 5320
5283 /* stripe_offset is the offset of this block in its stripe*/ 5321 /* stripe_offset is the offset of this block in its stripe*/
5284 stripe_offset = offset - stripe_offset; 5322 stripe_offset = offset - stripe_offset;
@@ -5519,7 +5557,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5519 &stripe_index); 5557 &stripe_index);
5520 mirror_num = stripe_index + 1; 5558 mirror_num = stripe_index + 1;
5521 } 5559 }
5522 BUG_ON(stripe_index >= map->num_stripes); 5560 if (stripe_index >= map->num_stripes) {
5561 btrfs_crit(fs_info, "stripe index math went horribly wrong, "
5562 "got stripe_index=%u, num_stripes=%u",
5563 stripe_index, map->num_stripes);
5564 ret = -EINVAL;
5565 goto out;
5566 }
5523 5567
5524 num_alloc_stripes = num_stripes; 5568 num_alloc_stripes = num_stripes;
5525 if (dev_replace_is_ongoing) { 5569 if (dev_replace_is_ongoing) {
@@ -6242,7 +6286,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
6242 "invalid chunk length %llu", length); 6286 "invalid chunk length %llu", length);
6243 return -EIO; 6287 return -EIO;
6244 } 6288 }
6245 if (!is_power_of_2(stripe_len)) { 6289 if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
6246 btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", 6290 btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
6247 stripe_len); 6291 stripe_len);
6248 return -EIO; 6292 return -EIO;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 1939ebde63df..0ac90f8d85bd 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -340,14 +340,14 @@ struct btrfs_raid_attr {
340}; 340};
341 341
342extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; 342extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
343 343extern const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES];
344extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES]; 344extern const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES];
345 345
346struct map_lookup { 346struct map_lookup {
347 u64 type; 347 u64 type;
348 int io_align; 348 int io_align;
349 int io_width; 349 int io_width;
350 int stripe_len; 350 u64 stripe_len;
351 int sector_size; 351 int sector_size;
352 int num_stripes; 352 int num_stripes;
353 int sub_stripes; 353 int sub_stripes;
@@ -357,52 +357,6 @@ struct map_lookup {
357#define map_lookup_size(n) (sizeof(struct map_lookup) + \ 357#define map_lookup_size(n) (sizeof(struct map_lookup) + \
358 (sizeof(struct btrfs_bio_stripe) * (n))) 358 (sizeof(struct btrfs_bio_stripe) * (n)))
359 359
360/*
361 * Restriper's general type filter
362 */
363#define BTRFS_BALANCE_DATA (1ULL << 0)
364#define BTRFS_BALANCE_SYSTEM (1ULL << 1)
365#define BTRFS_BALANCE_METADATA (1ULL << 2)
366
367#define BTRFS_BALANCE_TYPE_MASK (BTRFS_BALANCE_DATA | \
368 BTRFS_BALANCE_SYSTEM | \
369 BTRFS_BALANCE_METADATA)
370
371#define BTRFS_BALANCE_FORCE (1ULL << 3)
372#define BTRFS_BALANCE_RESUME (1ULL << 4)
373
374/*
375 * Balance filters
376 */
377#define BTRFS_BALANCE_ARGS_PROFILES (1ULL << 0)
378#define BTRFS_BALANCE_ARGS_USAGE (1ULL << 1)
379#define BTRFS_BALANCE_ARGS_DEVID (1ULL << 2)
380#define BTRFS_BALANCE_ARGS_DRANGE (1ULL << 3)
381#define BTRFS_BALANCE_ARGS_VRANGE (1ULL << 4)
382#define BTRFS_BALANCE_ARGS_LIMIT (1ULL << 5)
383#define BTRFS_BALANCE_ARGS_LIMIT_RANGE (1ULL << 6)
384#define BTRFS_BALANCE_ARGS_STRIPES_RANGE (1ULL << 7)
385#define BTRFS_BALANCE_ARGS_USAGE_RANGE (1ULL << 10)
386
387#define BTRFS_BALANCE_ARGS_MASK \
388 (BTRFS_BALANCE_ARGS_PROFILES | \
389 BTRFS_BALANCE_ARGS_USAGE | \
390 BTRFS_BALANCE_ARGS_DEVID | \
391 BTRFS_BALANCE_ARGS_DRANGE | \
392 BTRFS_BALANCE_ARGS_VRANGE | \
393 BTRFS_BALANCE_ARGS_LIMIT | \
394 BTRFS_BALANCE_ARGS_LIMIT_RANGE | \
395 BTRFS_BALANCE_ARGS_STRIPES_RANGE | \
396 BTRFS_BALANCE_ARGS_USAGE_RANGE)
397
398/*
399 * Profile changing flags. When SOFT is set we won't relocate chunk if
400 * it already has the target profile (even though it may be
401 * half-filled).
402 */
403#define BTRFS_BALANCE_ARGS_CONVERT (1ULL << 8)
404#define BTRFS_BALANCE_ARGS_SOFT (1ULL << 9)
405
406struct btrfs_balance_args; 360struct btrfs_balance_args;
407struct btrfs_balance_progress; 361struct btrfs_balance_progress;
408struct btrfs_balance_control { 362struct btrfs_balance_control {
@@ -445,13 +399,18 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
445 struct btrfs_fs_devices **fs_devices_ret); 399 struct btrfs_fs_devices **fs_devices_ret);
446int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); 400int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
447void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step); 401void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step);
402void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
403 struct btrfs_device *device, struct btrfs_device *this_dev);
448int btrfs_find_device_missing_or_by_path(struct btrfs_root *root, 404int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
449 char *device_path, 405 char *device_path,
450 struct btrfs_device **device); 406 struct btrfs_device **device);
407int btrfs_find_device_by_devspec(struct btrfs_root *root, u64 devid,
408 char *devpath,
409 struct btrfs_device **device);
451struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, 410struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
452 const u64 *devid, 411 const u64 *devid,
453 const u8 *uuid); 412 const u8 *uuid);
454int btrfs_rm_device(struct btrfs_root *root, char *device_path); 413int btrfs_rm_device(struct btrfs_root *root, char *device_path, u64 devid);
455void btrfs_cleanup_fs_uuids(void); 414void btrfs_cleanup_fs_uuids(void);
456int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len); 415int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
457int btrfs_grow_device(struct btrfs_trans_handle *trans, 416int btrfs_grow_device(struct btrfs_trans_handle *trans,