Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Makefile              4
-rw-r--r--  fs/btrfs/acl.c                87
-rw-r--r--  fs/btrfs/btrfs_inode.h        22
-rw-r--r--  fs/btrfs/compression.c        14
-rw-r--r--  fs/btrfs/ctree.c             457
-rw-r--r--  fs/btrfs/ctree.h              63
-rw-r--r--  fs/btrfs/delayed-inode.c       2
-rw-r--r--  fs/btrfs/delayed-inode.h       2
-rw-r--r--  fs/btrfs/dir-item.c           39
-rw-r--r--  fs/btrfs/disk-io.c           131
-rw-r--r--  fs/btrfs/disk-io.h            10
-rw-r--r--  fs/btrfs/extent-tree.c       401
-rw-r--r--  fs/btrfs/extent_io.c         309
-rw-r--r--  fs/btrfs/extent_io.h          55
-rw-r--r--  fs/btrfs/extent_map.c        155
-rw-r--r--  fs/btrfs/file-item.c          50
-rw-r--r--  fs/btrfs/file.c              262
-rw-r--r--  fs/btrfs/free-space-cache.c  193
-rw-r--r--  fs/btrfs/inode.c             296
-rw-r--r--  fs/btrfs/ioctl.c              74
-rw-r--r--  fs/btrfs/locking.c           280
-rw-r--r--  fs/btrfs/locking.h            36
-rw-r--r--  fs/btrfs/ref-cache.c          68
-rw-r--r--  fs/btrfs/ref-cache.h          52
-rw-r--r--  fs/btrfs/relocation.c          3
-rw-r--r--  fs/btrfs/root-tree.c           5
-rw-r--r--  fs/btrfs/struct-funcs.c      100
-rw-r--r--  fs/btrfs/transaction.c       116
-rw-r--r--  fs/btrfs/tree-log.c           46
-rw-r--r--  fs/btrfs/volumes.c            65
-rw-r--r--  fs/btrfs/volumes.h             2
-rw-r--r--  fs/btrfs/xattr.c              73
32 files changed, 1765 insertions, 1707 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9b72dcf1cd2..40e6ac08c21 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
+	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+
+btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index f66fc995973..eb159aaa5a1 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -28,9 +28,7 @@
 #include "btrfs_inode.h"
 #include "xattr.h"
 
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-
-static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
+struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 {
 	int size;
 	const char *name;
@@ -111,7 +109,6 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 	int ret, size = 0;
 	const char *name;
 	char *value = NULL;
-	mode_t mode;
 
 	if (acl) {
 		ret = posix_acl_valid(acl);
@@ -122,13 +119,11 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		mode = inode->i_mode;
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			ret = posix_acl_equiv_mode(acl, &mode);
+			ret = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (ret < 0)
 				return ret;
-			inode->i_mode = mode;
 		}
 		ret = 0;
 		break;
@@ -195,28 +190,6 @@ out:
 	return ret;
 }
 
-int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
-{
-	int error = -EAGAIN;
-
-	if (flags & IPERM_FLAG_RCU) {
-		if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
-			error = -ECHILD;
-
-	} else {
-		struct posix_acl *acl;
-		acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
-		if (IS_ERR(acl))
-			return PTR_ERR(acl);
-		if (acl) {
-			error = posix_acl_permission(inode, acl, mask);
-			posix_acl_release(acl);
-		}
-	}
-
-	return error;
-}
-
 /*
  * btrfs_init_acl is already generally called under fs_mutex, so the locking
  * stuff has been fixed to work with that.  If the locking stuff changes, we
@@ -244,31 +217,20 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
 	}
 
 	if (IS_POSIXACL(dir) && acl) {
-		struct posix_acl *clone;
-		mode_t mode;
-
 		if (S_ISDIR(inode->i_mode)) {
 			ret = btrfs_set_acl(trans, inode, acl,
 					    ACL_TYPE_DEFAULT);
 			if (ret)
 				goto failed;
 		}
-		clone = posix_acl_clone(acl, GFP_NOFS);
-		ret = -ENOMEM;
-		if (!clone)
-			goto failed;
-
-		mode = inode->i_mode;
-		ret = posix_acl_create_masq(clone, &mode);
-		if (ret >= 0) {
-			inode->i_mode = mode;
-			if (ret > 0) {
-				/* we need an acl */
-				ret = btrfs_set_acl(trans, inode, clone,
-						    ACL_TYPE_ACCESS);
-			}
+		ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
+		if (ret < 0)
+			return ret;
+
+		if (ret > 0) {
+			/* we need an acl */
+			ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
 		}
-		posix_acl_release(clone);
 	}
 failed:
 	posix_acl_release(acl);
@@ -278,7 +240,7 @@ failed:
 
 int btrfs_acl_chmod(struct inode *inode)
 {
-	struct posix_acl *acl, *clone;
+	struct posix_acl *acl;
 	int ret = 0;
 
 	if (S_ISLNK(inode->i_mode))
@@ -291,17 +253,11 @@ int btrfs_acl_chmod(struct inode *inode)
 	if (IS_ERR_OR_NULL(acl))
 		return PTR_ERR(acl);
 
-	clone = posix_acl_clone(acl, GFP_KERNEL);
+	ret = posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
+	if (ret)
+		return ret;
+	ret = btrfs_set_acl(NULL, inode, acl, ACL_TYPE_ACCESS);
 	posix_acl_release(acl);
-	if (!clone)
-		return -ENOMEM;
-
-	ret = posix_acl_chmod_masq(clone, inode->i_mode);
-	if (!ret)
-		ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS);
-
-	posix_acl_release(clone);
-
 	return ret;
 }
 
@@ -318,18 +274,3 @@ const struct xattr_handler btrfs_xattr_acl_access_handler = {
 	.get = btrfs_xattr_acl_get,
 	.set = btrfs_xattr_acl_set,
 };
-
-#else /* CONFIG_BTRFS_FS_POSIX_ACL */
-
-int btrfs_acl_chmod(struct inode *inode)
-{
-	return 0;
-}
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
-		   struct inode *inode, struct inode *dir)
-{
-	return 0;
-}
-
-#endif /* CONFIG_BTRFS_FS_POSIX_ACL */
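
Review note (not part of the patch): with the #ifdef removed from acl.c and
acl.o made conditional in the Makefile, the CONFIG_BTRFS_FS_POSIX_ACL=n stubs
move out of this file and the !ACL case is handled at build time. The init
path now leans on the generic posix_acl_create() helper, which folds the old
posix_acl_clone()/posix_acl_create_masq() dance into one call. A minimal
sketch of that calling convention, as used in the hunks above:

	/*
	 * posix_acl_create(&acl, gfp, &mode) masks the mode against the
	 * ACL and returns <0 on error, 0 when the mode bits fully encode
	 * the ACL (nothing left to store), and >0 when an access ACL
	 * still has to be written out by the caller.
	 */
	ret = posix_acl_create(&acl, GFP_NOFS, &inode->i_mode);
	if (ret < 0)
		return ret;
	if (ret > 0)
		ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
	posix_acl_release(acl);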
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 52d7eca8c7b..d9f99a16edd 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -34,6 +34,9 @@ struct btrfs_inode {
 	 */
 	struct btrfs_key location;
 
+	/* Lock for counters */
+	spinlock_t lock;
+
 	/* the extent_tree has caches of all the extent mappings to disk */
 	struct extent_map_tree extent_tree;
 
@@ -134,8 +137,8 @@ struct btrfs_inode {
 	 * items we think we'll end up using, and reserved_extents is the number
 	 * of extent items we've reserved metadata for.
 	 */
-	atomic_t outstanding_extents;
-	atomic_t reserved_extents;
+	unsigned outstanding_extents;
+	unsigned reserved_extents;
 
 	/*
 	 * ordered_data_close is set by truncate when a file that used
@@ -173,7 +176,11 @@ static inline u64 btrfs_ino(struct inode *inode)
 {
 	u64 ino = BTRFS_I(inode)->location.objectid;
 
-	if (ino <= BTRFS_FIRST_FREE_OBJECTID)
+	/*
+	 * !ino: btree_inode
+	 * type == BTRFS_ROOT_ITEM_KEY: subvol dir
+	 */
+	if (!ino || BTRFS_I(inode)->location.type == BTRFS_ROOT_ITEM_KEY)
 		ino = inode->i_ino;
 	return ino;
 }
@@ -184,4 +191,13 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 	BTRFS_I(inode)->disk_i_size = size;
 }
 
+static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
+					     struct inode *inode)
+{
+	if (root == root->fs_info->tree_root ||
+	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+		return true;
+	return false;
+}
+
 #endif
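
Review note (not part of the patch): btrfs_is_free_space_inode() gives
callers a single predicate for the inodes that back the free space caches
(anything owned by the tree_root, plus each root's FREE_INO cache inode).
A hypothetical call site, for illustration only:

	/* hypothetical caller: cache inodes skip normal delalloc accounting */
	static int example_needs_delalloc_accounting(struct btrfs_root *root,
						     struct inode *inode)
	{
		if (btrfs_is_free_space_inode(root, inode))
			return 0;	/* handled by the cache writeout path */
		return 1;
	}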
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index bfe42b03eaf..8ec5d86f173 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -338,6 +338,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	u64 first_byte = disk_start;
 	struct block_device *bdev;
 	int ret;
+	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
@@ -392,8 +393,11 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 			BUG_ON(ret);
 
-			ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
-			BUG_ON(ret);
+			if (!skip_sum) {
+				ret = btrfs_csum_one_bio(root, inode, bio,
+							 start, 1);
+				BUG_ON(ret);
+			}
 
 			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
 			BUG_ON(ret);
@@ -418,8 +422,10 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
 
-	ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
-	BUG_ON(ret);
+	if (!skip_sum) {
+		ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
+		BUG_ON(ret);
+	}
 
 	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
 	BUG_ON(ret);
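
Review note (not part of the patch): both submission paths in
btrfs_submit_compressed_write() now honor NODATASUM; the flag is sampled
once into skip_sum so every bio of the compressed write is treated the
same way. The guard pattern, condensed from the two hunks above:

	int skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	/* before each btrfs_map_bio() submission: */
	if (!skip_sum) {
		ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
		BUG_ON(ret);
	}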
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2e667868e0d..011cab3aca8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -54,8 +54,13 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
 {
 	int i;
 	for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
-		if (p->nodes[i] && p->locks[i])
-			btrfs_set_lock_blocking(p->nodes[i]);
+		if (!p->nodes[i] || !p->locks[i])
+			continue;
+		btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
+		if (p->locks[i] == BTRFS_READ_LOCK)
+			p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
+		else if (p->locks[i] == BTRFS_WRITE_LOCK)
+			p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
 	}
 }
 
@@ -68,7 +73,7 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
  * for held
  */
 noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
-					struct extent_buffer *held)
+					struct extent_buffer *held, int held_rw)
 {
 	int i;
 
@@ -79,19 +84,29 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 	 * really sure by forcing the path to blocking before we clear
 	 * the path blocking.
 	 */
-	if (held)
-		btrfs_set_lock_blocking(held);
+	if (held) {
+		btrfs_set_lock_blocking_rw(held, held_rw);
+		if (held_rw == BTRFS_WRITE_LOCK)
+			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
+		else if (held_rw == BTRFS_READ_LOCK)
+			held_rw = BTRFS_READ_LOCK_BLOCKING;
+	}
 	btrfs_set_path_blocking(p);
 #endif
 
 	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
-		if (p->nodes[i] && p->locks[i])
-			btrfs_clear_lock_blocking(p->nodes[i]);
+		if (p->nodes[i] && p->locks[i]) {
+			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
+			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
+				p->locks[i] = BTRFS_WRITE_LOCK;
+			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
+				p->locks[i] = BTRFS_READ_LOCK;
+		}
 	}
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	if (held)
-		btrfs_clear_lock_blocking(held);
+		btrfs_clear_lock_blocking_rw(held, held_rw);
 #endif
 }
 
@@ -119,7 +134,7 @@ noinline void btrfs_release_path(struct btrfs_path *p)
 		if (!p->nodes[i])
 			continue;
 		if (p->locks[i]) {
-			btrfs_tree_unlock(p->nodes[i]);
+			btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
 			p->locks[i] = 0;
 		}
 		free_extent_buffer(p->nodes[i]);
@@ -167,6 +182,25 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
 	return eb;
 }
 
+/* loop around taking references on and locking the root node of the
+ * tree until you end up with a lock on the root.  A locked buffer
+ * is returned, with a reference held.
+ */
+struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+{
+	struct extent_buffer *eb;
+
+	while (1) {
+		eb = btrfs_root_node(root);
+		btrfs_tree_read_lock(eb);
+		if (eb == root->node)
+			break;
+		btrfs_tree_read_unlock(eb);
+		free_extent_buffer(eb);
+	}
+	return eb;
+}
+
 /* cowonly root (everything not a reference counted cow subvolume), just get
  * put onto a simple dirty list.  transaction.c walks this to make sure they
  * get properly updated on disk.
@@ -626,14 +660,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	for (i = start_slot; i < end_slot; i++) {
 		int close = 1;
 
-		if (!parent->map_token) {
-			map_extent_buffer(parent,
-					btrfs_node_key_ptr_offset(i),
-					sizeof(struct btrfs_key_ptr),
-					&parent->map_token, &parent->kaddr,
-					&parent->map_start, &parent->map_len,
-					KM_USER1);
-		}
 		btrfs_node_key(parent, &disk_key, i);
 		if (!progress_passed && comp_keys(&disk_key, progress) < 0)
 			continue;
@@ -656,11 +682,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 			last_block = blocknr;
 			continue;
 		}
-		if (parent->map_token) {
-			unmap_extent_buffer(parent, parent->map_token,
-					    KM_USER1);
-			parent->map_token = NULL;
-		}
 
 		cur = btrfs_find_tree_block(root, blocknr, blocksize);
 		if (cur)
@@ -701,11 +722,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		btrfs_tree_unlock(cur);
 		free_extent_buffer(cur);
 	}
-	if (parent->map_token) {
-		unmap_extent_buffer(parent, parent->map_token,
-				    KM_USER1);
-		parent->map_token = NULL;
-	}
 	return err;
 }
 
@@ -746,7 +762,6 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 	struct btrfs_disk_key *tmp = NULL;
 	struct btrfs_disk_key unaligned;
 	unsigned long offset;
-	char *map_token = NULL;
 	char *kaddr = NULL;
 	unsigned long map_start = 0;
 	unsigned long map_len = 0;
@@ -756,18 +771,13 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 		mid = (low + high) / 2;
 		offset = p + mid * item_size;
 
-		if (!map_token || offset < map_start ||
+		if (!kaddr || offset < map_start ||
 		    (offset + sizeof(struct btrfs_disk_key)) >
 		    map_start + map_len) {
-			if (map_token) {
-				unmap_extent_buffer(eb, map_token, KM_USER0);
-				map_token = NULL;
-			}
 
 			err = map_private_extent_buffer(eb, offset,
 						sizeof(struct btrfs_disk_key),
-						&map_token, &kaddr,
-						&map_start, &map_len, KM_USER0);
+						&kaddr, &map_start, &map_len);
 
 			if (!err) {
 				tmp = (struct btrfs_disk_key *)(kaddr + offset -
@@ -790,14 +800,10 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 			high = mid;
 		else {
 			*slot = mid;
-			if (map_token)
-				unmap_extent_buffer(eb, map_token, KM_USER0);
 			return 0;
 		}
 	}
 	*slot = low;
-	if (map_token)
-		unmap_extent_buffer(eb, map_token, KM_USER0);
 	return 1;
 }
 
@@ -890,7 +896,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	mid = path->nodes[level];
 
-	WARN_ON(!path->locks[level]);
+	WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
+		path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
@@ -1228,7 +1235,6 @@ static void reada_for_search(struct btrfs_root *root,
 	u32 nr;
 	u32 blocksize;
 	u32 nscan = 0;
-	bool map = true;
 
 	if (level != 1)
 		return;
@@ -1250,19 +1256,8 @@ static void reada_for_search(struct btrfs_root *root,
 
 	nritems = btrfs_header_nritems(node);
 	nr = slot;
-	if (node->map_token || path->skip_locking)
-		map = false;
 
 	while (1) {
-		if (map && !node->map_token) {
-			unsigned long offset = btrfs_node_key_ptr_offset(nr);
-			map_private_extent_buffer(node, offset,
-						  sizeof(struct btrfs_key_ptr),
-						  &node->map_token,
-						  &node->kaddr,
-						  &node->map_start,
-						  &node->map_len, KM_USER1);
-		}
 		if (direction < 0) {
 			if (nr == 0)
 				break;
@@ -1281,11 +1276,6 @@ static void reada_for_search(struct btrfs_root *root,
 		if ((search <= target && target - search <= 65536) ||
 		    (search > target && search - target <= 65536)) {
 			gen = btrfs_node_ptr_generation(node, nr);
-			if (map && node->map_token) {
-				unmap_extent_buffer(node, node->map_token,
-						    KM_USER1);
-				node->map_token = NULL;
-			}
 			readahead_tree_block(root, search, blocksize, gen);
 			nread += blocksize;
 		}
@@ -1293,10 +1283,6 @@ static void reada_for_search(struct btrfs_root *root,
 		if ((nread > 65536 || nscan > 32))
 			break;
 	}
-	if (map && node->map_token) {
-		unmap_extent_buffer(node, node->map_token, KM_USER1);
-		node->map_token = NULL;
-	}
 }
 
 /*
@@ -1409,7 +1395,7 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
 
 		t = path->nodes[i];
 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
-			btrfs_tree_unlock(t);
+			btrfs_tree_unlock_rw(t, path->locks[i]);
 			path->locks[i] = 0;
 		}
 	}
@@ -1436,7 +1422,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 			continue;
 		if (!path->locks[i])
 			continue;
-		btrfs_tree_unlock(path->nodes[i]);
+		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
 		path->locks[i] = 0;
 	}
 }
@@ -1485,6 +1471,8 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 		 * we can trust our generation number
 		 */
 		free_extent_buffer(tmp);
+		btrfs_set_path_blocking(p);
+
 		tmp = read_tree_block(root, blocknr, blocksize, gen);
 		if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
 			*eb_ret = tmp;
@@ -1540,20 +1528,27 @@
 static int
 setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct btrfs_path *p,
-		       struct extent_buffer *b, int level, int ins_len)
+		       struct extent_buffer *b, int level, int ins_len,
+		       int *write_lock_level)
 {
 	int ret;
 	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
 	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
 		int sret;
 
+		if (*write_lock_level < level + 1) {
+			*write_lock_level = level + 1;
+			btrfs_release_path(p);
+			goto again;
+		}
+
 		sret = reada_for_balance(root, p, level);
 		if (sret)
 			goto again;
 
 		btrfs_set_path_blocking(p);
 		sret = split_node(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL);
+		btrfs_clear_path_blocking(p, NULL, 0);
 
 		BUG_ON(sret > 0);
 		if (sret) {
@@ -1565,13 +1560,19 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
 		int sret;
 
+		if (*write_lock_level < level + 1) {
+			*write_lock_level = level + 1;
+			btrfs_release_path(p);
+			goto again;
+		}
+
 		sret = reada_for_balance(root, p, level);
 		if (sret)
 			goto again;
 
 		btrfs_set_path_blocking(p);
 		sret = balance_level(trans, root, p, level);
-		btrfs_clear_path_blocking(p, NULL);
+		btrfs_clear_path_blocking(p, NULL, 0);
 
 		if (sret) {
 			ret = sret;
@@ -1615,27 +1616,78 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int err;
 	int level;
 	int lowest_unlock = 1;
+	int root_lock;
+	/* everything at write_lock_level or lower must be write locked */
+	int write_lock_level = 0;
 	u8 lowest_level = 0;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
 	WARN_ON(p->nodes[0] != NULL);
 
-	if (ins_len < 0)
+	if (ins_len < 0) {
 		lowest_unlock = 2;
 
+		/* when we are removing items, we might have to go up to level
+		 * two as we update tree pointers  Make sure we keep write
+		 * for those levels as well
+		 */
+		write_lock_level = 2;
+	} else if (ins_len > 0) {
+		/*
+		 * for inserting items, make sure we have a write lock on
+		 * level 1 so we can update keys
+		 */
+		write_lock_level = 1;
+	}
+
+	if (!cow)
+		write_lock_level = -1;
+
+	if (cow && (p->keep_locks || p->lowest_level))
+		write_lock_level = BTRFS_MAX_LEVEL;
+
 again:
+	/*
+	 * we try very hard to do read locks on the root
+	 */
+	root_lock = BTRFS_READ_LOCK;
+	level = 0;
 	if (p->search_commit_root) {
+		/*
+		 * the commit roots are read only
+		 * so we always do read locks
+		 */
 		b = root->commit_root;
 		extent_buffer_get(b);
+		level = btrfs_header_level(b);
 		if (!p->skip_locking)
-			btrfs_tree_lock(b);
+			btrfs_tree_read_lock(b);
 	} else {
-		if (p->skip_locking)
+		if (p->skip_locking) {
 			b = btrfs_root_node(root);
-		else
-			b = btrfs_lock_root_node(root);
+			level = btrfs_header_level(b);
+		} else {
+			/* we don't know the level of the root node
+			 * until we actually have it read locked
+			 */
+			b = btrfs_read_lock_root_node(root);
+			level = btrfs_header_level(b);
+			if (level <= write_lock_level) {
+				/* whoops, must trade for write lock */
+				btrfs_tree_read_unlock(b);
+				free_extent_buffer(b);
+				b = btrfs_lock_root_node(root);
+				root_lock = BTRFS_WRITE_LOCK;

+				/* the level might have changed, check again */
+				level = btrfs_header_level(b);
+			}
+		}
 	}
+	p->nodes[level] = b;
+	if (!p->skip_locking)
+		p->locks[level] = root_lock;
 
 	while (b) {
 		level = btrfs_header_level(b);
@@ -1644,10 +1696,6 @@ again:
 		 * setup the path here so we can release it under lock
 		 * contention with the cow code
 		 */
-		p->nodes[level] = b;
-		if (!p->skip_locking)
-			p->locks[level] = 1;
-
 		if (cow) {
 			/*
 			 * if we don't really need to cow this block
@@ -1659,6 +1707,16 @@ again:
 
 			btrfs_set_path_blocking(p);
 
+			/*
+			 * must have write locks on this node and the
+			 * parent
+			 */
+			if (level + 1 > write_lock_level) {
+				write_lock_level = level + 1;
+				btrfs_release_path(p);
+				goto again;
+			}
+
 			err = btrfs_cow_block(trans, root, b,
 					      p->nodes[level + 1],
 					      p->slots[level + 1], &b);
@@ -1671,10 +1729,7 @@ cow_done:
 		BUG_ON(!cow && ins_len);
 
 		p->nodes[level] = b;
-		if (!p->skip_locking)
-			p->locks[level] = 1;
-
-		btrfs_clear_path_blocking(p, NULL);
+		btrfs_clear_path_blocking(p, NULL, 0);
 
 		/*
 		 * we have a lock on b and as long as we aren't changing
@@ -1700,7 +1755,7 @@ cow_done:
 			}
 			p->slots[level] = slot;
 			err = setup_nodes_for_search(trans, root, p, b, level,
-						     ins_len);
+						     ins_len, &write_lock_level);
 			if (err == -EAGAIN)
 				goto again;
 			if (err) {
@@ -1710,6 +1765,19 @@ cow_done:
 			b = p->nodes[level];
 			slot = p->slots[level];
 
+			/*
+			 * slot 0 is special, if we change the key
+			 * we have to update the parent pointer
+			 * which means we must have a write lock
+			 * on the parent
+			 */
+			if (slot == 0 && cow &&
+			    write_lock_level < level + 1) {
+				write_lock_level = level + 1;
+				btrfs_release_path(p);
+				goto again;
+			}
+
 			unlock_up(p, level, lowest_unlock);
 
 			if (level == lowest_level) {
@@ -1728,23 +1796,42 @@ cow_done:
 			}
 
 			if (!p->skip_locking) {
-				btrfs_clear_path_blocking(p, NULL);
-				err = btrfs_try_spin_lock(b);
-
-				if (!err) {
-					btrfs_set_path_blocking(p);
-					btrfs_tree_lock(b);
-					btrfs_clear_path_blocking(p, b);
+				level = btrfs_header_level(b);
+				if (level <= write_lock_level) {
+					err = btrfs_try_tree_write_lock(b);
+					if (!err) {
+						btrfs_set_path_blocking(p);
+						btrfs_tree_lock(b);
+						btrfs_clear_path_blocking(p, b,
+								  BTRFS_WRITE_LOCK);
+					}
+					p->locks[level] = BTRFS_WRITE_LOCK;
+				} else {
+					err = btrfs_try_tree_read_lock(b);
+					if (!err) {
+						btrfs_set_path_blocking(p);
+						btrfs_tree_read_lock(b);
+						btrfs_clear_path_blocking(p, b,
+								  BTRFS_READ_LOCK);
+					}
+					p->locks[level] = BTRFS_READ_LOCK;
 				}
+				p->nodes[level] = b;
 			}
 		} else {
 			p->slots[level] = slot;
 			if (ins_len > 0 &&
 			    btrfs_leaf_free_space(root, b) < ins_len) {
+				if (write_lock_level < 1) {
+					write_lock_level = 1;
+					btrfs_release_path(p);
+					goto again;
+				}
+
 				btrfs_set_path_blocking(p);
 				err = split_leaf(trans, root, key,
 						 p, ins_len, ret == 0);
-				btrfs_clear_path_blocking(p, NULL);
+				btrfs_clear_path_blocking(p, NULL, 0);
 
 				BUG_ON(err > 0);
 				if (err) {
@@ -2025,7 +2112,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	add_root_to_dirty_list(root);
 	extent_buffer_get(c);
 	path->nodes[level] = c;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_WRITE_LOCK;
 	path->slots[level] = 0;
 	return 0;
 }
@@ -2253,14 +2340,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 		if (path->slots[0] == i)
 			push_space += data_size;
 
-		if (!left->map_token) {
-			map_extent_buffer(left, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&left->map_token, &left->kaddr,
-					&left->map_start, &left->map_len,
-					KM_USER1);
-		}
-
 		this_item_size = btrfs_item_size(left, item);
 		if (this_item_size + sizeof(*item) + push_space > free_space)
 			break;
@@ -2271,10 +2350,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 			break;
 		i--;
 	}
-	if (left->map_token) {
-		unmap_extent_buffer(left, left->map_token, KM_USER1);
-		left->map_token = NULL;
-	}
 
 	if (push_items == 0)
 		goto out_unlock;
@@ -2316,21 +2391,10 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
 	for (i = 0; i < right_nritems; i++) {
 		item = btrfs_item_nr(right, i);
-		if (!right->map_token) {
-			map_extent_buffer(right, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&right->map_token, &right->kaddr,
-					&right->map_start, &right->map_len,
-					KM_USER1);
-		}
 		push_space -= btrfs_item_size(right, item);
 		btrfs_set_item_offset(right, item, push_space);
 	}
 
-	if (right->map_token) {
-		unmap_extent_buffer(right, right->map_token, KM_USER1);
-		right->map_token = NULL;
-	}
 	left_nritems -= push_items;
 	btrfs_set_header_nritems(left, left_nritems);
 
@@ -2467,13 +2531,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 
 	for (i = 0; i < nr; i++) {
 		item = btrfs_item_nr(right, i);
-		if (!right->map_token) {
-			map_extent_buffer(right, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&right->map_token, &right->kaddr,
-					&right->map_start, &right->map_len,
-					KM_USER1);
-		}
 
 		if (!empty && push_items > 0) {
 			if (path->slots[0] < i)
@@ -2496,11 +2553,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 		push_space += this_item_size + sizeof(*item);
 	}
 
-	if (right->map_token) {
-		unmap_extent_buffer(right, right->map_token, KM_USER1);
-		right->map_token = NULL;
-	}
-
 	if (push_items == 0) {
 		ret = 1;
 		goto out;
@@ -2530,23 +2582,12 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 		u32 ioff;
 
 		item = btrfs_item_nr(left, i);
-		if (!left->map_token) {
-			map_extent_buffer(left, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&left->map_token, &left->kaddr,
-					&left->map_start, &left->map_len,
-					KM_USER1);
-		}
 
 		ioff = btrfs_item_offset(left, item);
 		btrfs_set_item_offset(left, item,
 		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
 	}
 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
-	if (left->map_token) {
-		unmap_extent_buffer(left, left->map_token, KM_USER1);
-		left->map_token = NULL;
-	}
 
 	/* fixup right node */
 	if (push_items > right_nritems) {
@@ -2574,21 +2615,9 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	for (i = 0; i < right_nritems; i++) {
 		item = btrfs_item_nr(right, i);
 
-		if (!right->map_token) {
-			map_extent_buffer(right, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&right->map_token, &right->kaddr,
-					&right->map_start, &right->map_len,
-					KM_USER1);
-		}
-
 		push_space = push_space - btrfs_item_size(right, item);
 		btrfs_set_item_offset(right, item, push_space);
 	}
-	if (right->map_token) {
-		unmap_extent_buffer(right, right->map_token, KM_USER1);
-		right->map_token = NULL;
-	}
 
 	btrfs_mark_buffer_dirty(left);
 	if (right_nritems)
@@ -2729,23 +2758,10 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
 		struct btrfs_item *item = btrfs_item_nr(right, i);
 		u32 ioff;
 
-		if (!right->map_token) {
-			map_extent_buffer(right, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&right->map_token, &right->kaddr,
-					&right->map_start, &right->map_len,
-					KM_USER1);
-		}
-
 		ioff = btrfs_item_offset(right, item);
 		btrfs_set_item_offset(right, item, ioff + rt_data_off);
 	}
 
-	if (right->map_token) {
-		unmap_extent_buffer(right, right->map_token, KM_USER1);
-		right->map_token = NULL;
-	}
-
 	btrfs_set_header_nritems(l, mid);
 	ret = 0;
 	btrfs_item_key(right, &disk_key, 0);
@@ -3264,23 +3280,10 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
 			u32 ioff;
 			item = btrfs_item_nr(leaf, i);
 
-			if (!leaf->map_token) {
-				map_extent_buffer(leaf, (unsigned long)item,
-						sizeof(struct btrfs_item),
-						&leaf->map_token, &leaf->kaddr,
-						&leaf->map_start, &leaf->map_len,
-						KM_USER1);
-			}
-
 			ioff = btrfs_item_offset(leaf, item);
 			btrfs_set_item_offset(leaf, item, ioff + size_diff);
 		}
 
-		if (leaf->map_token) {
-			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
-			leaf->map_token = NULL;
-		}
-
 		/* shift the data */
 		if (from_end) {
 			memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
@@ -3377,22 +3380,10 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
 		u32 ioff;
 		item = btrfs_item_nr(leaf, i);
 
-		if (!leaf->map_token) {
-			map_extent_buffer(leaf, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&leaf->map_token, &leaf->kaddr,
-					&leaf->map_start, &leaf->map_len,
-					KM_USER1);
-		}
 		ioff = btrfs_item_offset(leaf, item);
 		btrfs_set_item_offset(leaf, item, ioff - data_size);
 	}
 
-	if (leaf->map_token) {
-		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
-		leaf->map_token = NULL;
-	}
-
 	/* shift the data */
 	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
 		      data_end - data_size, btrfs_leaf_data(leaf) +
@@ -3494,27 +3485,13 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
 		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
 		 */
 		/* first correct the data pointers */
-		WARN_ON(leaf->map_token);
 		for (i = slot; i < nritems; i++) {
 			u32 ioff;
 
 			item = btrfs_item_nr(leaf, i);
-			if (!leaf->map_token) {
-				map_extent_buffer(leaf, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&leaf->map_token, &leaf->kaddr,
-					&leaf->map_start, &leaf->map_len,
-					KM_USER1);
-			}
-
 			ioff = btrfs_item_offset(leaf, item);
 			btrfs_set_item_offset(leaf, item, ioff - total_data);
 		}
-		if (leaf->map_token) {
-			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
-			leaf->map_token = NULL;
-		}
-
 		/* shift the items */
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
 			      btrfs_item_nr_offset(slot),
@@ -3608,27 +3585,13 @@ int setup_items_for_insert(struct btrfs_trans_handle *trans,
 	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
 	 */
 	/* first correct the data pointers */
-	WARN_ON(leaf->map_token);
 	for (i = slot; i < nritems; i++) {
 		u32 ioff;
 
 		item = btrfs_item_nr(leaf, i);
-		if (!leaf->map_token) {
-			map_extent_buffer(leaf, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&leaf->map_token, &leaf->kaddr,
-					&leaf->map_start, &leaf->map_len,
-					KM_USER1);
-		}
-
 		ioff = btrfs_item_offset(leaf, item);
 		btrfs_set_item_offset(leaf, item, ioff - total_data);
 	}
-	if (leaf->map_token) {
-		unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
-		leaf->map_token = NULL;
-	}
-
 	/* shift the items */
 	memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
 		      btrfs_item_nr_offset(slot),
@@ -3840,22 +3803,10 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 			u32 ioff;
 
 			item = btrfs_item_nr(leaf, i);
-			if (!leaf->map_token) {
-				map_extent_buffer(leaf, (unsigned long)item,
-					sizeof(struct btrfs_item),
-					&leaf->map_token, &leaf->kaddr,
-					&leaf->map_start, &leaf->map_len,
-					KM_USER1);
-			}
 			ioff = btrfs_item_offset(leaf, item);
 			btrfs_set_item_offset(leaf, item, ioff + dsize);
 		}
 
-		if (leaf->map_token) {
-			unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
-			leaf->map_token = NULL;
-		}
-
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
 			      btrfs_item_nr_offset(slot + nr),
 			      sizeof(struct btrfs_item) *
@@ -4004,11 +3955,11 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 
 	WARN_ON(!path->keep_locks);
 again:
-	cur = btrfs_lock_root_node(root);
+	cur = btrfs_read_lock_root_node(root);
 	level = btrfs_header_level(cur);
 	WARN_ON(path->nodes[level]);
 	path->nodes[level] = cur;
-	path->locks[level] = 1;
+	path->locks[level] = BTRFS_READ_LOCK;
 
 	if (btrfs_header_generation(cur) < min_trans) {
 		ret = 1;
@@ -4098,12 +4049,12 @@ find_next_key:
 		cur = read_node_slot(root, cur, slot);
 		BUG_ON(!cur);
 
-		btrfs_tree_lock(cur);
+		btrfs_tree_read_lock(cur);
 
-		path->locks[level - 1] = 1;
+		path->locks[level - 1] = BTRFS_READ_LOCK;
 		path->nodes[level - 1] = cur;
 		unlock_up(path, level, 1);
-		btrfs_clear_path_blocking(path, NULL);
+		btrfs_clear_path_blocking(path, NULL, 0);
 	}
 out:
 	if (ret == 0)
@@ -4218,30 +4169,21 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 	u32 nritems;
 	int ret;
 	int old_spinning = path->leave_spinning;
-	int force_blocking = 0;
+	int next_rw_lock = 0;
 
 	nritems = btrfs_header_nritems(path->nodes[0]);
 	if (nritems == 0)
 		return 1;
 
-	/*
-	 * we take the blocks in an order that upsets lockdep.  Using
-	 * blocking mode is the only way around it.
-	 */
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-	force_blocking = 1;
-#endif
-
 	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
 again:
 	level = 1;
 	next = NULL;
+	next_rw_lock = 0;
 	btrfs_release_path(path);
 
 	path->keep_locks = 1;
-
-	if (!force_blocking)
-		path->leave_spinning = 1;
+	path->leave_spinning = 1;
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	path->keep_locks = 0;
@@ -4281,11 +4223,12 @@ again:
 		}
 
 		if (next) {
-			btrfs_tree_unlock(next);
+			btrfs_tree_unlock_rw(next, next_rw_lock);
 			free_extent_buffer(next);
 		}
 
 		next = c;
+		next_rw_lock = path->locks[level];
 		ret = read_block_for_search(NULL, root, path, &next, level,
 					    slot, &key);
 		if (ret == -EAGAIN)
@@ -4297,15 +4240,14 @@ again:
 		}
 
 		if (!path->skip_locking) {
-			ret = btrfs_try_spin_lock(next);
+			ret = btrfs_try_tree_read_lock(next);
 			if (!ret) {
 				btrfs_set_path_blocking(path);
-				btrfs_tree_lock(next);
-				if (!force_blocking)
-					btrfs_clear_path_blocking(path, next);
+				btrfs_tree_read_lock(next);
+				btrfs_clear_path_blocking(path, next,
+							  BTRFS_READ_LOCK);
 			}
-			if (force_blocking)
-				btrfs_set_lock_blocking(next);
+			next_rw_lock = BTRFS_READ_LOCK;
 		}
 		break;
 	}
@@ -4314,14 +4256,13 @@ again:
 		level--;
 		c = path->nodes[level];
 		if (path->locks[level])
-			btrfs_tree_unlock(c);
+			btrfs_tree_unlock_rw(c, path->locks[level]);
 
 		free_extent_buffer(c);
 		path->nodes[level] = next;
 		path->slots[level] = 0;
 		if (!path->skip_locking)
-			path->locks[level] = 1;
-
+			path->locks[level] = next_rw_lock;
 		if (!level)
 			break;
 
@@ -4336,16 +4277,14 @@ again:
 		}
 
 		if (!path->skip_locking) {
-			btrfs_assert_tree_locked(path->nodes[level]);
-			ret = btrfs_try_spin_lock(next);
+			ret = btrfs_try_tree_read_lock(next);
 			if (!ret) {
 				btrfs_set_path_blocking(path);
-				btrfs_tree_lock(next);
-				if (!force_blocking)
-					btrfs_clear_path_blocking(path, next);
+				btrfs_tree_read_lock(next);
+				btrfs_clear_path_blocking(path, next,
+							  BTRFS_READ_LOCK);
 			}
-			if (force_blocking)
-				btrfs_set_lock_blocking(next);
+			next_rw_lock = BTRFS_READ_LOCK;
 		}
 	}
 	ret = 0;
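
Review note (not part of the patch): the heart of the ctree.c rework is that
btrfs_search_slot() now starts optimistically with read locks and tracks
write_lock_level, the highest level that must be write locked. Whenever the
search discovers it needs a write lock above that level (COW, a slot-0 key
update, a node split or merge), it raises write_lock_level, drops every
lock, and restarts from the root. A condensed sketch of that retry
discipline, with the hypothetical helper need_write_lock_at() standing in
for the individual checks in the real code:

	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = (ins_len > 0) ? 1 : (ins_len < 0) ? 2 : 0;

again:
	b = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(b);
	if (level <= write_lock_level) {
		/* trade the read lock for a write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
		b = btrfs_lock_root_node(root);
	}
	while (b) {
		/* ... descend; need_write_lock_at() is hypothetical ... */
		if (need_write_lock_at(level) &&
		    write_lock_level < level + 1) {
			write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}
		/* ... */
	}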
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3b859a3e6a0..03912c5c6f4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -755,6 +755,8 @@ struct btrfs_space_info {
 				   chunks for this space */
 	unsigned int chunk_alloc:1;	/* set if we are allocating a chunk */
 
+	unsigned int flush:1;		/* set if we are trying to make space */
+
 	unsigned int force_alloc;	/* set if we need to force a chunk
 					   alloc for this space */
 
@@ -764,7 +766,7 @@ struct btrfs_space_info {
 	struct list_head block_groups[BTRFS_NR_RAID_TYPES];
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
-	atomic_t caching_threads;
+	wait_queue_head_t wait;
 };
 
 struct btrfs_block_rsv {
@@ -824,6 +826,7 @@ struct btrfs_caching_control {
 	struct list_head list;
 	struct mutex mutex;
 	wait_queue_head_t wait;
+	struct btrfs_work work;
 	struct btrfs_block_group_cache *block_group;
 	u64 progress;
 	atomic_t count;
@@ -1032,6 +1035,8 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers endio_freespace_worker;
 	struct btrfs_workers submit_workers;
+	struct btrfs_workers caching_workers;
+
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
 	 * the cow mechanism and make them safe to write.  It happens
@@ -1219,7 +1224,7 @@ struct btrfs_root {
 	 * right now this just gets used so that a root has its own devid
 	 * for stat.  It may be used for more later
 	 */
-	struct super_block anon_super;
+	dev_t anon_dev;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -1410,17 +1415,15 @@ void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
 static inline u##bits btrfs_##name(struct extent_buffer *eb)		\
 {									\
-	type *p = kmap_atomic(eb->first_page, KM_USER0);		\
+	type *p = page_address(eb->first_page);				\
 	u##bits res = le##bits##_to_cpu(p->member);			\
-	kunmap_atomic(p, KM_USER0);					\
 	return res;							\
 }									\
 static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 				    u##bits val)			\
 {									\
-	type *p = kmap_atomic(eb->first_page, KM_USER0);		\
+	type *p = page_address(eb->first_page);				\
 	p->member = cpu_to_le##bits(val);				\
-	kunmap_atomic(p, KM_USER0);					\
 }
 
 #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits)		\
@@ -2128,7 +2131,7 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 
 /* extent-tree.c */
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
-						 int num_items)
+						 unsigned num_items)
 {
 	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
 		3 * num_items;
@@ -2222,9 +2225,6 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
-int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
-				 int num_items);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root);
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2330,7 +2330,7 @@ struct btrfs_path *btrfs_alloc_path(void);
 void btrfs_free_path(struct btrfs_path *p);
 void btrfs_set_path_blocking(struct btrfs_path *p);
 void btrfs_clear_path_blocking(struct btrfs_path *p,
-			       struct extent_buffer *held);
+			       struct extent_buffer *held, int held_rw);
 void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2365,8 +2365,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv, int update_ref);
+void btrfs_drop_snapshot(struct btrfs_root *root,
+			 struct btrfs_block_rsv *block_rsv, int update_ref);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       struct extent_buffer *node,
@@ -2404,8 +2404,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 		      btrfs_root_item *item, struct btrfs_key *key);
 int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
-int btrfs_set_root_node(struct btrfs_root_item *item,
-			struct extent_buffer *node);
+void btrfs_set_root_node(struct btrfs_root_item *item,
+			 struct extent_buffer *node);
 void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
 
 /* dir-item.c */
@@ -2510,6 +2510,9 @@ int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list, int search_commit);
 /* inode.c */
+struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
+					   size_t pg_offset, u64 start, u64 len,
+					   int create);
 
 /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
 #if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -2518,6 +2521,14 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 #define PageChecked PageFsMisc
 #endif
 
+/* This forces readahead on a given range of bytes in an inode */
+static inline void btrfs_force_ra(struct address_space *mapping,
+				  struct file_ra_state *ra, struct file *file,
+				  pgoff_t offset, unsigned long req_size)
+{
+	page_cache_sync_readahead(mapping, ra, file, offset, req_size);
+}
+
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
 int btrfs_set_inode_index(struct inode *dir, u64 *index);
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
@@ -2546,9 +2557,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio, unsigned long bio_flags);
 
-unsigned long btrfs_force_ra(struct address_space *mapping,
-			      struct file_ra_state *ra, struct file *file,
-			      pgoff_t offset, pgoff_t last_index);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
@@ -2602,7 +2610,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct inode *inode);
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
-int btrfs_sync_file(struct file *file, int datasync);
+int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned);
 extern const struct file_operations btrfs_file_operations;
@@ -2642,13 +2650,22 @@ do { \
2642 2650
2643/* acl.c */ 2651/* acl.c */
2644#ifdef CONFIG_BTRFS_FS_POSIX_ACL 2652#ifdef CONFIG_BTRFS_FS_POSIX_ACL
2645int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags); 2653struct posix_acl *btrfs_get_acl(struct inode *inode, int type);
2646#else
2647#define btrfs_check_acl NULL
2648#endif
2649int btrfs_init_acl(struct btrfs_trans_handle *trans, 2654int btrfs_init_acl(struct btrfs_trans_handle *trans,
2650 struct inode *inode, struct inode *dir); 2655 struct inode *inode, struct inode *dir);
2651int btrfs_acl_chmod(struct inode *inode); 2656int btrfs_acl_chmod(struct inode *inode);
2657#else
2658#define btrfs_get_acl NULL
2659static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
2660 struct inode *inode, struct inode *dir)
2661{
2662 return 0;
2663}
2664static inline int btrfs_acl_chmod(struct inode *inode)
2665{
2666 return 0;
2667}
2668#endif
2652 2669
2653/* relocation.c */ 2670/* relocation.c */
2654int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); 2671int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
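Editor's note: the ctree.h hunk above swaps the old "#define btrfs_check_acl NULL" fallback for declarations guarded by CONFIG_BTRFS_FS_POSIX_ACL plus static inline no-op stubs, so callers compile unchanged either way. A minimal standalone sketch of that compile-time stub pattern; the config symbol and function names here are illustrative, not btrfs's:

#include <stdio.h>

/* Flip this to emulate building with the option enabled. */
/* #define CONFIG_FEATURE_X 1 */

#ifdef CONFIG_FEATURE_X
/* Real implementation, compiled only when the option is set. */
int feature_init(void)
{
	puts("real init");
	return 0;
}
#else
/*
 * Stub for disabled configs: callers compile unchanged and the
 * compiler folds the call away entirely.
 */
static inline int feature_init(void)
{
	return 0;
}
#endif

int main(void)
{
	return feature_init();
}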
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 98c68e658a9..b52c672f4c1 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -735,7 +735,7 @@ static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
735 } 735 }
736 736
 737 /* reset all the locked nodes in the path to spinning locks. */ 737 /* reset all the locked nodes in the path to spinning locks. */
738 btrfs_clear_path_blocking(path, NULL); 738 btrfs_clear_path_blocking(path, NULL, 0);
739 739
740 /* insert the keys of the items */ 740 /* insert the keys of the items */
741 ret = setup_items_for_insert(trans, root, path, keys, data_size, 741 ret = setup_items_for_insert(trans, root, path, keys, data_size,
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 8d27af4bd8b..7083d08b2a2 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -25,7 +25,7 @@
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <linux/list.h> 26#include <linux/list.h>
27#include <linux/wait.h> 27#include <linux/wait.h>
28#include <asm/atomic.h> 28#include <linux/atomic.h>
29 29
30#include "ctree.h" 30#include "ctree.h"
31 31
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 685f2593c4f..31d84e78129 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -89,13 +89,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
89 data_size = sizeof(*dir_item) + name_len + data_len; 89 data_size = sizeof(*dir_item) + name_len + data_len;
90 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 90 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
91 name, name_len); 91 name, name_len);
92 /* 92 if (IS_ERR(dir_item))
93 * FIXME: at some point we should handle xattr's that are larger than 93 return PTR_ERR(dir_item);
94 * what we can fit in our leaf. We set location to NULL b/c we arent
95 * pointing at anything else, that will change if we store the xattr
96 * data in a separate inode.
97 */
98 BUG_ON(IS_ERR(dir_item));
99 memset(&location, 0, sizeof(location)); 94 memset(&location, 0, sizeof(location));
100 95
101 leaf = path->nodes[0]; 96 leaf = path->nodes[0];
@@ -203,8 +198,6 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
203 struct btrfs_key key; 198 struct btrfs_key key;
204 int ins_len = mod < 0 ? -1 : 0; 199 int ins_len = mod < 0 ? -1 : 0;
205 int cow = mod != 0; 200 int cow = mod != 0;
206 struct btrfs_key found_key;
207 struct extent_buffer *leaf;
208 201
209 key.objectid = dir; 202 key.objectid = dir;
210 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 203 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
@@ -214,18 +207,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
214 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 207 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
215 if (ret < 0) 208 if (ret < 0)
216 return ERR_PTR(ret); 209 return ERR_PTR(ret);
217 if (ret > 0) { 210 if (ret > 0)
218 if (path->slots[0] == 0)
219 return NULL;
220 path->slots[0]--;
221 }
222
223 leaf = path->nodes[0];
224 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
225
226 if (found_key.objectid != dir ||
227 btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY ||
228 found_key.offset != key.offset)
229 return NULL; 211 return NULL;
230 212
231 return btrfs_match_dir_item_name(root, path, name, name_len); 213 return btrfs_match_dir_item_name(root, path, name, name_len);
@@ -320,8 +302,6 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
320 struct btrfs_key key; 302 struct btrfs_key key;
321 int ins_len = mod < 0 ? -1 : 0; 303 int ins_len = mod < 0 ? -1 : 0;
322 int cow = mod != 0; 304 int cow = mod != 0;
323 struct btrfs_key found_key;
324 struct extent_buffer *leaf;
325 305
326 key.objectid = dir; 306 key.objectid = dir;
327 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 307 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
@@ -329,18 +309,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
329 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); 309 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
330 if (ret < 0) 310 if (ret < 0)
331 return ERR_PTR(ret); 311 return ERR_PTR(ret);
332 if (ret > 0) { 312 if (ret > 0)
333 if (path->slots[0] == 0)
334 return NULL;
335 path->slots[0]--;
336 }
337
338 leaf = path->nodes[0];
339 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
340
341 if (found_key.objectid != dir ||
342 btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY ||
343 found_key.offset != key.offset)
344 return NULL; 313 return NULL;
345 314
346 return btrfs_match_dir_item_name(root, path, name, name_len); 315 return btrfs_match_dir_item_name(root, path, name, name_len);
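Editor's note: both lookup cleanups above lean on the return convention of btrfs_search_slot(): a negative errno on error, 0 on an exact key match, and a positive value when the key is absent (with the path positioned where it would be inserted). Since dir items and xattr items require an exact key match, the callers can simply return NULL on ret > 0 instead of re-checking the found key by hand. A standalone sketch of that convention, with a plain sorted array standing in for the b-tree:

#include <stdio.h>

/*
 * Return 0 on an exact match, 1 when the key is absent (*slot set to
 * the insertion point), or a negative errno on bad arguments.
 */
static int search_slot(const int *keys, int n, int key, int *slot)
{
	int lo = 0, hi = n;

	if (!keys || !slot)
		return -22; /* -EINVAL */
	while (lo < hi) {
		int mid = lo + (hi - lo) / 2;

		if (keys[mid] == key) {
			*slot = mid;
			return 0;
		}
		if (keys[mid] < key)
			lo = mid + 1;
		else
			hi = mid;
	}
	*slot = lo; /* where the key would be inserted */
	return 1;
}

int main(void)
{
	int keys[] = { 10, 20, 30 };
	int slot;
	int ret = search_slot(keys, 3, 25, &slot);

	/* ret == 1, slot == 2: not found, exact-match callers just bail */
	printf("ret=%d slot=%d\n", ret, slot);
	return 0;
}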
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1ac8db5dc0a..07b3ac662e1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -100,38 +100,83 @@ struct async_submit_bio {
100 struct btrfs_work work; 100 struct btrfs_work work;
101}; 101};
102 102
103/* These are used to set the lockdep class on the extent buffer locks. 103/*
 104 * The class is set by the readpage_end_io_hook after the buffer has 104 * Lockdep class keys for extent_buffer->locks in this root. For a given
105 * passed csum validation but before the pages are unlocked. 105 * eb, the lockdep key is determined by the btrfs_root it belongs to and
106 * the level the eb occupies in the tree.
107 *
108 * Different roots are used for different purposes and may nest inside each
 109 * other, so they require separate keysets. As lockdep keys should be
110 * static, assign keysets according to the purpose of the root as indicated
111 * by btrfs_root->objectid. This ensures that all special purpose roots
112 * have separate keysets.
106 * 113 *
107 * The lockdep class is also set by btrfs_init_new_buffer on freshly 114 * Lock-nesting across peer nodes is always done with the immediate parent
108 * allocated blocks. 115 * node locked thus preventing deadlock. As lockdep doesn't know this, use
116 * subclass to avoid triggering lockdep warning in such cases.
109 * 117 *
110 * The class is based on the level in the tree block, which allows lockdep 118 * The key is set by the readpage_end_io_hook after the buffer has passed
111 * to know that lower nodes nest inside the locks of higher nodes. 119 * csum validation but before the pages are unlocked. It is also set by
120 * btrfs_init_new_buffer on freshly allocated blocks.
112 * 121 *
113 * We also add a check to make sure the highest level of the tree is 122 * We also add a check to make sure the highest level of the tree is the
114 * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this 123 * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
115 * code needs update as well. 124 * needs update as well.
116 */ 125 */
117#ifdef CONFIG_DEBUG_LOCK_ALLOC 126#ifdef CONFIG_DEBUG_LOCK_ALLOC
118# if BTRFS_MAX_LEVEL != 8 127# if BTRFS_MAX_LEVEL != 8
119# error 128# error
120# endif 129# endif
121static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; 130
122static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { 131static struct btrfs_lockdep_keyset {
123 /* leaf */ 132 u64 id; /* root objectid */
124 "btrfs-extent-00", 133 const char *name_stem; /* lock name stem */
125 "btrfs-extent-01", 134 char names[BTRFS_MAX_LEVEL + 1][20];
126 "btrfs-extent-02", 135 struct lock_class_key keys[BTRFS_MAX_LEVEL + 1];
127 "btrfs-extent-03", 136} btrfs_lockdep_keysets[] = {
128 "btrfs-extent-04", 137 { .id = BTRFS_ROOT_TREE_OBJECTID, .name_stem = "root" },
129 "btrfs-extent-05", 138 { .id = BTRFS_EXTENT_TREE_OBJECTID, .name_stem = "extent" },
130 "btrfs-extent-06", 139 { .id = BTRFS_CHUNK_TREE_OBJECTID, .name_stem = "chunk" },
131 "btrfs-extent-07", 140 { .id = BTRFS_DEV_TREE_OBJECTID, .name_stem = "dev" },
132 /* highest possible level */ 141 { .id = BTRFS_FS_TREE_OBJECTID, .name_stem = "fs" },
133 "btrfs-extent-08", 142 { .id = BTRFS_CSUM_TREE_OBJECTID, .name_stem = "csum" },
143 { .id = BTRFS_ORPHAN_OBJECTID, .name_stem = "orphan" },
144 { .id = BTRFS_TREE_LOG_OBJECTID, .name_stem = "log" },
145 { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" },
146 { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" },
147 { .id = 0, .name_stem = "tree" },
134}; 148};
149
150void __init btrfs_init_lockdep(void)
151{
152 int i, j;
153
154 /* initialize lockdep class names */
155 for (i = 0; i < ARRAY_SIZE(btrfs_lockdep_keysets); i++) {
156 struct btrfs_lockdep_keyset *ks = &btrfs_lockdep_keysets[i];
157
158 for (j = 0; j < ARRAY_SIZE(ks->names); j++)
159 snprintf(ks->names[j], sizeof(ks->names[j]),
160 "btrfs-%s-%02d", ks->name_stem, j);
161 }
162}
163
164void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb,
165 int level)
166{
167 struct btrfs_lockdep_keyset *ks;
168
169 BUG_ON(level >= ARRAY_SIZE(ks->keys));
170
171 /* find the matching keyset, id 0 is the default entry */
172 for (ks = btrfs_lockdep_keysets; ks->id; ks++)
173 if (ks->id == objectid)
174 break;
175
176 lockdep_set_class_and_name(&eb->lock,
177 &ks->keys[level], ks->names[level]);
178}
179
135#endif 180#endif
136 181
137/* 182/*
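Editor's note on the hunk above: the keyset table pairs each special-purpose root objectid with a name stem, ends with an id-0 default entry, and pre-builds the per-level lock names once at init time. A userspace sketch of the same sentinel-terminated lookup; the ids and stems below are placeholders:

#include <stdio.h>

#define MAX_LEVEL 8

struct keyset {
	unsigned long long id;  /* root objectid, 0 terminates */
	const char *stem;
	char names[MAX_LEVEL + 1][20];
};

static struct keyset keysets[] = {
	{ .id = 1, .stem = "root"   },
	{ .id = 2, .stem = "extent" },
	{ .id = 0, .stem = "tree"   }, /* default entry, must stay last */
};

/* Build "btrfs-<stem>-<level>" names once, like btrfs_init_lockdep(). */
static void init_keysets(void)
{
	size_t i, j;

	for (i = 0; i < sizeof(keysets) / sizeof(keysets[0]); i++)
		for (j = 0; j <= MAX_LEVEL; j++)
			snprintf(keysets[i].names[j],
				 sizeof(keysets[i].names[j]),
				 "btrfs-%s-%02zu", keysets[i].stem, j);
}

static const char *lookup_name(unsigned long long objectid, int level)
{
	struct keyset *ks;

	/* walk until the sentinel; id 0 matches everything left over */
	for (ks = keysets; ks->id; ks++)
		if (ks->id == objectid)
			break;
	return ks->names[level];
}

int main(void)
{
	init_keysets();
	printf("%s\n", lookup_name(2, 3));  /* btrfs-extent-03 */
	printf("%s\n", lookup_name(99, 0)); /* falls back to btrfs-tree-00 */
	return 0;
}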
@@ -217,7 +262,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
217 unsigned long len; 262 unsigned long len;
218 unsigned long cur_len; 263 unsigned long cur_len;
219 unsigned long offset = BTRFS_CSUM_SIZE; 264 unsigned long offset = BTRFS_CSUM_SIZE;
220 char *map_token = NULL;
221 char *kaddr; 265 char *kaddr;
222 unsigned long map_start; 266 unsigned long map_start;
223 unsigned long map_len; 267 unsigned long map_len;
@@ -228,8 +272,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
228 len = buf->len - offset; 272 len = buf->len - offset;
229 while (len > 0) { 273 while (len > 0) {
230 err = map_private_extent_buffer(buf, offset, 32, 274 err = map_private_extent_buffer(buf, offset, 32,
231 &map_token, &kaddr, 275 &kaddr, &map_start, &map_len);
232 &map_start, &map_len, KM_USER0);
233 if (err) 276 if (err)
234 return 1; 277 return 1;
235 cur_len = min(len, map_len - (offset - map_start)); 278 cur_len = min(len, map_len - (offset - map_start));
@@ -237,7 +280,6 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
237 crc, cur_len); 280 crc, cur_len);
238 len -= cur_len; 281 len -= cur_len;
239 offset += cur_len; 282 offset += cur_len;
240 unmap_extent_buffer(buf, map_token, KM_USER0);
241 } 283 }
242 if (csum_size > sizeof(inline_result)) { 284 if (csum_size > sizeof(inline_result)) {
243 result = kzalloc(csum_size * sizeof(char), GFP_NOFS); 285 result = kzalloc(csum_size * sizeof(char), GFP_NOFS);
@@ -494,15 +536,6 @@ static noinline int check_leaf(struct btrfs_root *root,
494 return 0; 536 return 0;
495} 537}
496 538
497#ifdef CONFIG_DEBUG_LOCK_ALLOC
498void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
499{
500 lockdep_set_class_and_name(&eb->lock,
501 &btrfs_eb_class[level],
502 btrfs_eb_name[level]);
503}
504#endif
505
506static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, 539static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
507 struct extent_state *state) 540 struct extent_state *state)
508{ 541{
@@ -553,7 +586,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
553 } 586 }
554 found_level = btrfs_header_level(eb); 587 found_level = btrfs_header_level(eb);
555 588
556 btrfs_set_buffer_lockdep_class(eb, found_level); 589 btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
590 eb, found_level);
557 591
558 ret = csum_tree_block(root, eb, 1); 592 ret = csum_tree_block(root, eb, 1);
559 if (ret) { 593 if (ret) {
@@ -1077,12 +1111,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
1077 init_completion(&root->kobj_unregister); 1111 init_completion(&root->kobj_unregister);
1078 root->defrag_running = 0; 1112 root->defrag_running = 0;
1079 root->root_key.objectid = objectid; 1113 root->root_key.objectid = objectid;
1080 root->anon_super.s_root = NULL; 1114 root->anon_dev = 0;
1081 root->anon_super.s_dev = 0;
1082 INIT_LIST_HEAD(&root->anon_super.s_list);
1083 INIT_LIST_HEAD(&root->anon_super.s_instances);
1084 init_rwsem(&root->anon_super.s_umount);
1085
1086 return 0; 1115 return 0;
1087} 1116}
1088 1117
@@ -1311,7 +1340,7 @@ again:
1311 spin_lock_init(&root->cache_lock); 1340 spin_lock_init(&root->cache_lock);
1312 init_waitqueue_head(&root->cache_wait); 1341 init_waitqueue_head(&root->cache_wait);
1313 1342
1314 ret = set_anon_super(&root->anon_super, NULL); 1343 ret = get_anon_bdev(&root->anon_dev);
1315 if (ret) 1344 if (ret)
1316 goto fail; 1345 goto fail;
1317 1346
@@ -1603,7 +1632,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1603 goto fail_bdi; 1632 goto fail_bdi;
1604 } 1633 }
1605 1634
1606 fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS; 1635 mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
1607 1636
1608 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 1637 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1609 INIT_LIST_HEAD(&fs_info->trans_list); 1638 INIT_LIST_HEAD(&fs_info->trans_list);
@@ -1807,6 +1836,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1807 fs_info->thread_pool_size), 1836 fs_info->thread_pool_size),
1808 &fs_info->generic_worker); 1837 &fs_info->generic_worker);
1809 1838
1839 btrfs_init_workers(&fs_info->caching_workers, "cache",
1840 2, &fs_info->generic_worker);
1841
1810 /* a higher idle thresh on the submit workers makes it much more 1842 /* a higher idle thresh on the submit workers makes it much more
1811 * likely that bios will be send down in a sane order to the 1843 * likely that bios will be send down in a sane order to the
1812 * devices 1844 * devices
@@ -1860,6 +1892,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1860 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1892 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1861 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1862 btrfs_start_workers(&fs_info->delayed_workers, 1); 1894 btrfs_start_workers(&fs_info->delayed_workers, 1);
1895 btrfs_start_workers(&fs_info->caching_workers, 1);
1863 1896
1864 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1865 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2117,6 +2150,7 @@ fail_sb_buffer:
2117 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2150 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2118 btrfs_stop_workers(&fs_info->submit_workers); 2151 btrfs_stop_workers(&fs_info->submit_workers);
2119 btrfs_stop_workers(&fs_info->delayed_workers); 2152 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers);
2120fail_alloc: 2154fail_alloc:
2121 kfree(fs_info->delayed_root); 2155 kfree(fs_info->delayed_root);
2122fail_iput: 2156fail_iput:
@@ -2393,10 +2427,8 @@ static void free_fs_root(struct btrfs_root *root)
2393{ 2427{
2394 iput(root->cache_inode); 2428 iput(root->cache_inode);
2395 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); 2429 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2396 if (root->anon_super.s_dev) { 2430 if (root->anon_dev)
2397 down_write(&root->anon_super.s_umount); 2431 free_anon_bdev(root->anon_dev);
2398 kill_anon_super(&root->anon_super);
2399 }
2400 free_extent_buffer(root->node); 2432 free_extent_buffer(root->node);
2401 free_extent_buffer(root->commit_root); 2433 free_extent_buffer(root->commit_root);
2402 kfree(root->free_ino_ctl); 2434 kfree(root->free_ino_ctl);
@@ -2584,6 +2616,7 @@ int close_ctree(struct btrfs_root *root)
2584 btrfs_stop_workers(&fs_info->endio_freespace_worker); 2616 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2585 btrfs_stop_workers(&fs_info->submit_workers); 2617 btrfs_stop_workers(&fs_info->submit_workers);
2586 btrfs_stop_workers(&fs_info->delayed_workers); 2618 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers);
2587 2620
2588 btrfs_close_devices(fs_info->fs_devices); 2621 btrfs_close_devices(fs_info->fs_devices);
2589 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2622 btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index a0b610a67aa..bec3ea4bd67 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,10 +87,14 @@ int btree_lock_page_hook(struct page *page);
87 87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); 90void btrfs_init_lockdep(void);
91void btrfs_set_buffer_lockdep_class(u64 objectid,
92 struct extent_buffer *eb, int level);
91#else 93#else
92static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, 94static inline void btrfs_init_lockdep(void)
93 int level) 95{ }
96static inline void btrfs_set_buffer_lockdep_class(u64 objectid,
97 struct extent_buffer *eb, int level)
94{ 98{
95} 99}
96#endif 100#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 71cd456fdb6..f5be06a2462 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -320,12 +320,12 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
320 return total_added; 320 return total_added;
321} 321}
322 322
323static int caching_kthread(void *data) 323static noinline void caching_thread(struct btrfs_work *work)
324{ 324{
325 struct btrfs_block_group_cache *block_group = data; 325 struct btrfs_block_group_cache *block_group;
326 struct btrfs_fs_info *fs_info = block_group->fs_info; 326 struct btrfs_fs_info *fs_info;
327 struct btrfs_caching_control *caching_ctl = block_group->caching_ctl; 327 struct btrfs_caching_control *caching_ctl;
328 struct btrfs_root *extent_root = fs_info->extent_root; 328 struct btrfs_root *extent_root;
329 struct btrfs_path *path; 329 struct btrfs_path *path;
330 struct extent_buffer *leaf; 330 struct extent_buffer *leaf;
331 struct btrfs_key key; 331 struct btrfs_key key;
@@ -334,9 +334,14 @@ static int caching_kthread(void *data)
334 u32 nritems; 334 u32 nritems;
335 int ret = 0; 335 int ret = 0;
336 336
337 caching_ctl = container_of(work, struct btrfs_caching_control, work);
338 block_group = caching_ctl->block_group;
339 fs_info = block_group->fs_info;
340 extent_root = fs_info->extent_root;
341
337 path = btrfs_alloc_path(); 342 path = btrfs_alloc_path();
338 if (!path) 343 if (!path)
339 return -ENOMEM; 344 goto out;
340 345
341 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 346 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
342 347
@@ -433,13 +438,11 @@ err:
433 free_excluded_extents(extent_root, block_group); 438 free_excluded_extents(extent_root, block_group);
434 439
435 mutex_unlock(&caching_ctl->mutex); 440 mutex_unlock(&caching_ctl->mutex);
441out:
436 wake_up(&caching_ctl->wait); 442 wake_up(&caching_ctl->wait);
437 443
438 put_caching_control(caching_ctl); 444 put_caching_control(caching_ctl);
439 atomic_dec(&block_group->space_info->caching_threads);
440 btrfs_put_block_group(block_group); 445 btrfs_put_block_group(block_group);
441
442 return 0;
443} 446}
444 447
445static int cache_block_group(struct btrfs_block_group_cache *cache, 448static int cache_block_group(struct btrfs_block_group_cache *cache,
@@ -449,7 +452,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
449{ 452{
450 struct btrfs_fs_info *fs_info = cache->fs_info; 453 struct btrfs_fs_info *fs_info = cache->fs_info;
451 struct btrfs_caching_control *caching_ctl; 454 struct btrfs_caching_control *caching_ctl;
452 struct task_struct *tsk;
453 int ret = 0; 455 int ret = 0;
454 456
455 smp_mb(); 457 smp_mb();
@@ -501,6 +503,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
501 caching_ctl->progress = cache->key.objectid; 503 caching_ctl->progress = cache->key.objectid;
502 /* one for caching kthread, one for caching block group list */ 504 /* one for caching kthread, one for caching block group list */
503 atomic_set(&caching_ctl->count, 2); 505 atomic_set(&caching_ctl->count, 2);
506 caching_ctl->work.func = caching_thread;
504 507
505 spin_lock(&cache->lock); 508 spin_lock(&cache->lock);
506 if (cache->cached != BTRFS_CACHE_NO) { 509 if (cache->cached != BTRFS_CACHE_NO) {
@@ -516,16 +519,9 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
516 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 519 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
517 up_write(&fs_info->extent_commit_sem); 520 up_write(&fs_info->extent_commit_sem);
518 521
519 atomic_inc(&cache->space_info->caching_threads);
520 btrfs_get_block_group(cache); 522 btrfs_get_block_group(cache);
521 523
522 tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", 524 btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
523 cache->key.objectid);
524 if (IS_ERR(tsk)) {
525 ret = PTR_ERR(tsk);
526 printk(KERN_ERR "error running thread %d\n", ret);
527 BUG();
528 }
529 525
530 return ret; 526 return ret;
531} 527}
@@ -667,7 +663,9 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
667 struct btrfs_path *path; 663 struct btrfs_path *path;
668 664
669 path = btrfs_alloc_path(); 665 path = btrfs_alloc_path();
670 BUG_ON(!path); 666 if (!path)
667 return -ENOMEM;
668
671 key.objectid = start; 669 key.objectid = start;
672 key.offset = len; 670 key.offset = len;
673 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); 671 btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
@@ -1784,6 +1782,9 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1784 1782
1785 1783
1786 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1784 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard)
1786 continue;
1787
1787 ret = btrfs_issue_discard(stripe->dev->bdev, 1788 ret = btrfs_issue_discard(stripe->dev->bdev,
1788 stripe->physical, 1789 stripe->physical,
1789 stripe->length); 1790 stripe->length);
@@ -1791,11 +1792,16 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1791 discarded_bytes += stripe->length; 1792 discarded_bytes += stripe->length;
1792 else if (ret != -EOPNOTSUPP) 1793 else if (ret != -EOPNOTSUPP)
1793 break; 1794 break;
1795
1796 /*
1797 * Just in case we get back EOPNOTSUPP for some reason,
1798 * just ignore the return value so we don't screw up
1799 * people calling discard_extent.
1800 */
1801 ret = 0;
1794 } 1802 }
1795 kfree(multi); 1803 kfree(multi);
1796 } 1804 }
1797 if (discarded_bytes && ret == -EOPNOTSUPP)
1798 ret = 0;
1799 1805
1800 if (actual_bytes) 1806 if (actual_bytes)
1801 *actual_bytes = discarded_bytes; 1807 *actual_bytes = discarded_bytes;
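Editor's note: the hunk above makes btrfs_discard_extent() best-effort per stripe: devices that cannot discard are skipped, a stripe-level EOPNOTSUPP is swallowed, and only successfully discarded bytes are counted. A compact sketch of that loop shape; the error values and struct are illustrative:

#include <stdio.h>

struct stripe { int can_discard; long length; int err; };

static long discard_all(struct stripe *s, int n, int *ret_out)
{
	long discarded = 0;
	int ret = 0, i;

	for (i = 0; i < n; i++) {
		if (!s[i].can_discard)
			continue;
		ret = s[i].err;      /* stands in for btrfs_issue_discard() */
		if (!ret)
			discarded += s[i].length;
		else if (ret != -95) /* anything but -EOPNOTSUPP is fatal */
			break;
		ret = 0;             /* swallow EOPNOTSUPP, keep going */
	}
	*ret_out = ret;
	return discarded;
}

int main(void)
{
	struct stripe s[] = { {1, 4096, 0}, {0, 4096, 0}, {1, 4096, -95} };
	int ret;
	long bytes = discard_all(s, 3, &ret);

	printf("discarded=%ld ret=%d\n", bytes, ret); /* 4096, 0 */
	return 0;
}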
@@ -2932,9 +2938,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2932 found->full = 0; 2938 found->full = 0;
2933 found->force_alloc = CHUNK_ALLOC_NO_FORCE; 2939 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
2934 found->chunk_alloc = 0; 2940 found->chunk_alloc = 0;
2941 found->flush = 0;
2942 init_waitqueue_head(&found->wait);
2935 *space_info = found; 2943 *space_info = found;
2936 list_add_rcu(&found->list, &info->space_info); 2944 list_add_rcu(&found->list, &info->space_info);
2937 atomic_set(&found->caching_threads, 0);
2938 return 0; 2945 return 0;
2939} 2946}
2940 2947
@@ -3275,6 +3282,9 @@ again:
3275 } 3282 }
3276 3283
3277 ret = btrfs_alloc_chunk(trans, extent_root, flags); 3284 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3285 if (ret < 0 && ret != -ENOSPC)
3286 goto out;
3287
3278 spin_lock(&space_info->lock); 3288 spin_lock(&space_info->lock);
3279 if (ret) 3289 if (ret)
3280 space_info->full = 1; 3290 space_info->full = 1;
@@ -3284,6 +3294,7 @@ again:
3284 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; 3294 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3285 space_info->chunk_alloc = 0; 3295 space_info->chunk_alloc = 0;
3286 spin_unlock(&space_info->lock); 3296 spin_unlock(&space_info->lock);
3297out:
3287 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3298 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3288 return ret; 3299 return ret;
3289} 3300}
@@ -3314,6 +3325,14 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3314 if (reserved == 0) 3325 if (reserved == 0)
3315 return 0; 3326 return 0;
3316 3327
3328 smp_mb();
3329 if (root->fs_info->delalloc_bytes == 0) {
3330 if (trans)
3331 return 0;
3332 btrfs_wait_ordered_extents(root, 0, 0);
3333 return 0;
3334 }
3335
3317 max_reclaim = min(reserved, to_reclaim); 3336 max_reclaim = min(reserved, to_reclaim);
3318 3337
3319 while (loops < 1024) { 3338 while (loops < 1024) {
@@ -3356,6 +3375,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 } 3375 }
3357 3376
3358 } 3377 }
3378 if (reclaimed >= to_reclaim && !trans)
3379 btrfs_wait_ordered_extents(root, 0, 0);
3359 return reclaimed >= to_reclaim; 3380 return reclaimed >= to_reclaim;
3360} 3381}
3361 3382
@@ -3380,15 +3401,36 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
3380 u64 num_bytes = orig_bytes; 3401 u64 num_bytes = orig_bytes;
3381 int retries = 0; 3402 int retries = 0;
3382 int ret = 0; 3403 int ret = 0;
3383 bool reserved = false;
3384 bool committed = false; 3404 bool committed = false;
3405 bool flushing = false;
3385 3406
3386again: 3407again:
3387 ret = -ENOSPC; 3408 ret = 0;
3388 if (reserved)
3389 num_bytes = 0;
3390
3391 spin_lock(&space_info->lock); 3409 spin_lock(&space_info->lock);
3410 /*
3411 * We only want to wait if somebody other than us is flushing and we are
3412 * actually alloed to flush.
3413 */
3414 while (flush && !flushing && space_info->flush) {
3415 spin_unlock(&space_info->lock);
3416 /*
3417 * If we have a trans handle we can't wait because the flusher
3418 * may have to commit the transaction, which would mean we would
3419 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open.
3421 */
3422 if (trans)
3423 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush);
3426 /* Must have been interrupted, return */
3427 if (ret)
3428 return -EINTR;
3429
3430 spin_lock(&space_info->lock);
3431 }
3432
3433 ret = -ENOSPC;
3392 unused = space_info->bytes_used + space_info->bytes_reserved + 3434 unused = space_info->bytes_used + space_info->bytes_reserved +
3393 space_info->bytes_pinned + space_info->bytes_readonly + 3435 space_info->bytes_pinned + space_info->bytes_readonly +
3394 space_info->bytes_may_use; 3436 space_info->bytes_may_use;
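Editor's note: the hunk above elects a single flusher per space_info. The first task whose reservation fails sets space_info->flush; everyone else sleeps on space_info->wait until it clears, except transaction holders, which bail out with -EAGAIN because the flusher may itself need to commit the transaction they hold open. A rough pthreads sketch of that handoff, illustrative only, not the kernel locking:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_cv = PTHREAD_COND_INITIALIZER;
static int flushing;

static int reserve(int may_wait)
{
	pthread_mutex_lock(&lock);
	while (flushing) {
		if (!may_wait) {  /* holding a transaction: -EAGAIN */
			pthread_mutex_unlock(&lock);
			return -11;
		}
		pthread_cond_wait(&wait_cv, &lock);
	}
	flushing = 1;             /* we are now the designated flusher */
	pthread_mutex_unlock(&lock);

	/* ... flush delalloc / commit the transaction here ... */

	pthread_mutex_lock(&lock);
	flushing = 0;
	pthread_cond_broadcast(&wait_cv); /* wake_up_all(&space_info->wait) */
	pthread_mutex_unlock(&lock);
	return 0;
}

int main(void)
{
	printf("reserve -> %d\n", reserve(1));
	return 0;
}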
@@ -3403,8 +3445,7 @@ again:
3403 if (unused <= space_info->total_bytes) { 3445 if (unused <= space_info->total_bytes) {
3404 unused = space_info->total_bytes - unused; 3446 unused = space_info->total_bytes - unused;
3405 if (unused >= num_bytes) { 3447 if (unused >= num_bytes) {
3406 if (!reserved) 3448 space_info->bytes_reserved += orig_bytes;
3407 space_info->bytes_reserved += orig_bytes;
3408 ret = 0; 3449 ret = 0;
3409 } else { 3450 } else {
3410 /* 3451 /*
@@ -3429,17 +3470,14 @@ again:
3429 * to reclaim space we can actually use it instead of somebody else 3470 * to reclaim space we can actually use it instead of somebody else
3430 * stealing it from us. 3471 * stealing it from us.
3431 */ 3472 */
3432 if (ret && !reserved) { 3473 if (ret && flush) {
3433 space_info->bytes_reserved += orig_bytes; 3474 flushing = true;
3434 reserved = true; 3475 space_info->flush = 1;
3435 } 3476 }
3436 3477
3437 spin_unlock(&space_info->lock); 3478 spin_unlock(&space_info->lock);
3438 3479
3439 if (!ret) 3480 if (!ret || !flush)
3440 return 0;
3441
3442 if (!flush)
3443 goto out; 3481 goto out;
3444 3482
3445 /* 3483 /*
@@ -3447,11 +3485,11 @@ again:
3447 * metadata until after the IO is completed. 3485 * metadata until after the IO is completed.
3448 */ 3486 */
3449 ret = shrink_delalloc(trans, root, num_bytes, 1); 3487 ret = shrink_delalloc(trans, root, num_bytes, 1);
3450 if (ret > 0) 3488 if (ret < 0)
3451 return 0;
3452 else if (ret < 0)
3453 goto out; 3489 goto out;
3454 3490
3491 ret = 0;
3492
3455 /* 3493 /*
3456 * So if we were overcommitted it's possible that somebody else flushed 3494 * So if we were overcommitted it's possible that somebody else flushed
3457 * out enough space and we simply didn't have enough space to reclaim, 3495 * out enough space and we simply didn't have enough space to reclaim,
@@ -3462,11 +3500,11 @@ again:
3462 goto again; 3500 goto again;
3463 } 3501 }
3464 3502
3465 spin_lock(&space_info->lock);
3466 /* 3503 /*
3467 * Not enough space to be reclaimed, don't bother committing the 3504 * Not enough space to be reclaimed, don't bother committing the
3468 * transaction. 3505 * transaction.
3469 */ 3506 */
3507 spin_lock(&space_info->lock);
3470 if (space_info->bytes_pinned < orig_bytes) 3508 if (space_info->bytes_pinned < orig_bytes)
3471 ret = -ENOSPC; 3509 ret = -ENOSPC;
3472 spin_unlock(&space_info->lock); 3510 spin_unlock(&space_info->lock);
@@ -3474,10 +3512,13 @@ again:
3474 goto out; 3512 goto out;
3475 3513
3476 ret = -EAGAIN; 3514 ret = -EAGAIN;
3477 if (trans || committed) 3515 if (trans)
3478 goto out; 3516 goto out;
3479 3517
3480 ret = -ENOSPC; 3518 ret = -ENOSPC;
3519 if (committed)
3520 goto out;
3521
3481 trans = btrfs_join_transaction(root); 3522 trans = btrfs_join_transaction(root);
3482 if (IS_ERR(trans)) 3523 if (IS_ERR(trans))
3483 goto out; 3524 goto out;
@@ -3489,12 +3530,12 @@ again:
3489 } 3530 }
3490 3531
3491out: 3532out:
3492 if (reserved) { 3533 if (flushing) {
3493 spin_lock(&space_info->lock); 3534 spin_lock(&space_info->lock);
3494 space_info->bytes_reserved -= orig_bytes; 3535 space_info->flush = 0;
3536 wake_up_all(&space_info->wait);
3495 spin_unlock(&space_info->lock); 3537 spin_unlock(&space_info->lock);
3496 } 3538 }
3497
3498 return ret; 3539 return ret;
3499} 3540}
3500 3541
@@ -3704,7 +3745,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3704 if (commit_trans) { 3745 if (commit_trans) {
3705 if (trans) 3746 if (trans)
3706 return -EAGAIN; 3747 return -EAGAIN;
3707
3708 trans = btrfs_join_transaction(root); 3748 trans = btrfs_join_transaction(root);
3709 BUG_ON(IS_ERR(trans)); 3749 BUG_ON(IS_ERR(trans));
3710 ret = btrfs_commit_transaction(trans, root); 3750 ret = btrfs_commit_transaction(trans, root);
@@ -3874,26 +3914,6 @@ int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3874 return 0; 3914 return 0;
3875} 3915}
3876 3916
3877int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3878 struct btrfs_root *root,
3879 int num_items)
3880{
3881 u64 num_bytes;
3882 int ret;
3883
3884 if (num_items == 0 || root->fs_info->chunk_root == root)
3885 return 0;
3886
3887 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
3888 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3889 num_bytes);
3890 if (!ret) {
3891 trans->bytes_reserved += num_bytes;
3892 trans->block_rsv = &root->fs_info->trans_block_rsv;
3893 }
3894 return ret;
3895}
3896
3897void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3898 struct btrfs_root *root) 3918 struct btrfs_root *root)
3899{ 3919{
@@ -3944,6 +3964,30 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3944 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3945} 3965}
3946 3966
3967static unsigned drop_outstanding_extent(struct inode *inode)
3968{
3969 unsigned dropped_extents = 0;
3970
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--;
3974
3975 /*
 3976 * If we have at least as many outstanding extents as we have reserved,
 3977 * then we need to leave the reserved extents count alone.
3978 */
3979 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents)
3981 goto out;
3982
3983 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out:
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents;
3989}
3990
3947static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3948{ 3992{
3949 return num_bytes >>= 3; 3993 return num_bytes >>= 3;
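Editor's note on the accounting added above: drop_outstanding_extent() only releases reservations once the outstanding count drops below the reserved count, and then releases exactly the difference, keeping reserved >= outstanding as the steady-state invariant. A small arithmetic sketch of that behavior:

#include <assert.h>
#include <stdio.h>

static unsigned outstanding = 3, reserved = 2;

/* Returns how many reserved extents this drop released. */
static unsigned drop_outstanding(void)
{
	unsigned dropped = 0;

	assert(outstanding > 0);
	outstanding--;
	if (outstanding < reserved) {
		dropped = reserved - outstanding;
		reserved -= dropped;
	}
	return dropped;
}

int main(void)
{
	/* 3 -> 2: still >= reserved(2), nothing released */
	printf("dropped=%u\n", drop_outstanding());
	/* 2 -> 1: now below reserved, releases exactly 1 */
	printf("dropped=%u\n", drop_outstanding());
	return 0;
}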
@@ -3953,9 +3997,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3953{ 3997{
3954 struct btrfs_root *root = BTRFS_I(inode)->root; 3998 struct btrfs_root *root = BTRFS_I(inode)->root;
3955 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3956 u64 to_reserve; 4000 u64 to_reserve = 0;
3957 int nr_extents; 4001 unsigned nr_extents = 0;
3958 int reserved_extents;
3959 int ret; 4002 int ret;
3960 4003
3961 if (btrfs_transaction_in_commit(root->fs_info)) 4004 if (btrfs_transaction_in_commit(root->fs_info))
@@ -3963,66 +4006,49 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3963 4006
3964 num_bytes = ALIGN(num_bytes, root->sectorsize); 4007 num_bytes = ALIGN(num_bytes, root->sectorsize);
3965 4008
3966 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 4009 spin_lock(&BTRFS_I(inode)->lock);
3967 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents); 4010 BTRFS_I(inode)->outstanding_extents++;
4011
4012 if (BTRFS_I(inode)->outstanding_extents >
4013 BTRFS_I(inode)->reserved_extents) {
4014 nr_extents = BTRFS_I(inode)->outstanding_extents -
4015 BTRFS_I(inode)->reserved_extents;
4016 BTRFS_I(inode)->reserved_extents += nr_extents;
3968 4017
3969 if (nr_extents > reserved_extents) {
3970 nr_extents -= reserved_extents;
3971 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
3972 } else {
3973 nr_extents = 0;
3974 to_reserve = 0;
3975 } 4019 }
4020 spin_unlock(&BTRFS_I(inode)->lock);
3976 4021
3977 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4022 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3978 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1); 4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3979 if (ret) 4024 if (ret) {
4025 unsigned dropped;
4026 /*
 4027 * We don't need the return value since our reservation failed;
4028 * we just need to clean up our counter.
4029 */
4030 dropped = drop_outstanding_extent(inode);
4031 WARN_ON(dropped > 1);
3980 return ret; 4032 return ret;
3981 4033 }
3982 atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
3983 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3984 4034
3985 block_rsv_add_bytes(block_rsv, to_reserve, 1); 4035 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3986 4036
3987 if (block_rsv->size > 512 * 1024 * 1024)
3988 shrink_delalloc(NULL, root, to_reserve, 0);
3989
3990 return 0; 4037 return 0;
3991} 4038}
3992 4039
3993void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3994{ 4041{
3995 struct btrfs_root *root = BTRFS_I(inode)->root; 4042 struct btrfs_root *root = BTRFS_I(inode)->root;
3996 u64 to_free; 4043 u64 to_free = 0;
3997 int nr_extents; 4044 unsigned dropped;
3998 int reserved_extents;
3999 4045
4000 num_bytes = ALIGN(num_bytes, root->sectorsize); 4046 num_bytes = ALIGN(num_bytes, root->sectorsize);
4001 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4047 dropped = drop_outstanding_extent(inode);
4002 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
4003
4004 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
4005 do {
4006 int old, new;
4007
4008 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
4009 if (nr_extents >= reserved_extents) {
4010 nr_extents = 0;
4011 break;
4012 }
4013 old = reserved_extents;
4014 nr_extents = reserved_extents - nr_extents;
4015 new = reserved_extents - nr_extents;
4016 old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
4017 reserved_extents, new);
4018 if (likely(old == reserved_extents))
4019 break;
4020 reserved_extents = old;
4021 } while (1);
4022 4048
4023 to_free = calc_csum_metadata_size(inode, num_bytes); 4049 to_free = calc_csum_metadata_size(inode, num_bytes);
4024 if (nr_extents > 0) 4050 if (dropped > 0)
4025 to_free += btrfs_calc_trans_metadata_size(root, nr_extents); 4051 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4026 4052
4027 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4053 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4028 to_free); 4054 to_free);
@@ -4444,7 +4470,9 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4444 printk(KERN_ERR "umm, got %d back from search" 4470 printk(KERN_ERR "umm, got %d back from search"
4445 ", was looking for %llu\n", ret, 4471 ", was looking for %llu\n", ret,
4446 (unsigned long long)bytenr); 4472 (unsigned long long)bytenr);
4447 btrfs_print_leaf(extent_root, path->nodes[0]); 4473 if (ret > 0)
4474 btrfs_print_leaf(extent_root,
4475 path->nodes[0]);
4448 } 4476 }
4449 BUG_ON(ret); 4477 BUG_ON(ret);
4450 extent_slot = path->slots[0]; 4478 extent_slot = path->slots[0];
@@ -4990,14 +5018,10 @@ have_block_group:
4990 } 5018 }
4991 5019
4992 /* 5020 /*
4993 * We only want to start kthread caching if we are at 5021 * The caching workers are limited to 2 threads, so we
4994 * the point where we will wait for caching to make 5022 * can queue as much work as we care to.
4995 * progress, or if our ideal search is over and we've
4996 * found somebody to start caching.
4997 */ 5023 */
4998 if (loop > LOOP_CACHING_NOWAIT || 5024 if (loop > LOOP_FIND_IDEAL) {
4999 (loop > LOOP_FIND_IDEAL &&
5000 atomic_read(&space_info->caching_threads) < 2)) {
5001 ret = cache_block_group(block_group, trans, 5025 ret = cache_block_group(block_group, trans,
5002 orig_root, 0); 5026 orig_root, 0);
5003 BUG_ON(ret); 5027 BUG_ON(ret);
@@ -5065,7 +5089,9 @@ have_block_group:
 5065 * group it does point to and try again 5089 * group it does point to and try again
5066 */ 5090 */
5067 if (!last_ptr_loop && last_ptr->block_group && 5091 if (!last_ptr_loop && last_ptr->block_group &&
5068 last_ptr->block_group != block_group) { 5092 last_ptr->block_group != block_group &&
5093 index <=
5094 get_block_group_index(last_ptr->block_group)) {
5069 5095
5070 btrfs_put_block_group(block_group); 5096 btrfs_put_block_group(block_group);
5071 block_group = last_ptr->block_group; 5097 block_group = last_ptr->block_group;
@@ -5219,8 +5245,7 @@ loop:
5219 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 5245 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
5220 found_uncached_bg = false; 5246 found_uncached_bg = false;
5221 loop++; 5247 loop++;
5222 if (!ideal_cache_percent && 5248 if (!ideal_cache_percent)
5223 atomic_read(&space_info->caching_threads))
5224 goto search; 5249 goto search;
5225 5250
5226 /* 5251 /*
@@ -5494,7 +5519,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
5494 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref); 5519 u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
5495 5520
5496 path = btrfs_alloc_path(); 5521 path = btrfs_alloc_path();
5497 BUG_ON(!path); 5522 if (!path)
5523 return -ENOMEM;
5498 5524
5499 path->leave_spinning = 1; 5525 path->leave_spinning = 1;
5500 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 5526 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -5623,7 +5649,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
5623 if (!buf) 5649 if (!buf)
5624 return ERR_PTR(-ENOMEM); 5650 return ERR_PTR(-ENOMEM);
5625 btrfs_set_header_generation(buf, trans->transid); 5651 btrfs_set_header_generation(buf, trans->transid);
5626 btrfs_set_buffer_lockdep_class(buf, level); 5652 btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
5627 btrfs_tree_lock(buf); 5653 btrfs_tree_lock(buf);
5628 clean_tree_block(trans, root, buf); 5654 clean_tree_block(trans, root, buf);
5629 5655
@@ -5910,7 +5936,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5910 return 1; 5936 return 1;
5911 5937
5912 if (path->locks[level] && !wc->keep_locks) { 5938 if (path->locks[level] && !wc->keep_locks) {
5913 btrfs_tree_unlock(eb); 5939 btrfs_tree_unlock_rw(eb, path->locks[level]);
5914 path->locks[level] = 0; 5940 path->locks[level] = 0;
5915 } 5941 }
5916 return 0; 5942 return 0;
@@ -5934,7 +5960,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
5934 * keep the tree lock 5960 * keep the tree lock
5935 */ 5961 */
5936 if (path->locks[level] && level > 0) { 5962 if (path->locks[level] && level > 0) {
5937 btrfs_tree_unlock(eb); 5963 btrfs_tree_unlock_rw(eb, path->locks[level]);
5938 path->locks[level] = 0; 5964 path->locks[level] = 0;
5939 } 5965 }
5940 return 0; 5966 return 0;
@@ -6047,7 +6073,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6047 BUG_ON(level != btrfs_header_level(next)); 6073 BUG_ON(level != btrfs_header_level(next));
6048 path->nodes[level] = next; 6074 path->nodes[level] = next;
6049 path->slots[level] = 0; 6075 path->slots[level] = 0;
6050 path->locks[level] = 1; 6076 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6051 wc->level = level; 6077 wc->level = level;
6052 if (wc->level == 1) 6078 if (wc->level == 1)
6053 wc->reada_slot = 0; 6079 wc->reada_slot = 0;
@@ -6118,7 +6144,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6118 BUG_ON(level == 0); 6144 BUG_ON(level == 0);
6119 btrfs_tree_lock(eb); 6145 btrfs_tree_lock(eb);
6120 btrfs_set_lock_blocking(eb); 6146 btrfs_set_lock_blocking(eb);
6121 path->locks[level] = 1; 6147 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6122 6148
6123 ret = btrfs_lookup_extent_info(trans, root, 6149 ret = btrfs_lookup_extent_info(trans, root,
6124 eb->start, eb->len, 6150 eb->start, eb->len,
@@ -6127,8 +6153,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6127 BUG_ON(ret); 6153 BUG_ON(ret);
6128 BUG_ON(wc->refs[level] == 0); 6154 BUG_ON(wc->refs[level] == 0);
6129 if (wc->refs[level] == 1) { 6155 if (wc->refs[level] == 1) {
6130 btrfs_tree_unlock(eb); 6156 btrfs_tree_unlock_rw(eb, path->locks[level]);
6131 path->locks[level] = 0;
6132 return 1; 6157 return 1;
6133 } 6158 }
6134 } 6159 }
@@ -6150,7 +6175,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6150 btrfs_header_generation(eb) == trans->transid) { 6175 btrfs_header_generation(eb) == trans->transid) {
6151 btrfs_tree_lock(eb); 6176 btrfs_tree_lock(eb);
6152 btrfs_set_lock_blocking(eb); 6177 btrfs_set_lock_blocking(eb);
6153 path->locks[level] = 1; 6178 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6154 } 6179 }
6155 clean_tree_block(trans, root, eb); 6180 clean_tree_block(trans, root, eb);
6156 } 6181 }
@@ -6229,7 +6254,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6229 return 0; 6254 return 0;
6230 6255
6231 if (path->locks[level]) { 6256 if (path->locks[level]) {
6232 btrfs_tree_unlock(path->nodes[level]); 6257 btrfs_tree_unlock_rw(path->nodes[level],
6258 path->locks[level]);
6233 path->locks[level] = 0; 6259 path->locks[level] = 0;
6234 } 6260 }
6235 free_extent_buffer(path->nodes[level]); 6261 free_extent_buffer(path->nodes[level]);
@@ -6251,8 +6277,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6251 * also make sure backrefs for the shared block and all lower level 6277 * also make sure backrefs for the shared block and all lower level
6252 * blocks are properly updated. 6278 * blocks are properly updated.
6253 */ 6279 */
6254int btrfs_drop_snapshot(struct btrfs_root *root, 6280void btrfs_drop_snapshot(struct btrfs_root *root,
6255 struct btrfs_block_rsv *block_rsv, int update_ref) 6281 struct btrfs_block_rsv *block_rsv, int update_ref)
6256{ 6282{
6257 struct btrfs_path *path; 6283 struct btrfs_path *path;
6258 struct btrfs_trans_handle *trans; 6284 struct btrfs_trans_handle *trans;
@@ -6265,10 +6291,17 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6265 int level; 6291 int level;
6266 6292
6267 path = btrfs_alloc_path(); 6293 path = btrfs_alloc_path();
6268 BUG_ON(!path); 6294 if (!path) {
6295 err = -ENOMEM;
6296 goto out;
6297 }
6269 6298
6270 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6299 wc = kzalloc(sizeof(*wc), GFP_NOFS);
6271 BUG_ON(!wc); 6300 if (!wc) {
6301 btrfs_free_path(path);
6302 err = -ENOMEM;
6303 goto out;
6304 }
6272 6305
6273 trans = btrfs_start_transaction(tree_root, 0); 6306 trans = btrfs_start_transaction(tree_root, 0);
6274 BUG_ON(IS_ERR(trans)); 6307 BUG_ON(IS_ERR(trans));
@@ -6281,7 +6314,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6281 path->nodes[level] = btrfs_lock_root_node(root); 6314 path->nodes[level] = btrfs_lock_root_node(root);
6282 btrfs_set_lock_blocking(path->nodes[level]); 6315 btrfs_set_lock_blocking(path->nodes[level]);
6283 path->slots[level] = 0; 6316 path->slots[level] = 0;
6284 path->locks[level] = 1; 6317 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6285 memset(&wc->update_progress, 0, 6318 memset(&wc->update_progress, 0,
6286 sizeof(wc->update_progress)); 6319 sizeof(wc->update_progress));
6287 } else { 6320 } else {
@@ -6296,7 +6329,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6296 path->lowest_level = 0; 6329 path->lowest_level = 0;
6297 if (ret < 0) { 6330 if (ret < 0) {
6298 err = ret; 6331 err = ret;
6299 goto out; 6332 goto out_free;
6300 } 6333 }
6301 WARN_ON(ret > 0); 6334 WARN_ON(ret > 0);
6302 6335
@@ -6403,11 +6436,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6403 free_extent_buffer(root->commit_root); 6436 free_extent_buffer(root->commit_root);
6404 kfree(root); 6437 kfree(root);
6405 } 6438 }
6406out: 6439out_free:
6407 btrfs_end_transaction_throttle(trans, tree_root); 6440 btrfs_end_transaction_throttle(trans, tree_root);
6408 kfree(wc); 6441 kfree(wc);
6409 btrfs_free_path(path); 6442 btrfs_free_path(path);
6410 return err; 6443out:
6444 if (err)
6445 btrfs_std_error(root->fs_info, err);
6446 return;
6411} 6447}
6412 6448
6413/* 6449/*
@@ -6449,7 +6485,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6449 level = btrfs_header_level(node); 6485 level = btrfs_header_level(node);
6450 path->nodes[level] = node; 6486 path->nodes[level] = node;
6451 path->slots[level] = 0; 6487 path->slots[level] = 0;
6452 path->locks[level] = 1; 6488 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6453 6489
6454 wc->refs[parent_level] = 1; 6490 wc->refs[parent_level] = 1;
6455 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF; 6491 wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
@@ -6524,30 +6560,48 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
6524 return flags; 6560 return flags;
6525} 6561}
6526 6562
6527static int set_block_group_ro(struct btrfs_block_group_cache *cache) 6563static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6528{ 6564{
6529 struct btrfs_space_info *sinfo = cache->space_info; 6565 struct btrfs_space_info *sinfo = cache->space_info;
6530 u64 num_bytes; 6566 u64 num_bytes;
6567 u64 min_allocable_bytes;
6531 int ret = -ENOSPC; 6568 int ret = -ENOSPC;
6532 6569
6533 if (cache->ro) 6570
6534 return 0; 6571 /*
6572 * We need some metadata space and system metadata space for
6573 * allocating chunks in some corner cases until we force to set
6574 * it to be readonly.
6575 */
6576 if ((sinfo->flags &
6577 (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
6578 !force)
6579 min_allocable_bytes = 1 * 1024 * 1024;
6580 else
6581 min_allocable_bytes = 0;
6535 6582
6536 spin_lock(&sinfo->lock); 6583 spin_lock(&sinfo->lock);
6537 spin_lock(&cache->lock); 6584 spin_lock(&cache->lock);
6585
6586 if (cache->ro) {
6587 ret = 0;
6588 goto out;
6589 }
6590
6538 num_bytes = cache->key.offset - cache->reserved - cache->pinned - 6591 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
6539 cache->bytes_super - btrfs_block_group_used(&cache->item); 6592 cache->bytes_super - btrfs_block_group_used(&cache->item);
6540 6593
6541 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6542 sinfo->bytes_may_use + sinfo->bytes_readonly + 6595 sinfo->bytes_may_use + sinfo->bytes_readonly +
6543 cache->reserved_pinned + num_bytes <= sinfo->total_bytes) { 6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <=
6597 sinfo->total_bytes) {
6544 sinfo->bytes_readonly += num_bytes; 6598 sinfo->bytes_readonly += num_bytes;
6545 sinfo->bytes_reserved += cache->reserved_pinned; 6599 sinfo->bytes_reserved += cache->reserved_pinned;
6546 cache->reserved_pinned = 0; 6600 cache->reserved_pinned = 0;
6547 cache->ro = 1; 6601 cache->ro = 1;
6548 ret = 0; 6602 ret = 0;
6549 } 6603 }
6550 6604out:
6551 spin_unlock(&cache->lock); 6605 spin_unlock(&cache->lock);
6552 spin_unlock(&sinfo->lock); 6606 spin_unlock(&sinfo->lock);
6553 return ret; 6607 return ret;
@@ -6571,7 +6625,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
6571 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 6625 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6572 CHUNK_ALLOC_FORCE); 6626 CHUNK_ALLOC_FORCE);
6573 6627
6574 ret = set_block_group_ro(cache); 6628 ret = set_block_group_ro(cache, 0);
6575 if (!ret) 6629 if (!ret)
6576 goto out; 6630 goto out;
6577 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 6631 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
@@ -6579,7 +6633,7 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
6579 CHUNK_ALLOC_FORCE); 6633 CHUNK_ALLOC_FORCE);
6580 if (ret < 0) 6634 if (ret < 0)
6581 goto out; 6635 goto out;
6582 ret = set_block_group_ro(cache); 6636 ret = set_block_group_ro(cache, 0);
6583out: 6637out:
6584 btrfs_end_transaction(trans, root); 6638 btrfs_end_transaction(trans, root);
6585 return ret; 6639 return ret;
@@ -6680,6 +6734,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6680 struct btrfs_space_info *space_info; 6734 struct btrfs_space_info *space_info;
6681 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 6735 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6682 struct btrfs_device *device; 6736 struct btrfs_device *device;
6737 u64 min_free;
6738 u64 dev_min = 1;
6739 u64 dev_nr = 0;
6740 int index;
6683 int full = 0; 6741 int full = 0;
6684 int ret = 0; 6742 int ret = 0;
6685 6743
@@ -6689,8 +6747,10 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6689 if (!block_group) 6747 if (!block_group)
6690 return -1; 6748 return -1;
6691 6749
6750 min_free = btrfs_block_group_used(&block_group->item);
6751
6692 /* no bytes used, we're good */ 6752 /* no bytes used, we're good */
6693 if (!btrfs_block_group_used(&block_group->item)) 6753 if (!min_free)
6694 goto out; 6754 goto out;
6695 6755
6696 space_info = block_group->space_info; 6756 space_info = block_group->space_info;
@@ -6706,10 +6766,9 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6706 * all of the extents from this block group. If we can, we're good 6766 * all of the extents from this block group. If we can, we're good
6707 */ 6767 */
6708 if ((space_info->total_bytes != block_group->key.offset) && 6768 if ((space_info->total_bytes != block_group->key.offset) &&
6709 (space_info->bytes_used + space_info->bytes_reserved + 6769 (space_info->bytes_used + space_info->bytes_reserved +
6710 space_info->bytes_pinned + space_info->bytes_readonly + 6770 space_info->bytes_pinned + space_info->bytes_readonly +
6711 btrfs_block_group_used(&block_group->item) < 6771 min_free < space_info->total_bytes)) {
6712 space_info->total_bytes)) {
6713 spin_unlock(&space_info->lock); 6772 spin_unlock(&space_info->lock);
6714 goto out; 6773 goto out;
6715 } 6774 }
@@ -6726,9 +6785,31 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6726 if (full) 6785 if (full)
6727 goto out; 6786 goto out;
6728 6787
6788 /*
6789 * index:
6790 * 0: raid10
6791 * 1: raid1
6792 * 2: dup
6793 * 3: raid0
6794 * 4: single
6795 */
6796 index = get_block_group_index(block_group);
6797 if (index == 0) {
6798 dev_min = 4;
6799 /* Divide by 2 */
6800 min_free >>= 1;
6801 } else if (index == 1) {
6802 dev_min = 2;
6803 } else if (index == 2) {
6804 /* Multiply by 2 */
6805 min_free <<= 1;
6806 } else if (index == 3) {
6807 dev_min = fs_devices->rw_devices;
6808 do_div(min_free, dev_min);
6809 }
6810
6729 mutex_lock(&root->fs_info->chunk_mutex); 6811 mutex_lock(&root->fs_info->chunk_mutex);
6730 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 6812 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
6731 u64 min_free = btrfs_block_group_used(&block_group->item);
6732 u64 dev_offset; 6813 u64 dev_offset;
6733 6814
6734 /* 6815 /*
@@ -6739,7 +6820,11 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
6739 ret = find_free_dev_extent(NULL, device, min_free, 6820 ret = find_free_dev_extent(NULL, device, min_free,
6740 &dev_offset, NULL); 6821 &dev_offset, NULL);
6741 if (!ret) 6822 if (!ret)
6823 dev_nr++;
6824
6825 if (dev_nr >= dev_min)
6742 break; 6826 break;
6827
6743 ret = -1; 6828 ret = -1;
6744 } 6829 }
6745 } 6830 }
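
Note: the index comment above doubles as the specification for the per-profile maths: min_free is the amount that must fit on each device, dev_min the number of devices that must each accommodate it. A worked illustration (not from the patch) for a block group with 1GiB still used:

	u64 min_free = 1024ULL * 1024 * 1024;	/* bytes used in the group */
	u64 dev_min = 1;

	switch (index) {
	case 0:	/* raid10: two mirrors of a two-device stripe */
		dev_min = 4;
		min_free >>= 1;		/* 512MiB on each of 4 devices */
		break;
	case 1:	/* raid1: a full copy on each of 2 devices */
		dev_min = 2;
		break;
	case 2:	/* dup: both copies land on one device */
		min_free <<= 1;		/* 2GiB on a single device */
		break;
	case 3:	/* raid0: spread evenly over every writable device */
		dev_min = rw_devices;	/* stand-in for fs_devices->rw_devices */
		do_div(min_free, dev_min);
		break;
	}
	/* index 4 (single): defaults of dev_min = 1, min_free unchanged */
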
@@ -7016,7 +7101,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7016 7101
7017 set_avail_alloc_bits(root->fs_info, cache->flags); 7102 set_avail_alloc_bits(root->fs_info, cache->flags);
7018 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7103 if (btrfs_chunk_readonly(root, cache->key.objectid))
7019 set_block_group_ro(cache); 7104 set_block_group_ro(cache, 1);
7020 } 7105 }
7021 7106
7022 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { 7107 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
@@ -7030,9 +7115,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7030 * mirrored block groups. 7115 * mirrored block groups.
7031 */ 7116 */
7032 list_for_each_entry(cache, &space_info->block_groups[3], list) 7117 list_for_each_entry(cache, &space_info->block_groups[3], list)
7033 set_block_group_ro(cache); 7118 set_block_group_ro(cache, 1);
7034 list_for_each_entry(cache, &space_info->block_groups[4], list) 7119 list_for_each_entry(cache, &space_info->block_groups[4], list)
7035 set_block_group_ro(cache); 7120 set_block_group_ro(cache, 1);
7036 } 7121 }
7037 7122
7038 init_global_block_rsv(info); 7123 init_global_block_rsv(info);
@@ -7162,11 +7247,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7162 spin_unlock(&cluster->refill_lock); 7247 spin_unlock(&cluster->refill_lock);
7163 7248
7164 path = btrfs_alloc_path(); 7249 path = btrfs_alloc_path();
7165 BUG_ON(!path); 7250 if (!path) {
7251 ret = -ENOMEM;
7252 goto out;
7253 }
7166 7254
7167 inode = lookup_free_space_inode(root, block_group, path); 7255 inode = lookup_free_space_inode(root, block_group, path);
7168 if (!IS_ERR(inode)) { 7256 if (!IS_ERR(inode)) {
7169 btrfs_orphan_add(trans, inode); 7257 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret);
7170 clear_nlink(inode); 7259 clear_nlink(inode);
7171 /* One for the block groups ref */ 7260 /* One for the block groups ref */
7172 spin_lock(&block_group->lock); 7261 spin_lock(&block_group->lock);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7055d11c1ef..d418164a35f 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -254,14 +254,14 @@ static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
254 * 254 *
255 * This should be called with the tree lock held. 255 * This should be called with the tree lock held.
256 */ 256 */
257static int merge_state(struct extent_io_tree *tree, 257static void merge_state(struct extent_io_tree *tree,
258 struct extent_state *state) 258 struct extent_state *state)
259{ 259{
260 struct extent_state *other; 260 struct extent_state *other;
261 struct rb_node *other_node; 261 struct rb_node *other_node;
262 262
263 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 263 if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
264 return 0; 264 return;
265 265
266 other_node = rb_prev(&state->rb_node); 266 other_node = rb_prev(&state->rb_node);
267 if (other_node) { 267 if (other_node) {
@@ -281,26 +281,19 @@ static int merge_state(struct extent_io_tree *tree,
281 if (other->start == state->end + 1 && 281 if (other->start == state->end + 1 &&
282 other->state == state->state) { 282 other->state == state->state) {
283 merge_cb(tree, state, other); 283 merge_cb(tree, state, other);
284 other->start = state->start; 284 state->end = other->end;
285 state->tree = NULL; 285 other->tree = NULL;
286 rb_erase(&state->rb_node, &tree->state); 286 rb_erase(&other->rb_node, &tree->state);
287 free_extent_state(state); 287 free_extent_state(other);
288 state = NULL;
289 } 288 }
290 } 289 }
291
292 return 0;
293} 290}
294 291
295static int set_state_cb(struct extent_io_tree *tree, 292static void set_state_cb(struct extent_io_tree *tree,
296 struct extent_state *state, int *bits) 293 struct extent_state *state, int *bits)
297{ 294{
298 if (tree->ops && tree->ops->set_bit_hook) { 295 if (tree->ops && tree->ops->set_bit_hook)
299 return tree->ops->set_bit_hook(tree->mapping->host, 296 tree->ops->set_bit_hook(tree->mapping->host, state, bits);
300 state, bits);
301 }
302
303 return 0;
304} 297}
305 298
306static void clear_state_cb(struct extent_io_tree *tree, 299static void clear_state_cb(struct extent_io_tree *tree,
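
Note the reversed merge direction above: merge_state() now grows state in place (state->end = other->end) and frees the neighbour, where it previously grew the neighbour and freed state. Any pointer the caller or a cached_state holds to state therefore stays valid across the merge, which is what lets the lookups later in this patch accept any offset inside the cached range instead of demanding an exact first byte:

	/* after merging, the caller's state covers the union:
	 *   [state 0..4095][other 4096..8191]  ->  [state 0..8191]
	 * so a cached-state hit only needs 'start' to fall inside it: */
	if (cached && cached->tree &&
	    cached->start <= start && cached->end > start)
		state = cached;
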
@@ -310,6 +303,9 @@ static void clear_state_cb(struct extent_io_tree *tree,
310 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 303 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
311} 304}
312 305
306static void set_state_bits(struct extent_io_tree *tree,
307 struct extent_state *state, int *bits);
308
313/* 309/*
314 * insert an extent_state struct into the tree. 'bits' are set on the 310 * insert an extent_state struct into the tree. 'bits' are set on the
315 * struct before it is inserted. 311 * struct before it is inserted.
@@ -325,8 +321,6 @@ static int insert_state(struct extent_io_tree *tree,
325 int *bits) 321 int *bits)
326{ 322{
327 struct rb_node *node; 323 struct rb_node *node;
328 int bits_to_set = *bits & ~EXTENT_CTLBITS;
329 int ret;
330 324
331 if (end < start) { 325 if (end < start) {
332 printk(KERN_ERR "btrfs end < start %llu %llu\n", 326 printk(KERN_ERR "btrfs end < start %llu %llu\n",
@@ -336,13 +330,9 @@ static int insert_state(struct extent_io_tree *tree,
336 } 330 }
337 state->start = start; 331 state->start = start;
338 state->end = end; 332 state->end = end;
339 ret = set_state_cb(tree, state, bits);
340 if (ret)
341 return ret;
342 333
343 if (bits_to_set & EXTENT_DIRTY) 334 set_state_bits(tree, state, bits);
344 tree->dirty_bytes += end - start + 1; 335
345 state->state |= bits_to_set;
346 node = tree_insert(&tree->state, end, &state->rb_node); 336 node = tree_insert(&tree->state, end, &state->rb_node);
347 if (node) { 337 if (node) {
348 struct extent_state *found; 338 struct extent_state *found;
@@ -351,7 +341,6 @@ static int insert_state(struct extent_io_tree *tree,
351 "%llu %llu\n", (unsigned long long)found->start, 341 "%llu %llu\n", (unsigned long long)found->start,
352 (unsigned long long)found->end, 342 (unsigned long long)found->end,
353 (unsigned long long)start, (unsigned long long)end); 343 (unsigned long long)start, (unsigned long long)end);
354 free_extent_state(state);
355 return -EEXIST; 344 return -EEXIST;
356 } 345 }
357 state->tree = tree; 346 state->tree = tree;
@@ -359,13 +348,11 @@ static int insert_state(struct extent_io_tree *tree,
359 return 0; 348 return 0;
360} 349}
361 350
362static int split_cb(struct extent_io_tree *tree, struct extent_state *orig, 351static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
363 u64 split) 352 u64 split)
364{ 353{
365 if (tree->ops && tree->ops->split_extent_hook) 354 if (tree->ops && tree->ops->split_extent_hook)
366 return tree->ops->split_extent_hook(tree->mapping->host, 355 tree->ops->split_extent_hook(tree->mapping->host, orig, split);
367 orig, split);
368 return 0;
369} 356}
370 357
371/* 358/*
@@ -500,7 +487,8 @@ again:
500 cached_state = NULL; 487 cached_state = NULL;
501 } 488 }
502 489
503 if (cached && cached->tree && cached->start == start) { 490 if (cached && cached->tree && cached->start <= start &&
491 cached->end > start) {
504 if (clear) 492 if (clear)
505 atomic_dec(&cached->refs); 493 atomic_dec(&cached->refs);
506 state = cached; 494 state = cached;
@@ -660,34 +648,25 @@ again:
660 if (start > end) 648 if (start > end)
661 break; 649 break;
662 650
663 if (need_resched()) { 651 cond_resched_lock(&tree->lock);
664 spin_unlock(&tree->lock);
665 cond_resched();
666 spin_lock(&tree->lock);
667 }
668 } 652 }
669out: 653out:
670 spin_unlock(&tree->lock); 654 spin_unlock(&tree->lock);
671 return 0; 655 return 0;
672} 656}
673 657
674static int set_state_bits(struct extent_io_tree *tree, 658static void set_state_bits(struct extent_io_tree *tree,
675 struct extent_state *state, 659 struct extent_state *state,
676 int *bits) 660 int *bits)
677{ 661{
678 int ret;
679 int bits_to_set = *bits & ~EXTENT_CTLBITS; 662 int bits_to_set = *bits & ~EXTENT_CTLBITS;
680 663
681 ret = set_state_cb(tree, state, bits); 664 set_state_cb(tree, state, bits);
682 if (ret)
683 return ret;
684 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 665 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
685 u64 range = state->end - state->start + 1; 666 u64 range = state->end - state->start + 1;
686 tree->dirty_bytes += range; 667 tree->dirty_bytes += range;
687 } 668 }
688 state->state |= bits_to_set; 669 state->state |= bits_to_set;
689
690 return 0;
691} 670}
692 671
693static void cache_state(struct extent_state *state, 672static void cache_state(struct extent_state *state,
@@ -742,7 +721,8 @@ again:
742 spin_lock(&tree->lock); 721 spin_lock(&tree->lock);
743 if (cached_state && *cached_state) { 722 if (cached_state && *cached_state) {
744 state = *cached_state; 723 state = *cached_state;
745 if (state->start == start && state->tree) { 724 if (state->start <= start && state->end > start &&
725 state->tree) {
746 node = &state->rb_node; 726 node = &state->rb_node;
747 goto hit_next; 727 goto hit_next;
748 } 728 }
@@ -779,17 +759,15 @@ hit_next:
779 goto out; 759 goto out;
780 } 760 }
781 761
782 err = set_state_bits(tree, state, &bits); 762 set_state_bits(tree, state, &bits);
783 if (err)
784 goto out;
785 763
786 next_node = rb_next(node);
787 cache_state(state, cached_state); 764 cache_state(state, cached_state);
788 merge_state(tree, state); 765 merge_state(tree, state);
789 if (last_end == (u64)-1) 766 if (last_end == (u64)-1)
790 goto out; 767 goto out;
791 768
792 start = last_end + 1; 769 start = last_end + 1;
770 next_node = rb_next(&state->rb_node);
793 if (next_node && start < end && prealloc && !need_resched()) { 771 if (next_node && start < end && prealloc && !need_resched()) {
794 state = rb_entry(next_node, struct extent_state, 772 state = rb_entry(next_node, struct extent_state,
795 rb_node); 773 rb_node);
@@ -830,9 +808,7 @@ hit_next:
830 if (err) 808 if (err)
831 goto out; 809 goto out;
832 if (state->end <= end) { 810 if (state->end <= end) {
833 err = set_state_bits(tree, state, &bits); 811 set_state_bits(tree, state, &bits);
834 if (err)
835 goto out;
836 cache_state(state, cached_state); 812 cache_state(state, cached_state);
837 merge_state(tree, state); 813 merge_state(tree, state);
838 if (last_end == (u64)-1) 814 if (last_end == (u64)-1)
@@ -862,7 +838,6 @@ hit_next:
862 * Avoid to free 'prealloc' if it can be merged with 838 * Avoid to free 'prealloc' if it can be merged with
863 * the later extent. 839 * the later extent.
864 */ 840 */
865 atomic_inc(&prealloc->refs);
866 err = insert_state(tree, prealloc, start, this_end, 841 err = insert_state(tree, prealloc, start, this_end,
867 &bits); 842 &bits);
868 BUG_ON(err == -EEXIST); 843 BUG_ON(err == -EEXIST);
@@ -872,7 +847,6 @@ hit_next:
872 goto out; 847 goto out;
873 } 848 }
874 cache_state(prealloc, cached_state); 849 cache_state(prealloc, cached_state);
875 free_extent_state(prealloc);
876 prealloc = NULL; 850 prealloc = NULL;
877 start = this_end + 1; 851 start = this_end + 1;
878 goto search_again; 852 goto search_again;
@@ -895,11 +869,7 @@ hit_next:
895 err = split_state(tree, state, prealloc, end + 1); 869 err = split_state(tree, state, prealloc, end + 1);
896 BUG_ON(err == -EEXIST); 870 BUG_ON(err == -EEXIST);
897 871
898 err = set_state_bits(tree, prealloc, &bits); 872 set_state_bits(tree, prealloc, &bits);
899 if (err) {
900 prealloc = NULL;
901 goto out;
902 }
903 cache_state(prealloc, cached_state); 873 cache_state(prealloc, cached_state);
904 merge_state(tree, prealloc); 874 merge_state(tree, prealloc);
905 prealloc = NULL; 875 prealloc = NULL;
@@ -1061,46 +1031,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1061 return 0; 1031 return 0;
1062} 1032}
1063 1033
1064/*
1065 * find the first offset in the io tree with 'bits' set. zero is
1066 * returned if we find something, and *start_ret and *end_ret are
1067 * set to reflect the state struct that was found.
1068 *
1069 * If nothing was found, 1 is returned, < 0 on error
1070 */
1071int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1072 u64 *start_ret, u64 *end_ret, int bits)
1073{
1074 struct rb_node *node;
1075 struct extent_state *state;
1076 int ret = 1;
1077
1078 spin_lock(&tree->lock);
1079 /*
1080 * this search will find all the extents that end after
1081 * our range starts.
1082 */
1083 node = tree_search(tree, start);
1084 if (!node)
1085 goto out;
1086
1087 while (1) {
1088 state = rb_entry(node, struct extent_state, rb_node);
1089 if (state->end >= start && (state->state & bits)) {
1090 *start_ret = state->start;
1091 *end_ret = state->end;
1092 ret = 0;
1093 break;
1094 }
1095 node = rb_next(node);
1096 if (!node)
1097 break;
1098 }
1099out:
1100 spin_unlock(&tree->lock);
1101 return ret;
1102}
1103
1104/* find the first state struct with 'bits' set after 'start', and 1034/* find the first state struct with 'bits' set after 'start', and
1105 * return it. tree->lock must be held. NULL will be returned if 1035 * return it. tree->lock must be held. NULL will be returned if
1106 * nothing was found after 'start' 1036 * nothing was found after 'start'
@@ -1133,6 +1063,30 @@ out:
1133} 1063}
1134 1064
1135/* 1065/*
1066 * find the first offset in the io tree with 'bits' set. zero is
1067 * returned if we find something, and *start_ret and *end_ret are
1068 * set to reflect the state struct that was found.
1069 *
1070 * If nothing was found, 1 is returned, < 0 on error
1071 */
1072int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1073 u64 *start_ret, u64 *end_ret, int bits)
1074{
1075 struct extent_state *state;
1076 int ret = 1;
1077
1078 spin_lock(&tree->lock);
1079 state = find_first_extent_bit_state(tree, start, bits);
1080 if (state) {
1081 *start_ret = state->start;
1082 *end_ret = state->end;
1083 ret = 0;
1084 }
1085 spin_unlock(&tree->lock);
1086 return ret;
1087}
1088
1089/*
1136 * find a contiguous range of bytes in the file marked as delalloc, not 1090 * find a contiguous range of bytes in the file marked as delalloc, not
1137 * more than 'max_bytes'. start and end are used to return the range, 1091 * more than 'max_bytes'. start and end are used to return the range,
1138 * 1092 *
@@ -1564,7 +1518,8 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1564 int bitset = 0; 1518 int bitset = 0;
1565 1519
1566 spin_lock(&tree->lock); 1520 spin_lock(&tree->lock);
1567 if (cached && cached->tree && cached->start == start) 1521 if (cached && cached->tree && cached->start <= start &&
1522 cached->end > start)
1568 node = &cached->rb_node; 1523 node = &cached->rb_node;
1569 else 1524 else
1570 node = tree_search(tree, start); 1525 node = tree_search(tree, start);
@@ -2432,6 +2387,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2432 pgoff_t index; 2387 pgoff_t index;
2433 pgoff_t end; /* Inclusive */ 2388 pgoff_t end; /* Inclusive */
2434 int scanned = 0; 2389 int scanned = 0;
2390 int tag;
2435 2391
2436 pagevec_init(&pvec, 0); 2392 pagevec_init(&pvec, 0);
2437 if (wbc->range_cyclic) { 2393 if (wbc->range_cyclic) {
@@ -2442,11 +2398,16 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2442 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2398 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2443 scanned = 1; 2399 scanned = 1;
2444 } 2400 }
2401 if (wbc->sync_mode == WB_SYNC_ALL)
2402 tag = PAGECACHE_TAG_TOWRITE;
2403 else
2404 tag = PAGECACHE_TAG_DIRTY;
2445retry: 2405retry:
2406 if (wbc->sync_mode == WB_SYNC_ALL)
2407 tag_pages_for_writeback(mapping, index, end);
2446 while (!done && !nr_to_write_done && (index <= end) && 2408 while (!done && !nr_to_write_done && (index <= end) &&
2447 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 2409 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2448 PAGECACHE_TAG_DIRTY, min(end - index, 2410 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2449 (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2450 unsigned i; 2411 unsigned i;
2451 2412
2452 scanned = 1; 2413 scanned = 1;
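
Note: using PAGECACHE_TAG_TOWRITE for WB_SYNC_ALL avoids a writeback livelock. tag_pages_for_writeback() converts the DIRTY tags in [index, end] to TOWRITE once, up front, so pages redirtied while the sweep runs are left for the next sync instead of being revisited indefinitely. The resulting loop shape, condensed from the hunk above:

	if (wbc->sync_mode == WB_SYNC_ALL)
		tag_pages_for_writeback(mapping, index, end);
	while (!done && index <= end &&
	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1))) {
		/* write out the pages found under 'tag' */
	}
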
@@ -2541,7 +2502,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2541 struct writeback_control *wbc) 2502 struct writeback_control *wbc)
2542{ 2503{
2543 int ret; 2504 int ret;
2544 struct address_space *mapping = page->mapping;
2545 struct extent_page_data epd = { 2505 struct extent_page_data epd = {
2546 .bio = NULL, 2506 .bio = NULL,
2547 .tree = tree, 2507 .tree = tree,
@@ -2549,18 +2509,9 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2549 .extent_locked = 0, 2509 .extent_locked = 0,
2550 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2510 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2551 }; 2511 };
2552 struct writeback_control wbc_writepages = {
2553 .sync_mode = wbc->sync_mode,
2554 .older_than_this = NULL,
2555 .nr_to_write = 64,
2556 .range_start = page_offset(page) + PAGE_CACHE_SIZE,
2557 .range_end = (loff_t)-1,
2558 };
2559 2512
2560 ret = __extent_writepage(page, wbc, &epd); 2513 ret = __extent_writepage(page, wbc, &epd);
2561 2514
2562 extent_write_cache_pages(tree, mapping, &wbc_writepages,
2563 __extent_writepage, &epd, flush_write_bio);
2564 flush_epd_write_bio(&epd); 2515 flush_epd_write_bio(&epd);
2565 return ret; 2516 return ret;
2566} 2517}
@@ -2584,7 +2535,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2584 }; 2535 };
2585 struct writeback_control wbc_writepages = { 2536 struct writeback_control wbc_writepages = {
2586 .sync_mode = mode, 2537 .sync_mode = mode,
2587 .older_than_this = NULL,
2588 .nr_to_write = nr_pages * 2, 2538 .nr_to_write = nr_pages * 2,
2589 .range_start = start, 2539 .range_start = start,
2590 .range_end = end + 1, 2540 .range_end = end + 1,
@@ -3022,8 +2972,15 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3022 return NULL; 2972 return NULL;
3023 eb->start = start; 2973 eb->start = start;
3024 eb->len = len; 2974 eb->len = len;
3025 spin_lock_init(&eb->lock); 2975 rwlock_init(&eb->lock);
3026 init_waitqueue_head(&eb->lock_wq); 2976 atomic_set(&eb->write_locks, 0);
2977 atomic_set(&eb->read_locks, 0);
2978 atomic_set(&eb->blocking_readers, 0);
2979 atomic_set(&eb->blocking_writers, 0);
2980 atomic_set(&eb->spinning_readers, 0);
2981 atomic_set(&eb->spinning_writers, 0);
2982 init_waitqueue_head(&eb->write_lock_wq);
2983 init_waitqueue_head(&eb->read_lock_wq);
3027 2984
3028#if LEAK_DEBUG 2985#if LEAK_DEBUG
3029 spin_lock_irqsave(&leak_lock, flags); 2986 spin_lock_irqsave(&leak_lock, flags);
@@ -3119,7 +3076,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3119 i = 0; 3076 i = 0;
3120 } 3077 }
3121 for (; i < num_pages; i++, index++) { 3078 for (; i < num_pages; i++, index++) {
3122 p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM); 3079 p = find_or_create_page(mapping, index, GFP_NOFS);
3123 if (!p) { 3080 if (!p) {
3124 WARN_ON(1); 3081 WARN_ON(1);
3125 goto free_eb; 3082 goto free_eb;
@@ -3266,6 +3223,22 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
3266 return was_dirty; 3223 return was_dirty;
3267} 3224}
3268 3225
3226static int __eb_straddles_pages(u64 start, u64 len)
3227{
3228 if (len < PAGE_CACHE_SIZE)
3229 return 1;
3230 if (start & (PAGE_CACHE_SIZE - 1))
3231 return 1;
3232 if ((start + len) & (PAGE_CACHE_SIZE - 1))
3233 return 1;
3234 return 0;
3235}
3236
3237static int eb_straddles_pages(struct extent_buffer *eb)
3238{
3239 return __eb_straddles_pages(eb->start, eb->len);
3240}
3241
3269int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3242int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3270 struct extent_buffer *eb, 3243 struct extent_buffer *eb,
3271 struct extent_state **cached_state) 3244 struct extent_state **cached_state)
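
Note: __eb_straddles_pages() appears to answer one question: can this buffer share a page with a neighbour? Only then does the per-range EXTENT_UPTODATE state in the io tree carry information beyond the per-page Uptodate flag, so the aligned case can skip the tree entirely. With 4k pages:

	__eb_straddles_pages(16 * 1024, 16 * 1024);	/* 0: page aligned,
							   page flags suffice */
	__eb_straddles_pages(14 * 1024, 4 * 1024);	/* 1: shares pages,
							   track byte ranges */
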
@@ -3277,8 +3250,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3277 num_pages = num_extent_pages(eb->start, eb->len); 3250 num_pages = num_extent_pages(eb->start, eb->len);
3278 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3251 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3279 3252
3280 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3253 if (eb_straddles_pages(eb)) {
3281 cached_state, GFP_NOFS); 3254 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3255 cached_state, GFP_NOFS);
3256 }
3282 for (i = 0; i < num_pages; i++) { 3257 for (i = 0; i < num_pages; i++) {
3283 page = extent_buffer_page(eb, i); 3258 page = extent_buffer_page(eb, i);
3284 if (page) 3259 if (page)
@@ -3296,8 +3271,10 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3296 3271
3297 num_pages = num_extent_pages(eb->start, eb->len); 3272 num_pages = num_extent_pages(eb->start, eb->len);
3298 3273
3299 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3274 if (eb_straddles_pages(eb)) {
3300 NULL, GFP_NOFS); 3275 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3276 NULL, GFP_NOFS);
3277 }
3301 for (i = 0; i < num_pages; i++) { 3278 for (i = 0; i < num_pages; i++) {
3302 page = extent_buffer_page(eb, i); 3279 page = extent_buffer_page(eb, i);
3303 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3280 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3320,9 +3297,12 @@ int extent_range_uptodate(struct extent_io_tree *tree,
3320 int uptodate; 3297 int uptodate;
3321 unsigned long index; 3298 unsigned long index;
3322 3299
3323 ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL); 3300 if (__eb_straddles_pages(start, end - start + 1)) {
3324 if (ret) 3301 ret = test_range_bit(tree, start, end,
3325 return 1; 3302 EXTENT_UPTODATE, 1, NULL);
3303 if (ret)
3304 return 1;
3305 }
3326 while (start <= end) { 3306 while (start <= end) {
3327 index = start >> PAGE_CACHE_SHIFT; 3307 index = start >> PAGE_CACHE_SHIFT;
3328 page = find_get_page(tree->mapping, index); 3308 page = find_get_page(tree->mapping, index);
@@ -3350,10 +3330,12 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3350 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3330 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3351 return 1; 3331 return 1;
3352 3332
3353 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3333 if (eb_straddles_pages(eb)) {
3354 EXTENT_UPTODATE, 1, cached_state); 3334 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3355 if (ret) 3335 EXTENT_UPTODATE, 1, cached_state);
3356 return ret; 3336 if (ret)
3337 return ret;
3338 }
3357 3339
3358 num_pages = num_extent_pages(eb->start, eb->len); 3340 num_pages = num_extent_pages(eb->start, eb->len);
3359 for (i = 0; i < num_pages; i++) { 3341 for (i = 0; i < num_pages; i++) {
@@ -3386,9 +3368,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3386 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) 3368 if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3387 return 0; 3369 return 0;
3388 3370
3389 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3371 if (eb_straddles_pages(eb)) {
3390 EXTENT_UPTODATE, 1, NULL)) { 3372 if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3391 return 0; 3373 EXTENT_UPTODATE, 1, NULL)) {
3374 return 0;
3375 }
3392 } 3376 }
3393 3377
3394 if (start) { 3378 if (start) {
@@ -3492,9 +3476,8 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3492 page = extent_buffer_page(eb, i); 3476 page = extent_buffer_page(eb, i);
3493 3477
3494 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3478 cur = min(len, (PAGE_CACHE_SIZE - offset));
3495 kaddr = kmap_atomic(page, KM_USER1); 3479 kaddr = page_address(page);
3496 memcpy(dst, kaddr + offset, cur); 3480 memcpy(dst, kaddr + offset, cur);
3497 kunmap_atomic(kaddr, KM_USER1);
3498 3481
3499 dst += cur; 3482 dst += cur;
3500 len -= cur; 3483 len -= cur;
@@ -3504,9 +3487,9 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3504} 3487}
3505 3488
3506int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, 3489int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3507 unsigned long min_len, char **token, char **map, 3490 unsigned long min_len, char **map,
3508 unsigned long *map_start, 3491 unsigned long *map_start,
3509 unsigned long *map_len, int km) 3492 unsigned long *map_len)
3510{ 3493{
3511 size_t offset = start & (PAGE_CACHE_SIZE - 1); 3494 size_t offset = start & (PAGE_CACHE_SIZE - 1);
3512 char *kaddr; 3495 char *kaddr;
@@ -3536,42 +3519,12 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3536 } 3519 }
3537 3520
3538 p = extent_buffer_page(eb, i); 3521 p = extent_buffer_page(eb, i);
3539 kaddr = kmap_atomic(p, km); 3522 kaddr = page_address(p);
3540 *token = kaddr;
3541 *map = kaddr + offset; 3523 *map = kaddr + offset;
3542 *map_len = PAGE_CACHE_SIZE - offset; 3524 *map_len = PAGE_CACHE_SIZE - offset;
3543 return 0; 3525 return 0;
3544} 3526}
3545 3527
3546int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3547 unsigned long min_len,
3548 char **token, char **map,
3549 unsigned long *map_start,
3550 unsigned long *map_len, int km)
3551{
3552 int err;
3553 int save = 0;
3554 if (eb->map_token) {
3555 unmap_extent_buffer(eb, eb->map_token, km);
3556 eb->map_token = NULL;
3557 save = 1;
3558 }
3559 err = map_private_extent_buffer(eb, start, min_len, token, map,
3560 map_start, map_len, km);
3561 if (!err && save) {
3562 eb->map_token = *token;
3563 eb->kaddr = *map;
3564 eb->map_start = *map_start;
3565 eb->map_len = *map_len;
3566 }
3567 return err;
3568}
3569
3570void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3571{
3572 kunmap_atomic(token, km);
3573}
3574
3575int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, 3528int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3576 unsigned long start, 3529 unsigned long start,
3577 unsigned long len) 3530 unsigned long len)
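
Note: all of the kmap_atomic/KM_USER* bookkeeping, including the map_extent_buffer()/unmap_extent_buffer() token cache removed above, becomes unnecessary once extent buffer pages are allocated without __GFP_HIGHMEM (see the find_or_create_page() change earlier in this file): lowmem pages sit in the kernel's permanent linear mapping, so page_address() is valid at any time.

	/* before: transient atomic mapping, one KM slot per nesting level */
	kaddr = kmap_atomic(page, KM_USER0);
	memcpy(dst, kaddr + offset, cur);
	kunmap_atomic(kaddr, KM_USER0);

	/* after: lowmem page, the mapping is permanent */
	kaddr = page_address(page);
	memcpy(dst, kaddr + offset, cur);
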
@@ -3595,9 +3548,8 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3595 3548
3596 cur = min(len, (PAGE_CACHE_SIZE - offset)); 3549 cur = min(len, (PAGE_CACHE_SIZE - offset));
3597 3550
3598 kaddr = kmap_atomic(page, KM_USER0); 3551 kaddr = page_address(page);
3599 ret = memcmp(ptr, kaddr + offset, cur); 3552 ret = memcmp(ptr, kaddr + offset, cur);
3600 kunmap_atomic(kaddr, KM_USER0);
3601 if (ret) 3553 if (ret)
3602 break; 3554 break;
3603 3555
@@ -3630,9 +3582,8 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3630 WARN_ON(!PageUptodate(page)); 3582 WARN_ON(!PageUptodate(page));
3631 3583
3632 cur = min(len, PAGE_CACHE_SIZE - offset); 3584 cur = min(len, PAGE_CACHE_SIZE - offset);
3633 kaddr = kmap_atomic(page, KM_USER1); 3585 kaddr = page_address(page);
3634 memcpy(kaddr + offset, src, cur); 3586 memcpy(kaddr + offset, src, cur);
3635 kunmap_atomic(kaddr, KM_USER1);
3636 3587
3637 src += cur; 3588 src += cur;
3638 len -= cur; 3589 len -= cur;
@@ -3661,9 +3612,8 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
3661 WARN_ON(!PageUptodate(page)); 3612 WARN_ON(!PageUptodate(page));
3662 3613
3663 cur = min(len, PAGE_CACHE_SIZE - offset); 3614 cur = min(len, PAGE_CACHE_SIZE - offset);
3664 kaddr = kmap_atomic(page, KM_USER0); 3615 kaddr = page_address(page);
3665 memset(kaddr + offset, c, cur); 3616 memset(kaddr + offset, c, cur);
3666 kunmap_atomic(kaddr, KM_USER0);
3667 3617
3668 len -= cur; 3618 len -= cur;
3669 offset = 0; 3619 offset = 0;
@@ -3694,9 +3644,8 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3694 3644
3695 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); 3645 cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3696 3646
3697 kaddr = kmap_atomic(page, KM_USER0); 3647 kaddr = page_address(page);
3698 read_extent_buffer(src, kaddr + offset, src_offset, cur); 3648 read_extent_buffer(src, kaddr + offset, src_offset, cur);
3699 kunmap_atomic(kaddr, KM_USER0);
3700 3649
3701 src_offset += cur; 3650 src_offset += cur;
3702 len -= cur; 3651 len -= cur;
@@ -3709,20 +3658,17 @@ static void move_pages(struct page *dst_page, struct page *src_page,
3709 unsigned long dst_off, unsigned long src_off, 3658 unsigned long dst_off, unsigned long src_off,
3710 unsigned long len) 3659 unsigned long len)
3711{ 3660{
3712 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3661 char *dst_kaddr = page_address(dst_page);
3713 if (dst_page == src_page) { 3662 if (dst_page == src_page) {
3714 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); 3663 memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3715 } else { 3664 } else {
3716 char *src_kaddr = kmap_atomic(src_page, KM_USER1); 3665 char *src_kaddr = page_address(src_page);
3717 char *p = dst_kaddr + dst_off + len; 3666 char *p = dst_kaddr + dst_off + len;
3718 char *s = src_kaddr + src_off + len; 3667 char *s = src_kaddr + src_off + len;
3719 3668
3720 while (len--) 3669 while (len--)
3721 *--p = *--s; 3670 *--p = *--s;
3722
3723 kunmap_atomic(src_kaddr, KM_USER1);
3724 } 3671 }
3725 kunmap_atomic(dst_kaddr, KM_USER0);
3726} 3672}
3727 3673
3728static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) 3674static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
@@ -3735,20 +3681,17 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
3735 unsigned long dst_off, unsigned long src_off, 3681 unsigned long dst_off, unsigned long src_off,
3736 unsigned long len) 3682 unsigned long len)
3737{ 3683{
3738 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3684 char *dst_kaddr = page_address(dst_page);
3739 char *src_kaddr; 3685 char *src_kaddr;
3740 3686
3741 if (dst_page != src_page) { 3687 if (dst_page != src_page) {
3742 src_kaddr = kmap_atomic(src_page, KM_USER1); 3688 src_kaddr = page_address(src_page);
3743 } else { 3689 } else {
3744 src_kaddr = dst_kaddr; 3690 src_kaddr = dst_kaddr;
3745 BUG_ON(areas_overlap(src_off, dst_off, len)); 3691 BUG_ON(areas_overlap(src_off, dst_off, len));
3746 } 3692 }
3747 3693
3748 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3694 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3749 kunmap_atomic(dst_kaddr, KM_USER0);
3750 if (dst_page != src_page)
3751 kunmap_atomic(src_kaddr, KM_USER1);
3752} 3695}
3753 3696
3754void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, 3697void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a11a92ee2d3..7b2f0c3e792 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -76,15 +76,15 @@ struct extent_io_ops {
76 struct extent_state *state); 76 struct extent_state *state);
77 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 77 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
78 struct extent_state *state, int uptodate); 78 struct extent_state *state, int uptodate);
79 int (*set_bit_hook)(struct inode *inode, struct extent_state *state, 79 void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
80 int *bits); 80 int *bits);
81 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 81 void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
82 int *bits); 82 int *bits);
83 int (*merge_extent_hook)(struct inode *inode, 83 void (*merge_extent_hook)(struct inode *inode,
84 struct extent_state *new, 84 struct extent_state *new,
85 struct extent_state *other); 85 struct extent_state *other);
86 int (*split_extent_hook)(struct inode *inode, 86 void (*split_extent_hook)(struct inode *inode,
87 struct extent_state *orig, u64 split); 87 struct extent_state *orig, u64 split);
88 int (*write_cache_pages_lock_hook)(struct page *page); 88 int (*write_cache_pages_lock_hook)(struct page *page);
89}; 89};
90 90
@@ -108,8 +108,6 @@ struct extent_state {
108 wait_queue_head_t wq; 108 wait_queue_head_t wq;
109 atomic_t refs; 109 atomic_t refs;
110 unsigned long state; 110 unsigned long state;
111 u64 split_start;
112 u64 split_end;
113 111
114 /* for use by the FS */ 112 /* for use by the FS */
115 u64 private; 113 u64 private;
@@ -120,8 +118,6 @@ struct extent_state {
120struct extent_buffer { 118struct extent_buffer {
121 u64 start; 119 u64 start;
122 unsigned long len; 120 unsigned long len;
123 char *map_token;
124 char *kaddr;
125 unsigned long map_start; 121 unsigned long map_start;
126 unsigned long map_len; 122 unsigned long map_len;
127 struct page *first_page; 123 struct page *first_page;
@@ -130,14 +126,26 @@ struct extent_buffer {
130 struct rcu_head rcu_head; 126 struct rcu_head rcu_head;
131 atomic_t refs; 127 atomic_t refs;
132 128
133 /* the spinlock is used to protect most operations */ 129 /* count of read lock holders on the extent buffer */
134 spinlock_t lock; 130 atomic_t write_locks;
131 atomic_t read_locks;
132 atomic_t blocking_writers;
133 atomic_t blocking_readers;
134 atomic_t spinning_readers;
135 atomic_t spinning_writers;
136
137 /* protects write locks */
138 rwlock_t lock;
135 139
136 /* 140 /* readers use lock_wq while they wait for the write
137 * when we keep the lock held while blocking, waiters go onto 141 * lock holders to unlock
138 * the wq
139 */ 142 */
140 wait_queue_head_t lock_wq; 143 wait_queue_head_t write_lock_wq;
144
145 /* writers use read_lock_wq while they wait for readers
146 * to unlock
147 */
148 wait_queue_head_t read_lock_wq;
141}; 149};
142 150
143static inline void extent_set_compress_type(unsigned long *bio_flags, 151static inline void extent_set_compress_type(unsigned long *bio_flags,
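
Note: the new counters implement a reader/writer tree lock whose holders can drop from spinning to blocking mode. A sketch of how a read lock is presumably taken with these fields, modeled on the accompanying locking.c changes rather than copied from them:

	void eb_read_lock_sketch(struct extent_buffer *eb)
	{
	again:
		wait_event(eb->write_lock_wq,
			   atomic_read(&eb->blocking_writers) == 0);
		read_lock(&eb->lock);
		if (atomic_read(&eb->blocking_writers)) {
			/* a writer went blocking after our check; retry */
			read_unlock(&eb->lock);
			goto again;
		}
		atomic_inc(&eb->read_locks);
		atomic_inc(&eb->spinning_readers);
	}
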
@@ -279,15 +287,10 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
279int extent_buffer_uptodate(struct extent_io_tree *tree, 287int extent_buffer_uptodate(struct extent_io_tree *tree,
280 struct extent_buffer *eb, 288 struct extent_buffer *eb,
281 struct extent_state *cached_state); 289 struct extent_state *cached_state);
282int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
283 unsigned long min_len, char **token, char **map,
284 unsigned long *map_start,
285 unsigned long *map_len, int km);
286int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, 290int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
287 unsigned long min_len, char **token, char **map, 291 unsigned long min_len, char **map,
288 unsigned long *map_start, 292 unsigned long *map_start,
289 unsigned long *map_len, int km); 293 unsigned long *map_len);
290void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
291int extent_range_uptodate(struct extent_io_tree *tree, 294int extent_range_uptodate(struct extent_io_tree *tree,
292 u64 start, u64 end); 295 u64 start, u64 end);
293int extent_clear_unlock_delalloc(struct inode *inode, 296int extent_clear_unlock_delalloc(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 2d0410344ea..7c97b330145 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -183,22 +183,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
183 return 0; 183 return 0;
184} 184}
185 185
186int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len) 186static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
187{ 187{
188 int ret = 0;
189 struct extent_map *merge = NULL; 188 struct extent_map *merge = NULL;
190 struct rb_node *rb; 189 struct rb_node *rb;
191 struct extent_map *em;
192
193 write_lock(&tree->lock);
194 em = lookup_extent_mapping(tree, start, len);
195
196 WARN_ON(!em || em->start != start);
197
198 if (!em)
199 goto out;
200
201 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
202 190
203 if (em->start != 0) { 191 if (em->start != 0) {
204 rb = rb_prev(&em->rb_node); 192 rb = rb_prev(&em->rb_node);
@@ -225,6 +213,24 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
225 merge->in_tree = 0; 213 merge->in_tree = 0;
226 free_extent_map(merge); 214 free_extent_map(merge);
227 } 215 }
216}
217
218int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
219{
220 int ret = 0;
221 struct extent_map *em;
222
223 write_lock(&tree->lock);
224 em = lookup_extent_mapping(tree, start, len);
225
226 WARN_ON(!em || em->start != start);
227
228 if (!em)
229 goto out;
230
231 clear_bit(EXTENT_FLAG_PINNED, &em->flags);
232
233 try_merge_map(tree, em);
228 234
229 free_extent_map(em); 235 free_extent_map(em);
230out: 236out:
@@ -247,7 +253,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
247 struct extent_map *em) 253 struct extent_map *em)
248{ 254{
249 int ret = 0; 255 int ret = 0;
250 struct extent_map *merge = NULL;
251 struct rb_node *rb; 256 struct rb_node *rb;
252 struct extent_map *exist; 257 struct extent_map *exist;
253 258
@@ -263,30 +268,8 @@ int add_extent_mapping(struct extent_map_tree *tree,
263 goto out; 268 goto out;
264 } 269 }
265 atomic_inc(&em->refs); 270 atomic_inc(&em->refs);
266 if (em->start != 0) { 271
267 rb = rb_prev(&em->rb_node); 272 try_merge_map(tree, em);
268 if (rb)
269 merge = rb_entry(rb, struct extent_map, rb_node);
270 if (rb && mergable_maps(merge, em)) {
271 em->start = merge->start;
272 em->len += merge->len;
273 em->block_len += merge->block_len;
274 em->block_start = merge->block_start;
275 merge->in_tree = 0;
276 rb_erase(&merge->rb_node, &tree->map);
277 free_extent_map(merge);
278 }
279 }
280 rb = rb_next(&em->rb_node);
281 if (rb)
282 merge = rb_entry(rb, struct extent_map, rb_node);
283 if (rb && mergable_maps(em, merge)) {
284 em->len += merge->len;
285 em->block_len += merge->len;
286 rb_erase(&merge->rb_node, &tree->map);
287 merge->in_tree = 0;
288 free_extent_map(merge);
289 }
290out: 273out:
291 return ret; 274 return ret;
292} 275}
@@ -299,19 +282,8 @@ static u64 range_end(u64 start, u64 len)
299 return start + len; 282 return start + len;
300} 283}
301 284
302/** 285struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree,
303 * lookup_extent_mapping - lookup extent_map 286 u64 start, u64 len, int strict)
304 * @tree: tree to lookup in
305 * @start: byte offset to start the search
306 * @len: length of the lookup range
307 *
308 * Find and return the first extent_map struct in @tree that intersects the
309 * [start, len] range. There may be additional objects in the tree that
310 * intersect, so check the object returned carefully to make sure that no
311 * additional lookups are needed.
312 */
313struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
314 u64 start, u64 len)
315{ 287{
316 struct extent_map *em; 288 struct extent_map *em;
317 struct rb_node *rb_node; 289 struct rb_node *rb_node;
@@ -320,38 +292,42 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
320 u64 end = range_end(start, len); 292 u64 end = range_end(start, len);
321 293
322 rb_node = __tree_search(&tree->map, start, &prev, &next); 294 rb_node = __tree_search(&tree->map, start, &prev, &next);
323 if (!rb_node && prev) {
324 em = rb_entry(prev, struct extent_map, rb_node);
325 if (end > em->start && start < extent_map_end(em))
326 goto found;
327 }
328 if (!rb_node && next) {
329 em = rb_entry(next, struct extent_map, rb_node);
330 if (end > em->start && start < extent_map_end(em))
331 goto found;
332 }
333 if (!rb_node) { 295 if (!rb_node) {
334 em = NULL; 296 if (prev)
335 goto out; 297 rb_node = prev;
336 } 298 else if (next)
337 if (IS_ERR(rb_node)) { 299 rb_node = next;
338 em = ERR_CAST(rb_node); 300 else
339 goto out; 301 return NULL;
340 } 302 }
303
341 em = rb_entry(rb_node, struct extent_map, rb_node); 304 em = rb_entry(rb_node, struct extent_map, rb_node);
342 if (end > em->start && start < extent_map_end(em))
343 goto found;
344 305
345 em = NULL; 306 if (strict && !(end > em->start && start < extent_map_end(em)))
346 goto out; 307 return NULL;
347 308
348found:
349 atomic_inc(&em->refs); 309 atomic_inc(&em->refs);
350out:
351 return em; 310 return em;
352} 311}
353 312
354/** 313/**
314 * lookup_extent_mapping - lookup extent_map
315 * @tree: tree to lookup in
316 * @start: byte offset to start the search
317 * @len: length of the lookup range
318 *
319 * Find and return the first extent_map struct in @tree that intersects the
320 * [start, len] range. There may be additional objects in the tree that
321 * intersect, so check the object returned carefully to make sure that no
322 * additional lookups are needed.
323 */
324struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
325 u64 start, u64 len)
326{
327 return __lookup_extent_mapping(tree, start, len, 1);
328}
329
330/**
355 * search_extent_mapping - find a nearby extent map 331 * search_extent_mapping - find a nearby extent map
356 * @tree: tree to lookup in 332 * @tree: tree to lookup in
357 * @start: byte offset to start the search 333 * @start: byte offset to start the search
@@ -365,38 +341,7 @@ out:
365struct extent_map *search_extent_mapping(struct extent_map_tree *tree, 341struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
366 u64 start, u64 len) 342 u64 start, u64 len)
367{ 343{
368 struct extent_map *em; 344 return __lookup_extent_mapping(tree, start, len, 0);
369 struct rb_node *rb_node;
370 struct rb_node *prev = NULL;
371 struct rb_node *next = NULL;
372
373 rb_node = __tree_search(&tree->map, start, &prev, &next);
374 if (!rb_node && prev) {
375 em = rb_entry(prev, struct extent_map, rb_node);
376 goto found;
377 }
378 if (!rb_node && next) {
379 em = rb_entry(next, struct extent_map, rb_node);
380 goto found;
381 }
382 if (!rb_node) {
383 em = NULL;
384 goto out;
385 }
386 if (IS_ERR(rb_node)) {
387 em = ERR_CAST(rb_node);
388 goto out;
389 }
390 em = rb_entry(rb_node, struct extent_map, rb_node);
391 goto found;
392
393 em = NULL;
394 goto out;
395
396found:
397 atomic_inc(&em->refs);
398out:
399 return em;
400} 345}
401 346
402/** 347/**
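
Note: lookup_extent_mapping() and search_extent_mapping() now share one body and differ only in the strict flag. Illustrative use (not from the patch):

	/* strict: NULL unless a map overlaps [start, start + len) */
	em = lookup_extent_mapping(tree, start, len);

	/* relaxed: may return the nearest neighbour when nothing overlaps,
	 * handy when probing for where a new mapping would go */
	em = search_extent_mapping(tree, start, len);
	if (em)
		free_extent_map(em);	/* both paths take a reference */
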
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 90d4ee52cd4..a1cb7821bec 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -177,6 +177,17 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
177 177
178 WARN_ON(bio->bi_vcnt <= 0); 178 WARN_ON(bio->bi_vcnt <= 0);
179 179
180 /*
181 * the free space stuff is only read when it hasn't been
182 * updated in the current transaction. So, we can safely
183 * read from the commit root and sidestep a nasty deadlock
184 * between reading the free space cache and updating the csum tree.
185 */
186 if (btrfs_is_free_space_inode(root, inode)) {
187 path->search_commit_root = 1;
188 path->skip_locking = 1;
189 }
190
180 disk_bytenr = (u64)bio->bi_sector << 9; 191 disk_bytenr = (u64)bio->bi_sector << 9;
181 if (dio) 192 if (dio)
182 offset = logical_offset; 193 offset = logical_offset;
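
Note: the comment carries the reasoning. Loading the free space cache itself triggers csum lookups, so doing those against the live csum tree while that tree is being updated can deadlock. Reading from the commit root is safe because a committed root is immutable until the next transaction commits, which is also why locking can be skipped:

	path->search_commit_root = 1;	/* read the last committed tree */
	path->skip_locking = 1;		/* immutable, so no tree locks */
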
@@ -282,7 +293,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
282 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 293 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
283 294
284 path = btrfs_alloc_path(); 295 path = btrfs_alloc_path();
285 BUG_ON(!path); 296 if (!path)
297 return -ENOMEM;
286 298
287 if (search_commit) { 299 if (search_commit) {
288 path->skip_locking = 1; 300 path->skip_locking = 1;
@@ -664,15 +676,13 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
664 struct btrfs_sector_sum *sector_sum; 676 struct btrfs_sector_sum *sector_sum;
665 u32 nritems; 677 u32 nritems;
666 u32 ins_size; 678 u32 ins_size;
667 char *eb_map;
668 char *eb_token;
669 unsigned long map_len;
670 unsigned long map_start;
671 u16 csum_size = 679 u16 csum_size =
672 btrfs_super_csum_size(&root->fs_info->super_copy); 680 btrfs_super_csum_size(&root->fs_info->super_copy);
673 681
674 path = btrfs_alloc_path(); 682 path = btrfs_alloc_path();
675 BUG_ON(!path); 683 if (!path)
684 return -ENOMEM;
685
676 sector_sum = sums->sums; 686 sector_sum = sums->sums;
677again: 687again:
678 next_offset = (u64)-1; 688 next_offset = (u64)-1;
@@ -814,30 +824,9 @@ found:
814 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); 824 item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
815 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + 825 item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
816 btrfs_item_size_nr(leaf, path->slots[0])); 826 btrfs_item_size_nr(leaf, path->slots[0]));
817 eb_token = NULL;
818next_sector: 827next_sector:
819 828
820 if (!eb_token || 829 write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size);
821 (unsigned long)item + csum_size >= map_start + map_len) {
822 int err;
823
824 if (eb_token)
825 unmap_extent_buffer(leaf, eb_token, KM_USER1);
826 eb_token = NULL;
827 err = map_private_extent_buffer(leaf, (unsigned long)item,
828 csum_size,
829 &eb_token, &eb_map,
830 &map_start, &map_len, KM_USER1);
831 if (err)
832 eb_token = NULL;
833 }
834 if (eb_token) {
835 memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
836 &sector_sum->sum, csum_size);
837 } else {
838 write_extent_buffer(leaf, &sector_sum->sum,
839 (unsigned long)item, csum_size);
840 }
841 830
842 total_bytes += root->sectorsize; 831 total_bytes += root->sectorsize;
843 sector_sum++; 832 sector_sum++;
@@ -850,10 +839,7 @@ next_sector:
850 goto next_sector; 839 goto next_sector;
851 } 840 }
852 } 841 }
853 if (eb_token) { 842
854 unmap_extent_buffer(leaf, eb_token, KM_USER1);
855 eb_token = NULL;
856 }
857 btrfs_mark_buffer_dirty(path->nodes[0]); 843 btrfs_mark_buffer_dirty(path->nodes[0]);
858 if (total_bytes < sums->len) { 844 if (total_bytes < sums->len) {
859 btrfs_release_path(path); 845 btrfs_release_path(path);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index fa4ef18b66b..e4e57d59edb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -74,7 +74,7 @@ struct inode_defrag {
74 * If an existing record is found the defrag item you 74 * If an existing record is found the defrag item you
75 * pass in is freed 75 * pass in is freed
76 */ 76 */
77static int __btrfs_add_inode_defrag(struct inode *inode, 77static void __btrfs_add_inode_defrag(struct inode *inode,
78 struct inode_defrag *defrag) 78 struct inode_defrag *defrag)
79{ 79{
80 struct btrfs_root *root = BTRFS_I(inode)->root; 80 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -106,11 +106,11 @@ static int __btrfs_add_inode_defrag(struct inode *inode,
106 BTRFS_I(inode)->in_defrag = 1; 106 BTRFS_I(inode)->in_defrag = 1;
107 rb_link_node(&defrag->rb_node, parent, p); 107 rb_link_node(&defrag->rb_node, parent, p);
108 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes); 108 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
109 return 0; 109 return;
110 110
111exists: 111exists:
112 kfree(defrag); 112 kfree(defrag);
113 return 0; 113 return;
114 114
115} 115}
116 116
@@ -123,7 +123,6 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
123{ 123{
124 struct btrfs_root *root = BTRFS_I(inode)->root; 124 struct btrfs_root *root = BTRFS_I(inode)->root;
125 struct inode_defrag *defrag; 125 struct inode_defrag *defrag;
126 int ret = 0;
127 u64 transid; 126 u64 transid;
128 127
129 if (!btrfs_test_opt(root, AUTO_DEFRAG)) 128 if (!btrfs_test_opt(root, AUTO_DEFRAG))
@@ -150,9 +149,11 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
150 149
151 spin_lock(&root->fs_info->defrag_inodes_lock); 150 spin_lock(&root->fs_info->defrag_inodes_lock);
152 if (!BTRFS_I(inode)->in_defrag) 151 if (!BTRFS_I(inode)->in_defrag)
153 ret = __btrfs_add_inode_defrag(inode, defrag); 152 __btrfs_add_inode_defrag(inode, defrag);
153 else
154 kfree(defrag);
154 spin_unlock(&root->fs_info->defrag_inodes_lock); 155 spin_unlock(&root->fs_info->defrag_inodes_lock);
155 return ret; 156 return 0;
156} 157}
157 158
158/* 159/*
@@ -855,7 +856,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
855 btrfs_drop_extent_cache(inode, start, end - 1, 0); 856 btrfs_drop_extent_cache(inode, start, end - 1, 0);
856 857
857 path = btrfs_alloc_path(); 858 path = btrfs_alloc_path();
858 BUG_ON(!path); 859 if (!path)
860 return -ENOMEM;
859again: 861again:
860 recow = 0; 862 recow = 0;
861 split = start; 863 split = start;
@@ -1034,11 +1036,13 @@ out:
1034 * on error we return an unlocked page and the error value 1036 * on error we return an unlocked page and the error value
1035 * on success we return a locked page and 0 1037 * on success we return a locked page and 0
1036 */ 1038 */
1037static int prepare_uptodate_page(struct page *page, u64 pos) 1039static int prepare_uptodate_page(struct page *page, u64 pos,
1040 bool force_uptodate)
1038{ 1041{
1039 int ret = 0; 1042 int ret = 0;
1040 1043
1041 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { 1044 if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
1045 !PageUptodate(page)) {
1042 ret = btrfs_readpage(NULL, page); 1046 ret = btrfs_readpage(NULL, page);
1043 if (ret) 1047 if (ret)
1044 return ret; 1048 return ret;
@@ -1059,7 +1063,7 @@ static int prepare_uptodate_page(struct page *page, u64 pos)
1059static noinline int prepare_pages(struct btrfs_root *root, struct file *file, 1063static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1060 struct page **pages, size_t num_pages, 1064 struct page **pages, size_t num_pages,
1061 loff_t pos, unsigned long first_index, 1065 loff_t pos, unsigned long first_index,
1062 unsigned long last_index, size_t write_bytes) 1066 size_t write_bytes, bool force_uptodate)
1063{ 1067{
1064 struct extent_state *cached_state = NULL; 1068 struct extent_state *cached_state = NULL;
1065 int i; 1069 int i;
@@ -1073,15 +1077,10 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1073 start_pos = pos & ~((u64)root->sectorsize - 1); 1077 start_pos = pos & ~((u64)root->sectorsize - 1);
1074 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; 1078 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
1075 1079
1076 if (start_pos > inode->i_size) {
1077 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
1078 if (err)
1079 return err;
1080 }
1081
1082again: 1080again:
1083 for (i = 0; i < num_pages; i++) { 1081 for (i = 0; i < num_pages; i++) {
1084 pages[i] = grab_cache_page(inode->i_mapping, index + i); 1082 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1083 GFP_NOFS);
1085 if (!pages[i]) { 1084 if (!pages[i]) {
1086 faili = i - 1; 1085 faili = i - 1;
1087 err = -ENOMEM; 1086 err = -ENOMEM;
@@ -1089,10 +1088,11 @@ again:
1089 } 1088 }
1090 1089
1091 if (i == 0) 1090 if (i == 0)
1092 err = prepare_uptodate_page(pages[i], pos); 1091 err = prepare_uptodate_page(pages[i], pos,
1092 force_uptodate);
1093 if (i == num_pages - 1) 1093 if (i == num_pages - 1)
1094 err = prepare_uptodate_page(pages[i], 1094 err = prepare_uptodate_page(pages[i],
1095 pos + write_bytes); 1095 pos + write_bytes, false);
1096 if (err) { 1096 if (err) {
1097 page_cache_release(pages[i]); 1097 page_cache_release(pages[i]);
1098 faili = i - 1; 1098 faili = i - 1;
@@ -1158,10 +1158,10 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1158 struct btrfs_root *root = BTRFS_I(inode)->root; 1158 struct btrfs_root *root = BTRFS_I(inode)->root;
1159 struct page **pages = NULL; 1159 struct page **pages = NULL;
1160 unsigned long first_index; 1160 unsigned long first_index;
1161 unsigned long last_index;
1162 size_t num_written = 0; 1161 size_t num_written = 0;
1163 int nrptrs; 1162 int nrptrs;
1164 int ret = 0; 1163 int ret = 0;
1164 bool force_page_uptodate = false;
1165 1165
1166 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1166 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1167 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1167 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
@@ -1171,7 +1171,6 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1171 return -ENOMEM; 1171 return -ENOMEM;
1172 1172
1173 first_index = pos >> PAGE_CACHE_SHIFT; 1173 first_index = pos >> PAGE_CACHE_SHIFT;
1174 last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
1175 1174
1176 while (iov_iter_count(i) > 0) { 1175 while (iov_iter_count(i) > 0) {
1177 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1176 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
@@ -1205,8 +1204,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1205 * contents of pages from loop to loop 1204 * contents of pages from loop to loop
1206 */ 1205 */
1207 ret = prepare_pages(root, file, pages, num_pages, 1206 ret = prepare_pages(root, file, pages, num_pages,
1208 pos, first_index, last_index, 1207 pos, first_index, write_bytes,
1209 write_bytes); 1208 force_page_uptodate);
1210 if (ret) { 1209 if (ret) {
1211 btrfs_delalloc_release_space(inode, 1210 btrfs_delalloc_release_space(inode,
1212 num_pages << PAGE_CACHE_SHIFT); 1211 num_pages << PAGE_CACHE_SHIFT);
@@ -1223,12 +1222,15 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1223 if (copied < write_bytes) 1222 if (copied < write_bytes)
1224 nrptrs = 1; 1223 nrptrs = 1;
1225 1224
1226 if (copied == 0) 1225 if (copied == 0) {
1226 force_page_uptodate = true;
1227 dirty_pages = 0; 1227 dirty_pages = 0;
1228 else 1228 } else {
1229 force_page_uptodate = false;
1229 dirty_pages = (copied + offset + 1230 dirty_pages = (copied + offset +
1230 PAGE_CACHE_SIZE - 1) >> 1231 PAGE_CACHE_SIZE - 1) >>
1231 PAGE_CACHE_SHIFT; 1232 PAGE_CACHE_SHIFT;
1233 }
1232 1234
1233 /* 1235 /*
1234 * If we had a short copy we need to release the excess delalloc 1236 * If we had a short copy we need to release the excess delalloc
@@ -1238,9 +1240,11 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1238 * managed to copy. 1240 * managed to copy.
1239 */ 1241 */
1240 if (num_pages > dirty_pages) { 1242 if (num_pages > dirty_pages) {
1241 if (copied > 0) 1243 if (copied > 0) {
1242 atomic_inc( 1244 spin_lock(&BTRFS_I(inode)->lock);
1243 &BTRFS_I(inode)->outstanding_extents); 1245 BTRFS_I(inode)->outstanding_extents++;
1246 spin_unlock(&BTRFS_I(inode)->lock);
1247 }
1244 btrfs_delalloc_release_space(inode, 1248 btrfs_delalloc_release_space(inode,
1245 (num_pages - dirty_pages) << 1249 (num_pages - dirty_pages) <<
1246 PAGE_CACHE_SHIFT); 1250 PAGE_CACHE_SHIFT);
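
Note: force_page_uptodate is set where copied == 0 a few hunks up and consumed by prepare_pages(); presumably the point is forward progress. After a copy that moved nothing, the first destination page is read in full before the retry, so the next iteration never trips over a locked, partially written, not-uptodate page. Condensed:

	if (copied == 0) {
		force_page_uptodate = true;	/* re-read page before retry */
		dirty_pages = 0;
	} else {
		force_page_uptodate = false;
		dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >>
				PAGE_CACHE_SHIFT;
	}
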
@@ -1336,6 +1340,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1336 struct inode *inode = fdentry(file)->d_inode; 1340 struct inode *inode = fdentry(file)->d_inode;
1337 struct btrfs_root *root = BTRFS_I(inode)->root; 1341 struct btrfs_root *root = BTRFS_I(inode)->root;
1338 loff_t *ppos = &iocb->ki_pos; 1342 loff_t *ppos = &iocb->ki_pos;
1343 u64 start_pos;
1339 ssize_t num_written = 0; 1344 ssize_t num_written = 0;
1340 ssize_t err = 0; 1345 ssize_t err = 0;
1341 size_t count, ocount; 1346 size_t count, ocount;
@@ -1384,6 +1389,15 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
1384 file_update_time(file); 1389 file_update_time(file);
1385 BTRFS_I(inode)->sequence++; 1390 BTRFS_I(inode)->sequence++;
1386 1391
1392 start_pos = round_down(pos, root->sectorsize);
1393 if (start_pos > i_size_read(inode)) {
1394 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
1395 if (err) {
1396 mutex_unlock(&inode->i_mutex);
1397 goto out;
1398 }
1399 }
1400
1387 if (unlikely(file->f_flags & O_DIRECT)) { 1401 if (unlikely(file->f_flags & O_DIRECT)) {
1388 num_written = __btrfs_direct_write(iocb, iov, nr_segs, 1402 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1389 pos, ppos, count, ocount); 1403 pos, ppos, count, ocount);
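start_pos is the write position rounded down to a sector boundary; when it lands past the current i_size, btrfs_cont_expand() must first zero the gap from the old EOF. A userspace sketch of that alignment check, assuming a power-of-two sector size of 4096:

#include <stdint.h>
#include <stdio.h>

/* round_down() stand-in; only valid for power-of-two alignments. */
static uint64_t round_down_pow2(uint64_t x, uint64_t align)
{
	return x & ~(align - 1);
}

int main(void)
{
	uint64_t i_size = 10000;	/* current file size */
	uint64_t pos = 20000;		/* where the write begins */
	uint64_t start_pos = round_down_pow2(pos, 4096);

	if (start_pos > i_size)
		printf("zero the range [%llu, %llu)\n",
		       (unsigned long long)i_size,
		       (unsigned long long)start_pos);
	return 0;
}
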
@@ -1452,7 +1466,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1452 * important optimization for directories because holding the mutex prevents 1466 * important optimization for directories because holding the mutex prevents
1453 * new operations on the dir while we write to disk. 1467 * new operations on the dir while we write to disk.
1454 */ 1468 */
1455int btrfs_sync_file(struct file *file, int datasync) 1469int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1456{ 1470{
1457 struct dentry *dentry = file->f_path.dentry; 1471 struct dentry *dentry = file->f_path.dentry;
1458 struct inode *inode = dentry->d_inode; 1472 struct inode *inode = dentry->d_inode;
@@ -1462,9 +1476,13 @@ int btrfs_sync_file(struct file *file, int datasync)
1462 1476
1463 trace_btrfs_sync_file(file, datasync); 1477 trace_btrfs_sync_file(file, datasync);
1464 1478
1479 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
1480 if (ret)
1481 return ret;
1482 mutex_lock(&inode->i_mutex);
1483
1465 /* we wait first, since the writeback may change the inode */ 1484 /* we wait first, since the writeback may change the inode */
1466 root->log_batch++; 1485 root->log_batch++;
1467 /* the VFS called filemap_fdatawrite for us */
1468 btrfs_wait_ordered_range(inode, 0, (u64)-1); 1486 btrfs_wait_ordered_range(inode, 0, (u64)-1);
1469 root->log_batch++; 1487 root->log_batch++;
1470 1488
@@ -1472,8 +1490,10 @@ int btrfs_sync_file(struct file *file, int datasync)
1472 * check the transaction that last modified this inode 1490 * check the transaction that last modified this inode
1473 * and see if it's already been committed 1491 * and see if it's already been committed
1474 */ 1492 */
1475 if (!BTRFS_I(inode)->last_trans) 1493 if (!BTRFS_I(inode)->last_trans) {
1494 mutex_unlock(&inode->i_mutex);
1476 goto out; 1495 goto out;
1496 }
1477 1497
1478 /* 1498 /*
1479 * if the last transaction that changed this file was before 1499 * if the last transaction that changed this file was before
@@ -1484,6 +1504,7 @@ int btrfs_sync_file(struct file *file, int datasync)
1484 if (BTRFS_I(inode)->last_trans <= 1504 if (BTRFS_I(inode)->last_trans <=
1485 root->fs_info->last_trans_committed) { 1505 root->fs_info->last_trans_committed) {
1486 BTRFS_I(inode)->last_trans = 0; 1506 BTRFS_I(inode)->last_trans = 0;
1507 mutex_unlock(&inode->i_mutex);
1487 goto out; 1508 goto out;
1488 } 1509 }
1489 1510
@@ -1496,12 +1517,15 @@ int btrfs_sync_file(struct file *file, int datasync)
1496 trans = btrfs_start_transaction(root, 0); 1517 trans = btrfs_start_transaction(root, 0);
1497 if (IS_ERR(trans)) { 1518 if (IS_ERR(trans)) {
1498 ret = PTR_ERR(trans); 1519 ret = PTR_ERR(trans);
1520 mutex_unlock(&inode->i_mutex);
1499 goto out; 1521 goto out;
1500 } 1522 }
1501 1523
1502 ret = btrfs_log_dentry_safe(trans, root, dentry); 1524 ret = btrfs_log_dentry_safe(trans, root, dentry);
1503 if (ret < 0) 1525 if (ret < 0) {
1526 mutex_unlock(&inode->i_mutex);
1504 goto out; 1527 goto out;
1528 }
1505 1529
1506 /* we've logged all the items and now have a consistent 1530 /* we've logged all the items and now have a consistent
1507 * version of the file in the log. It is possible that 1531 * version of the file in the log. It is possible that
@@ -1513,7 +1537,7 @@ int btrfs_sync_file(struct file *file, int datasync)
1513 * file again, but that will end up using the synchronization 1537 * file again, but that will end up using the synchronization
1514 * inside btrfs_sync_log to keep things safe. 1538 * inside btrfs_sync_log to keep things safe.
1515 */ 1539 */
1516 mutex_unlock(&dentry->d_inode->i_mutex); 1540 mutex_unlock(&inode->i_mutex);
1517 1541
1518 if (ret != BTRFS_NO_LOG_SYNC) { 1542 if (ret != BTRFS_NO_LOG_SYNC) {
1519 if (ret > 0) { 1543 if (ret > 0) {
@@ -1528,7 +1552,6 @@ int btrfs_sync_file(struct file *file, int datasync)
1528 } else { 1552 } else {
1529 ret = btrfs_end_transaction(trans, root); 1553 ret = btrfs_end_transaction(trans, root);
1530 } 1554 }
1531 mutex_lock(&dentry->d_inode->i_mutex);
1532out: 1555out:
1533 return ret > 0 ? -EIO : ret; 1556 return ret > 0 ? -EIO : ret;
1534} 1557}
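With the ->fsync signature change, the caller no longer writes back the range or holds i_mutex, so btrfs_sync_file() now flushes first, takes the lock itself, and must drop it on every early return as well as before the slow log sync. A minimal pthread sketch of that discipline (not kernel code; the flag names are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;

/* have_work/log_ok stand in for the last_trans checks and the result
 * of logging the dentry in the real function. */
static int sync_file_sketch(int have_work, int log_ok)
{
	int ret = 0;

	/* the filemap_write_and_wait_range() equivalent runs unlocked */
	pthread_mutex_lock(&i_mutex);

	if (!have_work)
		goto out_unlock;	/* every early exit must unlock */

	if (!log_ok) {
		ret = -1;
		goto out_unlock;
	}

	/* drop the lock before the slow log sync, as the real code does */
	pthread_mutex_unlock(&i_mutex);
	printf("syncing log outside the lock\n");
	return ret;

out_unlock:
	pthread_mutex_unlock(&i_mutex);
	return ret;
}

int main(void)
{
	return sync_file_sketch(1, 1);
}
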
@@ -1629,11 +1652,15 @@ static long btrfs_fallocate(struct file *file, int mode,
1629 1652
1630 cur_offset = alloc_start; 1653 cur_offset = alloc_start;
1631 while (1) { 1654 while (1) {
1655 u64 actual_end;
1656
1632 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 1657 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1633 alloc_end - cur_offset, 0); 1658 alloc_end - cur_offset, 0);
1634 BUG_ON(IS_ERR_OR_NULL(em)); 1659 BUG_ON(IS_ERR_OR_NULL(em));
1635 last_byte = min(extent_map_end(em), alloc_end); 1660 last_byte = min(extent_map_end(em), alloc_end);
1661 actual_end = min_t(u64, extent_map_end(em), offset + len);
1636 last_byte = (last_byte + mask) & ~mask; 1662 last_byte = (last_byte + mask) & ~mask;
1663
1637 if (em->block_start == EXTENT_MAP_HOLE || 1664 if (em->block_start == EXTENT_MAP_HOLE ||
1638 (cur_offset >= inode->i_size && 1665 (cur_offset >= inode->i_size &&
1639 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1666 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
@@ -1646,6 +1673,16 @@ static long btrfs_fallocate(struct file *file, int mode,
1646 free_extent_map(em); 1673 free_extent_map(em);
1647 break; 1674 break;
1648 } 1675 }
1676 } else if (actual_end > inode->i_size &&
1677 !(mode & FALLOC_FL_KEEP_SIZE)) {
1678 /*
1679 * We didn't need to allocate any more space, but we
1680 * still extended the size of the file so we need to
1681 * update i_size.
1682 */
1683 inode->i_ctime = CURRENT_TIME;
1684 i_size_write(inode, actual_end);
1685 btrfs_ordered_update_i_size(inode, actual_end, NULL);
1649 } 1686 }
1650 free_extent_map(em); 1687 free_extent_map(em);
1651 1688
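The new branch handles the case where fallocate() finds the range already backed by a preallocated extent: nothing is allocated, but without FALLOC_FL_KEEP_SIZE the file size still has to grow to the end of the requested range. A small worked example of the actual_end arithmetic (all values illustrative):

#include <stdint.h>
#include <stdio.h>

static uint64_t min_u64(uint64_t a, uint64_t b)
{
	return a < b ? a : b;
}

int main(void)
{
	uint64_t i_size = 8192;		/* current file size */
	uint64_t offset = 4096;		/* fallocate offset */
	uint64_t len = 12288;		/* fallocate length */
	uint64_t extent_end = 65536;	/* prealloc extent already covers it */
	int keep_size = 0;		/* FALLOC_FL_KEEP_SIZE not set */

	uint64_t actual_end = min_u64(extent_end, offset + len);

	if (actual_end > i_size && !keep_size)
		i_size = actual_end;	/* grows to 16384, no new allocation */

	printf("new i_size: %llu\n", (unsigned long long)i_size);
	return 0;
}
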
@@ -1664,8 +1701,163 @@ out:
1664 return ret; 1701 return ret;
1665} 1702}
1666 1703
1704static int find_desired_extent(struct inode *inode, loff_t *offset, int origin)
1705{
1706 struct btrfs_root *root = BTRFS_I(inode)->root;
1707 struct extent_map *em;
1708 struct extent_state *cached_state = NULL;
1709 u64 lockstart = *offset;
1710 u64 lockend = i_size_read(inode);
1711 u64 start = *offset;
1712 u64 orig_start = *offset;
1713 u64 len = i_size_read(inode);
1714 u64 last_end = 0;
1715 int ret = 0;
1716
1717 lockend = max_t(u64, root->sectorsize, lockend);
1718 if (lockend <= lockstart)
1719 lockend = lockstart + root->sectorsize;
1720
1721 len = lockend - lockstart + 1;
1722
1723 len = max_t(u64, len, root->sectorsize);
1724 if (inode->i_size == 0)
1725 return -ENXIO;
1726
1727 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
1728 &cached_state, GFP_NOFS);
1729
1730 /*
1731 * Delalloc is such a pain. If we have a hole and we have pending
1732 * delalloc for a portion of the hole we will get back a hole that
1733 * exists for the entire range since it hasn't been actually written
1734 * yet. So to take care of this case we need to look for an extent just
1735 * before the position we want in case there is outstanding delalloc
1736 * going on here.
1737 */
1738 if (origin == SEEK_HOLE && start != 0) {
1739 if (start <= root->sectorsize)
1740 em = btrfs_get_extent_fiemap(inode, NULL, 0, 0,
1741 root->sectorsize, 0);
1742 else
1743 em = btrfs_get_extent_fiemap(inode, NULL, 0,
1744 start - root->sectorsize,
1745 root->sectorsize, 0);
1746 if (IS_ERR(em)) {
1747 ret = -ENXIO;
1748 goto out;
1749 }
1750 last_end = em->start + em->len;
1751 if (em->block_start == EXTENT_MAP_DELALLOC)
1752 last_end = min_t(u64, last_end, inode->i_size);
1753 free_extent_map(em);
1754 }
1755
1756 while (1) {
1757 em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
1758 if (IS_ERR(em)) {
1759 ret = -ENXIO;
1760 break;
1761 }
1762
1763 if (em->block_start == EXTENT_MAP_HOLE) {
1764 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
1765 if (last_end <= orig_start) {
1766 free_extent_map(em);
1767 ret = -ENXIO;
1768 break;
1769 }
1770 }
1771
1772 if (origin == SEEK_HOLE) {
1773 *offset = start;
1774 free_extent_map(em);
1775 break;
1776 }
1777 } else {
1778 if (origin == SEEK_DATA) {
1779 if (em->block_start == EXTENT_MAP_DELALLOC) {
1780 if (start >= inode->i_size) {
1781 free_extent_map(em);
1782 ret = -ENXIO;
1783 break;
1784 }
1785 }
1786
1787 *offset = start;
1788 free_extent_map(em);
1789 break;
1790 }
1791 }
1792
1793 start = em->start + em->len;
1794 last_end = em->start + em->len;
1795
1796 if (em->block_start == EXTENT_MAP_DELALLOC)
1797 last_end = min_t(u64, last_end, inode->i_size);
1798
1799 if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) {
1800 free_extent_map(em);
1801 ret = -ENXIO;
1802 break;
1803 }
1804 free_extent_map(em);
1805 cond_resched();
1806 }
1807 if (!ret)
1808 *offset = min(*offset, inode->i_size);
1809out:
1810 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
1811 &cached_state, GFP_NOFS);
1812 return ret;
1813}
1814
1815static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
1816{
1817 struct inode *inode = file->f_mapping->host;
1818 int ret;
1819
1820 mutex_lock(&inode->i_mutex);
1821 switch (origin) {
1822 case SEEK_END:
1823 case SEEK_CUR:
1824 offset = generic_file_llseek_unlocked(file, offset, origin);
1825 goto out;
1826 case SEEK_DATA:
1827 case SEEK_HOLE:
1828 if (offset >= i_size_read(inode)) {
1829 mutex_unlock(&inode->i_mutex);
1830 return -ENXIO;
1831 }
1832
1833 ret = find_desired_extent(inode, &offset, origin);
1834 if (ret) {
1835 mutex_unlock(&inode->i_mutex);
1836 return ret;
1837 }
1838 }
1839
1840 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) {
1841 offset = -EINVAL;
1842 goto out;
1843 }
1844 if (offset > inode->i_sb->s_maxbytes) {
1845 offset = -EINVAL;
1846 goto out;
1847 }
1848
1849 /* Special lock needed here? */
1850 if (offset != file->f_pos) {
1851 file->f_pos = offset;
1852 file->f_version = 0;
1853 }
1854out:
1855 mutex_unlock(&inode->i_mutex);
1856 return offset;
1857}
1858
1667const struct file_operations btrfs_file_operations = { 1859const struct file_operations btrfs_file_operations = {
1668 .llseek = generic_file_llseek, 1860 .llseek = btrfs_file_llseek,
1669 .read = do_sync_read, 1861 .read = do_sync_read,
1670 .write = do_sync_write, 1862 .write = do_sync_write,
1671 .aio_read = generic_file_aio_read, 1863 .aio_read = generic_file_aio_read,
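With btrfs_file_llseek() wired into .llseek, userspace can walk a sparse file with SEEK_DATA/SEEK_HOLE instead of reading through zeroes. A sketch of typical usage, assuming a libc that exposes the two constants under _GNU_SOURCE:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	off_t pos = 0;
	int fd;

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	for (;;) {
		off_t data = lseek(fd, pos, SEEK_DATA);
		off_t hole;

		if (data < 0)
			break;			/* ENXIO: no data past pos */
		hole = lseek(fd, data, SEEK_HOLE);
		if (hole < 0)
			break;
		printf("data: [%lld, %lld)\n",
		       (long long)data, (long long)hole);
		pos = hole;
	}
	close(fd);
	return 0;
}
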
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index bf0d61567f3..41ac927401d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -98,6 +98,12 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 98 return inode;
99 99
100 spin_lock(&block_group->lock); 100 spin_lock(&block_group->lock);
101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) {
102 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 }
106
101 if (!btrfs_fs_closing(root->fs_info)) { 107 if (!btrfs_fs_closing(root->fs_info)) {
102 block_group->inode = igrab(inode); 108 block_group->inode = igrab(inode);
103 block_group->iref = 1; 109 block_group->iref = 1;
@@ -135,7 +141,7 @@ int __create_free_space_inode(struct btrfs_root *root,
135 btrfs_set_inode_gid(leaf, inode_item, 0); 141 btrfs_set_inode_gid(leaf, inode_item, 0);
136 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
137 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
138 BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM); 144 BTRFS_INODE_PREALLOC);
139 btrfs_set_inode_nlink(leaf, inode_item, 1); 145 btrfs_set_inode_nlink(leaf, inode_item, 1);
140 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 146 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
141 btrfs_set_inode_block_group(leaf, inode_item, offset); 147 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -184,9 +190,11 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
184 struct btrfs_path *path, 190 struct btrfs_path *path,
185 struct inode *inode) 191 struct inode *inode)
186{ 192{
193 struct btrfs_block_rsv *rsv;
187 loff_t oldsize; 194 loff_t oldsize;
188 int ret = 0; 195 int ret = 0;
189 196
197 rsv = trans->block_rsv;
190 trans->block_rsv = root->orphan_block_rsv; 198 trans->block_rsv = root->orphan_block_rsv;
191 ret = btrfs_block_rsv_check(trans, root, 199 ret = btrfs_block_rsv_check(trans, root,
192 root->orphan_block_rsv, 200 root->orphan_block_rsv,
@@ -204,6 +212,8 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
204 */ 212 */
205 ret = btrfs_truncate_inode_items(trans, root, inode, 213 ret = btrfs_truncate_inode_items(trans, root, inode,
206 0, BTRFS_EXTENT_DATA_KEY); 214 0, BTRFS_EXTENT_DATA_KEY);
215
216 trans->block_rsv = rsv;
207 if (ret) { 217 if (ret) {
208 WARN_ON(1); 218 WARN_ON(1);
209 return ret; 219 return ret;
@@ -239,17 +249,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
239 struct btrfs_free_space_header *header; 249 struct btrfs_free_space_header *header;
240 struct extent_buffer *leaf; 250 struct extent_buffer *leaf;
241 struct page *page; 251 struct page *page;
242 u32 *checksums = NULL, *crc;
243 char *disk_crcs = NULL;
244 struct btrfs_key key; 252 struct btrfs_key key;
245 struct list_head bitmaps; 253 struct list_head bitmaps;
246 u64 num_entries; 254 u64 num_entries;
247 u64 num_bitmaps; 255 u64 num_bitmaps;
248 u64 generation; 256 u64 generation;
249 u32 cur_crc = ~(u32)0;
250 pgoff_t index = 0; 257 pgoff_t index = 0;
251 unsigned long first_page_offset;
252 int num_checksums;
253 int ret = 0; 258 int ret = 0;
254 259
255 INIT_LIST_HEAD(&bitmaps); 260 INIT_LIST_HEAD(&bitmaps);
@@ -292,16 +297,6 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
292 if (!num_entries) 297 if (!num_entries)
293 goto out; 298 goto out;
294 299
295 /* Setup everything for doing checksumming */
296 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
297 checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
298 if (!checksums)
299 goto out;
300 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
301 disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
302 if (!disk_crcs)
303 goto out;
304
305 ret = readahead_cache(inode); 300 ret = readahead_cache(inode);
306 if (ret) 301 if (ret)
307 goto out; 302 goto out;
@@ -311,18 +306,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
311 struct btrfs_free_space *e; 306 struct btrfs_free_space *e;
312 void *addr; 307 void *addr;
313 unsigned long offset = 0; 308 unsigned long offset = 0;
314 unsigned long start_offset = 0;
315 int need_loop = 0; 309 int need_loop = 0;
316 310
317 if (!num_entries && !num_bitmaps) 311 if (!num_entries && !num_bitmaps)
318 break; 312 break;
319 313
320 if (index == 0) { 314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
321 start_offset = first_page_offset;
322 offset = start_offset;
323 }
324
325 page = grab_cache_page(inode->i_mapping, index);
326 if (!page) 315 if (!page)
327 goto free_cache; 316 goto free_cache;
328 317
@@ -342,8 +331,15 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
342 if (index == 0) { 331 if (index == 0) {
343 u64 *gen; 332 u64 *gen;
344 333
345 memcpy(disk_crcs, addr, first_page_offset); 334 /*
346 gen = addr + (sizeof(u32) * num_checksums); 335 * We put a bogus crc in the front of the first page in
336 * case old kernels try to mount a fs with the new
337 * format to make sure they discard the cache.
338 */
339 addr += sizeof(u64);
340 offset += sizeof(u64);
341
342 gen = addr;
347 if (*gen != BTRFS_I(inode)->generation) { 343 if (*gen != BTRFS_I(inode)->generation) {
348 printk(KERN_ERR "btrfs: space cache generation" 344 printk(KERN_ERR "btrfs: space cache generation"
349 " (%llu) does not match inode (%llu)\n", 345 " (%llu) does not match inode (%llu)\n",
@@ -355,24 +351,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
355 page_cache_release(page); 351 page_cache_release(page);
356 goto free_cache; 352 goto free_cache;
357 } 353 }
358 crc = (u32 *)disk_crcs; 354 addr += sizeof(u64);
355 offset += sizeof(u64);
359 } 356 }
360 entry = addr + start_offset; 357 entry = addr;
361
362 /* First lets check our crc before we do anything fun */
363 cur_crc = ~(u32)0;
364 cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
365 PAGE_CACHE_SIZE - start_offset);
366 btrfs_csum_final(cur_crc, (char *)&cur_crc);
367 if (cur_crc != *crc) {
368 printk(KERN_ERR "btrfs: crc mismatch for page %lu\n",
369 index);
370 kunmap(page);
371 unlock_page(page);
372 page_cache_release(page);
373 goto free_cache;
374 }
375 crc++;
376 358
377 while (1) { 359 while (1) {
378 if (!num_entries) 360 if (!num_entries)
@@ -470,8 +452,6 @@ next:
470 452
471 ret = 1; 453 ret = 1;
472out: 454out:
473 kfree(checksums);
474 kfree(disk_crcs);
475 return ret; 455 return ret;
476free_cache: 456free_cache:
477 __btrfs_remove_free_space_cache(ctl); 457 __btrfs_remove_free_space_cache(ctl);
@@ -569,8 +549,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
569 struct btrfs_key key; 549 struct btrfs_key key;
570 u64 start, end, len; 550 u64 start, end, len;
571 u64 bytes = 0; 551 u64 bytes = 0;
572 u32 *crc, *checksums; 552 u32 crc = ~(u32)0;
573 unsigned long first_page_offset;
574 int index = 0, num_pages = 0; 553 int index = 0, num_pages = 0;
575 int entries = 0; 554 int entries = 0;
576 int bitmaps = 0; 555 int bitmaps = 0;
@@ -590,34 +569,13 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
590 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
591 PAGE_CACHE_SHIFT; 570 PAGE_CACHE_SHIFT;
592 571
593 /* Since the first page has all of our checksums and our generation we
594 * need to calculate the offset into the page that we can start writing
595 * our entries.
596 */
597 first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
598
599 filemap_write_and_wait(inode->i_mapping); 572 filemap_write_and_wait(inode->i_mapping);
600 btrfs_wait_ordered_range(inode, inode->i_size & 573 btrfs_wait_ordered_range(inode, inode->i_size &
601 ~(root->sectorsize - 1), (u64)-1); 574 ~(root->sectorsize - 1), (u64)-1);
602 575
603 /* make sure we don't overflow that first page */
604 if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) {
605 /* this is really the same as running out of space, where we also return 0 */
606 printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n");
607 ret = 0;
608 goto out_update;
609 }
610
611 /* We need a checksum per page. */
612 crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
613 if (!crc)
614 return -1;
615
616 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS); 576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
617 if (!pages) { 577 if (!pages)
618 kfree(crc);
619 return -1; 578 return -1;
620 }
621 579
622 /* Get the cluster for this block_group if it exists */ 580 /* Get the cluster for this block_group if it exists */
623 if (block_group && !list_empty(&block_group->cluster_list)) 581 if (block_group && !list_empty(&block_group->cluster_list))
@@ -640,7 +598,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
640 * know and don't freak out. 598 * know and don't freak out.
641 */ 599 */
642 while (index < num_pages) { 600 while (index < num_pages) {
643 page = grab_cache_page(inode->i_mapping, index); 601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
644 if (!page) { 602 if (!page) {
645 int i; 603 int i;
646 604
@@ -648,7 +606,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
648 unlock_page(pages[i]); 606 unlock_page(pages[i]);
649 page_cache_release(pages[i]); 607 page_cache_release(pages[i]);
650 } 608 }
651 goto out_free; 609 goto out;
652 } 610 }
653 pages[index] = page; 611 pages[index] = page;
654 index++; 612 index++;
@@ -668,17 +626,11 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
668 /* Write out the extent entries */ 626 /* Write out the extent entries */
669 do { 627 do {
670 struct btrfs_free_space_entry *entry; 628 struct btrfs_free_space_entry *entry;
671 void *addr; 629 void *addr, *orig;
672 unsigned long offset = 0; 630 unsigned long offset = 0;
673 unsigned long start_offset = 0;
674 631
675 next_page = false; 632 next_page = false;
676 633
677 if (index == 0) {
678 start_offset = first_page_offset;
679 offset = start_offset;
680 }
681
682 if (index >= num_pages) { 634 if (index >= num_pages) {
683 out_of_space = true; 635 out_of_space = true;
684 break; 636 break;
@@ -686,10 +638,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
686 638
687 page = pages[index]; 639 page = pages[index];
688 640
689 addr = kmap(page); 641 orig = addr = kmap(page);
690 entry = addr + start_offset; 642 if (index == 0) {
643 u64 *gen;
691 644
692 memset(addr, 0, PAGE_CACHE_SIZE); 645 /*
646 * We're going to put in a bogus crc for this page to
647 * make sure that old kernels that aren't aware of this
648 * format will be sure to discard the cache.
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652
653 gen = addr;
654 *gen = trans->transid;
655 addr += sizeof(u64);
656 offset += sizeof(u64);
657 }
658 entry = addr;
659
660 memset(addr, 0, PAGE_CACHE_SIZE - offset);
693 while (node && !next_page) { 661 while (node && !next_page) {
694 struct btrfs_free_space *e; 662 struct btrfs_free_space *e;
695 663
@@ -752,13 +720,19 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
752 next_page = true; 720 next_page = true;
753 entry++; 721 entry++;
754 } 722 }
755 *crc = ~(u32)0;
756 *crc = btrfs_csum_data(root, addr + start_offset, *crc,
757 PAGE_CACHE_SIZE - start_offset);
758 kunmap(page);
759 723
760 btrfs_csum_final(*crc, (char *)crc); 724 /* Generate bogus crc value */
761 crc++; 725 if (index == 0) {
726 u32 *tmp;
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc,
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734
735 kunmap(page);
762 736
763 bytes += PAGE_CACHE_SIZE; 737 bytes += PAGE_CACHE_SIZE;
764 738
@@ -779,11 +753,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
779 753
780 addr = kmap(page); 754 addr = kmap(page);
781 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE); 755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
782 *crc = ~(u32)0;
783 *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
784 kunmap(page); 756 kunmap(page);
785 btrfs_csum_final(*crc, (char *)crc);
786 crc++;
787 bytes += PAGE_CACHE_SIZE; 757 bytes += PAGE_CACHE_SIZE;
788 758
789 list_del_init(&entry->list); 759 list_del_init(&entry->list);
@@ -796,7 +766,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
796 i_size_read(inode) - 1, &cached_state, 766 i_size_read(inode) - 1, &cached_state,
797 GFP_NOFS); 767 GFP_NOFS);
798 ret = 0; 768 ret = 0;
799 goto out_free; 769 goto out;
800 } 770 }
801 771
802 /* Zero out the rest of the pages just to make sure */ 772 /* Zero out the rest of the pages just to make sure */
@@ -811,20 +781,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
811 index++; 781 index++;
812 } 782 }
813 783
814 /* Write the checksums and trans id to the first page */
815 {
816 void *addr;
817 u64 *gen;
818
819 page = pages[0];
820
821 addr = kmap(page);
822 memcpy(addr, checksums, sizeof(u32) * num_pages);
823 gen = addr + (sizeof(u32) * num_pages);
824 *gen = trans->transid;
825 kunmap(page);
826 }
827
828 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, 784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
829 bytes, &cached_state); 785 bytes, &cached_state);
830 btrfs_drop_pages(pages, num_pages); 786 btrfs_drop_pages(pages, num_pages);
@@ -833,7 +789,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 789
834 if (ret) { 790 if (ret) {
835 ret = 0; 791 ret = 0;
836 goto out_free; 792 goto out;
837 } 793 }
838 794
839 BTRFS_I(inode)->generation = trans->transid; 795 BTRFS_I(inode)->generation = trans->transid;
@@ -850,7 +806,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
850 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
851 EXTENT_DIRTY | EXTENT_DELALLOC | 807 EXTENT_DIRTY | EXTENT_DELALLOC |
852 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS); 808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
853 goto out_free; 809 goto out;
854 } 810 }
855 leaf = path->nodes[0]; 811 leaf = path->nodes[0];
856 if (ret > 0) { 812 if (ret > 0) {
@@ -866,7 +822,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
866 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 822 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
867 GFP_NOFS); 823 GFP_NOFS);
868 btrfs_release_path(path); 824 btrfs_release_path(path);
869 goto out_free; 825 goto out;
870 } 826 }
871 } 827 }
872 header = btrfs_item_ptr(leaf, path->slots[0], 828 header = btrfs_item_ptr(leaf, path->slots[0],
@@ -879,11 +835,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
879 835
880 ret = 1; 836 ret = 1;
881 837
882out_free: 838out:
883 kfree(checksums);
884 kfree(pages); 839 kfree(pages);
885
886out_update:
887 if (ret != 1) { 840 if (ret != 1) {
888 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 841 invalidate_inode_pages2_range(inode->i_mapping, 0, index);
889 BTRFS_I(inode)->generation = 0; 842 BTRFS_I(inode)->generation = 0;
@@ -1219,9 +1172,9 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1219 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); 1172 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
1220} 1173}
1221 1174
1222static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, 1175static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1223 struct btrfs_free_space *info, u64 offset, 1176 struct btrfs_free_space *info,
1224 u64 bytes) 1177 u64 offset, u64 bytes)
1225{ 1178{
1226 unsigned long start, count; 1179 unsigned long start, count;
1227 1180
@@ -1232,6 +1185,13 @@ static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1232 bitmap_clear(info->bitmap, start, count); 1185 bitmap_clear(info->bitmap, start, count);
1233 1186
1234 info->bytes -= bytes; 1187 info->bytes -= bytes;
1188}
1189
1190static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1191 struct btrfs_free_space *info, u64 offset,
1192 u64 bytes)
1193{
1194 __bitmap_clear_bits(ctl, info, offset, bytes);
1235 ctl->free_space -= bytes; 1195 ctl->free_space -= bytes;
1236} 1196}
1237 1197
@@ -2035,7 +1995,7 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2035 return 0; 1995 return 0;
2036 1996
2037 ret = search_start; 1997 ret = search_start;
2038 bitmap_clear_bits(ctl, entry, ret, bytes); 1998 __bitmap_clear_bits(ctl, entry, ret, bytes);
2039 1999
2040 return ret; 2000 return ret;
2041} 2001}
@@ -2090,7 +2050,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2090 continue; 2050 continue;
2091 } 2051 }
2092 } else { 2052 } else {
2093
2094 ret = entry->offset; 2053 ret = entry->offset;
2095 2054
2096 entry->offset += bytes; 2055 entry->offset += bytes;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3601f0aebdd..b2d004ad66a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -750,15 +750,6 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
750 return alloc_hint; 750 return alloc_hint;
751} 751}
752 752
753static inline bool is_free_space_inode(struct btrfs_root *root,
754 struct inode *inode)
755{
756 if (root == root->fs_info->tree_root ||
757 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
758 return true;
759 return false;
760}
761
762/* 753/*
763 * when extent_io.c finds a delayed allocation range in the file, 754 * when extent_io.c finds a delayed allocation range in the file,
764 * the call backs end up in this code. The basic idea is to 755 * the call backs end up in this code. The basic idea is to
@@ -791,7 +782,7 @@ static noinline int cow_file_range(struct inode *inode,
791 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 782 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
792 int ret = 0; 783 int ret = 0;
793 784
794 BUG_ON(is_free_space_inode(root, inode)); 785 BUG_ON(btrfs_is_free_space_inode(root, inode));
795 trans = btrfs_join_transaction(root); 786 trans = btrfs_join_transaction(root);
796 BUG_ON(IS_ERR(trans)); 787 BUG_ON(IS_ERR(trans));
797 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 788 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
@@ -1070,9 +1061,10 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1070 u64 ino = btrfs_ino(inode); 1061 u64 ino = btrfs_ino(inode);
1071 1062
1072 path = btrfs_alloc_path(); 1063 path = btrfs_alloc_path();
1073 BUG_ON(!path); 1064 if (!path)
1065 return -ENOMEM;
1074 1066
1075 nolock = is_free_space_inode(root, inode); 1067 nolock = btrfs_is_free_space_inode(root, inode);
1076 1068
1077 if (nolock) 1069 if (nolock)
1078 trans = btrfs_join_transaction_nolock(root); 1070 trans = btrfs_join_transaction_nolock(root);
@@ -1291,15 +1283,16 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1291 return ret; 1283 return ret;
1292} 1284}
1293 1285
1294static int btrfs_split_extent_hook(struct inode *inode, 1286static void btrfs_split_extent_hook(struct inode *inode,
1295 struct extent_state *orig, u64 split) 1287 struct extent_state *orig, u64 split)
1296{ 1288{
1297 /* not delalloc, ignore it */ 1289 /* not delalloc, ignore it */
1298 if (!(orig->state & EXTENT_DELALLOC)) 1290 if (!(orig->state & EXTENT_DELALLOC))
1299 return 0; 1291 return;
1300 1292
1301 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 1293 spin_lock(&BTRFS_I(inode)->lock);
1302 return 0; 1294 BTRFS_I(inode)->outstanding_extents++;
1295 spin_unlock(&BTRFS_I(inode)->lock);
1303} 1296}
1304 1297
1305/* 1298/*
@@ -1308,16 +1301,17 @@ static int btrfs_split_extent_hook(struct inode *inode,
1308 * extents, such as when we are doing sequential writes, so we can properly 1301 * extents, such as when we are doing sequential writes, so we can properly
1309 * account for the metadata space we'll need. 1302 * account for the metadata space we'll need.
1310 */ 1303 */
1311static int btrfs_merge_extent_hook(struct inode *inode, 1304static void btrfs_merge_extent_hook(struct inode *inode,
1312 struct extent_state *new, 1305 struct extent_state *new,
1313 struct extent_state *other) 1306 struct extent_state *other)
1314{ 1307{
1315 /* not delalloc, ignore it */ 1308 /* not delalloc, ignore it */
1316 if (!(other->state & EXTENT_DELALLOC)) 1309 if (!(other->state & EXTENT_DELALLOC))
1317 return 0; 1310 return;
1318 1311
1319 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 1312 spin_lock(&BTRFS_I(inode)->lock);
1320 return 0; 1313 BTRFS_I(inode)->outstanding_extents--;
1314 spin_unlock(&BTRFS_I(inode)->lock);
1321} 1315}
1322 1316
1323/* 1317/*
@@ -1325,8 +1319,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1325 * bytes in this file, and to maintain the list of inodes that 1319 * bytes in this file, and to maintain the list of inodes that
1326 * have pending delalloc work to be done. 1320 * have pending delalloc work to be done.
1327 */ 1321 */
1328static int btrfs_set_bit_hook(struct inode *inode, 1322static void btrfs_set_bit_hook(struct inode *inode,
1329 struct extent_state *state, int *bits) 1323 struct extent_state *state, int *bits)
1330{ 1324{
1331 1325
1332 /* 1326 /*
@@ -1337,12 +1331,15 @@ static int btrfs_set_bit_hook(struct inode *inode,
1337 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1331 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1338 struct btrfs_root *root = BTRFS_I(inode)->root; 1332 struct btrfs_root *root = BTRFS_I(inode)->root;
1339 u64 len = state->end + 1 - state->start; 1333 u64 len = state->end + 1 - state->start;
1340 bool do_list = !is_free_space_inode(root, inode); 1334 bool do_list = !btrfs_is_free_space_inode(root, inode);
1341 1335
1342 if (*bits & EXTENT_FIRST_DELALLOC) 1336 if (*bits & EXTENT_FIRST_DELALLOC) {
1343 *bits &= ~EXTENT_FIRST_DELALLOC; 1337 *bits &= ~EXTENT_FIRST_DELALLOC;
1344 else 1338 } else {
1345 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 1339 spin_lock(&BTRFS_I(inode)->lock);
1340 BTRFS_I(inode)->outstanding_extents++;
1341 spin_unlock(&BTRFS_I(inode)->lock);
1342 }
1346 1343
1347 spin_lock(&root->fs_info->delalloc_lock); 1344 spin_lock(&root->fs_info->delalloc_lock);
1348 BTRFS_I(inode)->delalloc_bytes += len; 1345 BTRFS_I(inode)->delalloc_bytes += len;
@@ -1353,14 +1350,13 @@ static int btrfs_set_bit_hook(struct inode *inode,
1353 } 1350 }
1354 spin_unlock(&root->fs_info->delalloc_lock); 1351 spin_unlock(&root->fs_info->delalloc_lock);
1355 } 1352 }
1356 return 0;
1357} 1353}
1358 1354
1359/* 1355/*
1360 * extent_io.c clear_bit_hook, see set_bit_hook for why 1356 * extent_io.c clear_bit_hook, see set_bit_hook for why
1361 */ 1357 */
1362static int btrfs_clear_bit_hook(struct inode *inode, 1358static void btrfs_clear_bit_hook(struct inode *inode,
1363 struct extent_state *state, int *bits) 1359 struct extent_state *state, int *bits)
1364{ 1360{
1365 /* 1361 /*
1366 * set_bit and clear bit hooks normally require _irqsave/restore 1362 * set_bit and clear bit hooks normally require _irqsave/restore
@@ -1370,12 +1366,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1370 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1366 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1371 struct btrfs_root *root = BTRFS_I(inode)->root; 1367 struct btrfs_root *root = BTRFS_I(inode)->root;
1372 u64 len = state->end + 1 - state->start; 1368 u64 len = state->end + 1 - state->start;
1373 bool do_list = !is_free_space_inode(root, inode); 1369 bool do_list = !btrfs_is_free_space_inode(root, inode);
1374 1370
1375 if (*bits & EXTENT_FIRST_DELALLOC) 1371 if (*bits & EXTENT_FIRST_DELALLOC) {
1376 *bits &= ~EXTENT_FIRST_DELALLOC; 1372 *bits &= ~EXTENT_FIRST_DELALLOC;
1377 else if (!(*bits & EXTENT_DO_ACCOUNTING)) 1373 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1378 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 1374 spin_lock(&BTRFS_I(inode)->lock);
1375 BTRFS_I(inode)->outstanding_extents--;
1376 spin_unlock(&BTRFS_I(inode)->lock);
1377 }
1379 1378
1380 if (*bits & EXTENT_DO_ACCOUNTING) 1379 if (*bits & EXTENT_DO_ACCOUNTING)
1381 btrfs_delalloc_release_metadata(inode, len); 1380 btrfs_delalloc_release_metadata(inode, len);
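Throughout these hooks, outstanding_extents and reserved_extents stop being atomics and become plain integers guarded by the new per-inode spinlock, so related counters can later be read and updated together in one critical section. A userspace sketch of the pattern using pthread spinlocks (names illustrative):

#include <pthread.h>
#include <stdio.h>

struct inode_accounting {
	pthread_spinlock_t lock;
	int outstanding_extents;
	int reserved_extents;
};

/* split: one delalloc extent becomes two */
static void extent_split(struct inode_accounting *a)
{
	pthread_spin_lock(&a->lock);
	a->outstanding_extents++;
	pthread_spin_unlock(&a->lock);
}

/* merge: two adjacent extents become one */
static void extent_merge(struct inode_accounting *a)
{
	pthread_spin_lock(&a->lock);
	a->outstanding_extents--;
	pthread_spin_unlock(&a->lock);
}

int main(void)
{
	struct inode_accounting a = { .outstanding_extents = 0 };

	pthread_spin_init(&a.lock, PTHREAD_PROCESS_PRIVATE);
	extent_split(&a);
	extent_merge(&a);
	printf("outstanding: %d\n", a.outstanding_extents);
	pthread_spin_destroy(&a.lock);
	return 0;
}
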
@@ -1394,7 +1393,6 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1394 } 1393 }
1395 spin_unlock(&root->fs_info->delalloc_lock); 1394 spin_unlock(&root->fs_info->delalloc_lock);
1396 } 1395 }
1397 return 0;
1398} 1396}
1399 1397
1400/* 1398/*
@@ -1477,7 +1475,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1477 1475
1478 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1476 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1479 1477
1480 if (is_free_space_inode(root, inode)) 1478 if (btrfs_is_free_space_inode(root, inode))
1481 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2); 1479 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
1482 else 1480 else
1483 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1481 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
@@ -1644,7 +1642,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1644 int ret; 1642 int ret;
1645 1643
1646 path = btrfs_alloc_path(); 1644 path = btrfs_alloc_path();
1647 BUG_ON(!path); 1645 if (!path)
1646 return -ENOMEM;
1648 1647
1649 path->leave_spinning = 1; 1648 path->leave_spinning = 1;
1650 1649
@@ -1726,7 +1725,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1726 return 0; 1725 return 0;
1727 BUG_ON(!ordered_extent); 1726 BUG_ON(!ordered_extent);
1728 1727
1729 nolock = is_free_space_inode(root, inode); 1728 nolock = btrfs_is_free_space_inode(root, inode);
1730 1729
1731 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1730 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1732 BUG_ON(!list_empty(&ordered_extent->list)); 1731 BUG_ON(!list_empty(&ordered_extent->list));
@@ -1787,7 +1786,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1787 &ordered_extent->list); 1786 &ordered_extent->list);
1788 1787
1789 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1788 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1790 if (!ret) { 1789 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1791 ret = btrfs_update_inode(trans, root, inode); 1790 ret = btrfs_update_inode(trans, root, inode);
1792 BUG_ON(ret); 1791 BUG_ON(ret);
1793 } 1792 }
@@ -2214,7 +2213,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2214 2213
2215 if (!root->orphan_block_rsv) { 2214 if (!root->orphan_block_rsv) {
2216 block_rsv = btrfs_alloc_block_rsv(root); 2215 block_rsv = btrfs_alloc_block_rsv(root);
2217 BUG_ON(!block_rsv); 2216 if (!block_rsv)
2217 return -ENOMEM;
2218 } 2218 }
2219 2219
2220 spin_lock(&root->orphan_lock); 2220 spin_lock(&root->orphan_lock);
@@ -2516,7 +2516,9 @@ static void btrfs_read_locked_inode(struct inode *inode)
2516 filled = true; 2516 filled = true;
2517 2517
2518 path = btrfs_alloc_path(); 2518 path = btrfs_alloc_path();
2519 BUG_ON(!path); 2519 if (!path)
2520 goto make_bad;
2521
2520 path->leave_spinning = 1; 2522 path->leave_spinning = 1;
2521 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2523 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
2522 2524
@@ -2531,13 +2533,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
2531 2533
2532 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2534 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2533 struct btrfs_inode_item); 2535 struct btrfs_inode_item);
2534 if (!leaf->map_token)
2535 map_private_extent_buffer(leaf, (unsigned long)inode_item,
2536 sizeof(struct btrfs_inode_item),
2537 &leaf->map_token, &leaf->kaddr,
2538 &leaf->map_start, &leaf->map_len,
2539 KM_USER1);
2540
2541 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2536 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2542 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2537 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
2543 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2538 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
@@ -2575,11 +2570,6 @@ cache_acl:
2575 if (!maybe_acls) 2570 if (!maybe_acls)
2576 cache_no_acl(inode); 2571 cache_no_acl(inode);
2577 2572
2578 if (leaf->map_token) {
2579 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2580 leaf->map_token = NULL;
2581 }
2582
2583 btrfs_free_path(path); 2573 btrfs_free_path(path);
2584 2574
2585 switch (inode->i_mode & S_IFMT) { 2575 switch (inode->i_mode & S_IFMT) {
@@ -2624,13 +2614,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2624 struct btrfs_inode_item *item, 2614 struct btrfs_inode_item *item,
2625 struct inode *inode) 2615 struct inode *inode)
2626{ 2616{
2627 if (!leaf->map_token)
2628 map_private_extent_buffer(leaf, (unsigned long)item,
2629 sizeof(struct btrfs_inode_item),
2630 &leaf->map_token, &leaf->kaddr,
2631 &leaf->map_start, &leaf->map_len,
2632 KM_USER1);
2633
2634 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2617 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2635 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2618 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2636 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2619 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2659,11 +2642,6 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2659 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2642 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2660 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2643 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2661 btrfs_set_inode_block_group(leaf, item, 0); 2644 btrfs_set_inode_block_group(leaf, item, 0);
2662
2663 if (leaf->map_token) {
2664 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2665 leaf->map_token = NULL;
2666 }
2667} 2645}
2668 2646
2669/* 2647/*
@@ -2684,7 +2662,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2684 * The data relocation inode should also be directly updated 2662 * The data relocation inode should also be directly updated
2685 * without delay 2663 * without delay
2686 */ 2664 */
2687 if (!is_free_space_inode(root, inode) 2665 if (!btrfs_is_free_space_inode(root, inode)
2688 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { 2666 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2689 ret = btrfs_delayed_update_inode(trans, root, inode); 2667 ret = btrfs_delayed_update_inode(trans, root, inode);
2690 if (!ret) 2668 if (!ret)
@@ -3021,13 +2999,16 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3021 2999
3022 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3000 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3023 dentry->d_name.name, dentry->d_name.len); 3001 dentry->d_name.name, dentry->d_name.len);
3024 BUG_ON(ret); 3002 if (ret)
3003 goto out;
3025 3004
3026 if (inode->i_nlink == 0) { 3005 if (inode->i_nlink == 0) {
3027 ret = btrfs_orphan_add(trans, inode); 3006 ret = btrfs_orphan_add(trans, inode);
3028 BUG_ON(ret); 3007 if (ret)
3008 goto out;
3029 } 3009 }
3030 3010
3011out:
3031 nr = trans->blocks_used; 3012 nr = trans->blocks_used;
3032 __unlink_end_trans(trans, root); 3013 __unlink_end_trans(trans, root);
3033 btrfs_btree_balance_dirty(root, nr); 3014 btrfs_btree_balance_dirty(root, nr);
@@ -3170,6 +3151,11 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3170 3151
3171 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3152 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3172 3153
3154 path = btrfs_alloc_path();
3155 if (!path)
3156 return -ENOMEM;
3157 path->reada = -1;
3158
3173 if (root->ref_cows || root == root->fs_info->tree_root) 3159 if (root->ref_cows || root == root->fs_info->tree_root)
3174 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3160 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
3175 3161
@@ -3182,10 +3168,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3182 if (min_type == 0 && root == BTRFS_I(inode)->root) 3168 if (min_type == 0 && root == BTRFS_I(inode)->root)
3183 btrfs_kill_delayed_inode_items(inode); 3169 btrfs_kill_delayed_inode_items(inode);
3184 3170
3185 path = btrfs_alloc_path();
3186 BUG_ON(!path);
3187 path->reada = -1;
3188
3189 key.objectid = ino; 3171 key.objectid = ino;
3190 key.offset = (u64)-1; 3172 key.offset = (u64)-1;
3191 key.type = (u8)-1; 3173 key.type = (u8)-1;
@@ -3398,7 +3380,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3398 3380
3399 ret = -ENOMEM; 3381 ret = -ENOMEM;
3400again: 3382again:
3401 page = grab_cache_page(mapping, index); 3383 page = find_or_create_page(mapping, index, GFP_NOFS);
3402 if (!page) { 3384 if (!page) {
3403 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3404 goto out; 3386 goto out;
@@ -3528,15 +3510,19 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3528 err = btrfs_drop_extents(trans, inode, cur_offset, 3510 err = btrfs_drop_extents(trans, inode, cur_offset,
3529 cur_offset + hole_size, 3511 cur_offset + hole_size,
3530 &hint_byte, 1); 3512 &hint_byte, 1);
3531 if (err) 3513 if (err) {
3514 btrfs_end_transaction(trans, root);
3532 break; 3515 break;
3516 }
3533 3517
3534 err = btrfs_insert_file_extent(trans, root, 3518 err = btrfs_insert_file_extent(trans, root,
3535 btrfs_ino(inode), cur_offset, 0, 3519 btrfs_ino(inode), cur_offset, 0,
3536 0, hole_size, 0, hole_size, 3520 0, hole_size, 0, hole_size,
3537 0, 0, 0); 3521 0, 0, 0);
3538 if (err) 3522 if (err) {
3523 btrfs_end_transaction(trans, root);
3539 break; 3524 break;
3525 }
3540 3526
3541 btrfs_drop_extent_cache(inode, hole_start, 3527 btrfs_drop_extent_cache(inode, hole_start,
3542 last_byte - 1, 0); 3528 last_byte - 1, 0);
@@ -3634,7 +3620,7 @@ void btrfs_evict_inode(struct inode *inode)
3634 3620
3635 truncate_inode_pages(&inode->i_data, 0); 3621 truncate_inode_pages(&inode->i_data, 0);
3636 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 || 3622 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3637 is_free_space_inode(root, inode))) 3623 btrfs_is_free_space_inode(root, inode)))
3638 goto no_delete; 3624 goto no_delete;
3639 3625
3640 if (is_bad_inode(inode)) { 3626 if (is_bad_inode(inode)) {
@@ -3713,7 +3699,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
3713 int ret = 0; 3699 int ret = 0;
3714 3700
3715 path = btrfs_alloc_path(); 3701 path = btrfs_alloc_path();
3716 BUG_ON(!path); 3702 if (!path)
3703 return -ENOMEM;
3717 3704
3718 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name, 3705 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
3719 namelen, 0); 3706 namelen, 0);
@@ -3978,10 +3965,16 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3978 BTRFS_I(inode)->root = root; 3965 BTRFS_I(inode)->root = root;
3979 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3966 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
3980 btrfs_read_locked_inode(inode); 3967 btrfs_read_locked_inode(inode);
3981 inode_tree_add(inode); 3968 if (!is_bad_inode(inode)) {
3982 unlock_new_inode(inode); 3969 inode_tree_add(inode);
3983 if (new) 3970 unlock_new_inode(inode);
3984 *new = 1; 3971 if (new)
3972 *new = 1;
3973 } else {
3974 unlock_new_inode(inode);
3975 iput(inode);
3976 inode = ERR_PTR(-ESTALE);
3977 }
3985 } 3978 }
3986 3979
3987 return inode; 3980 return inode;
@@ -4016,12 +4009,20 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4016 struct btrfs_root *sub_root = root; 4009 struct btrfs_root *sub_root = root;
4017 struct btrfs_key location; 4010 struct btrfs_key location;
4018 int index; 4011 int index;
4019 int ret; 4012 int ret = 0;
4020 4013
4021 if (dentry->d_name.len > BTRFS_NAME_LEN) 4014 if (dentry->d_name.len > BTRFS_NAME_LEN)
4022 return ERR_PTR(-ENAMETOOLONG); 4015 return ERR_PTR(-ENAMETOOLONG);
4023 4016
4024 ret = btrfs_inode_by_name(dir, dentry, &location); 4017 if (unlikely(d_need_lookup(dentry))) {
4018 memcpy(&location, dentry->d_fsdata, sizeof(struct btrfs_key));
4019 kfree(dentry->d_fsdata);
4020 dentry->d_fsdata = NULL;
4021 /* This thing is hashed, drop it for now */
4022 d_drop(dentry);
4023 } else {
4024 ret = btrfs_inode_by_name(dir, dentry, &location);
4025 }
4025 4026
4026 if (ret < 0) 4027 if (ret < 0)
4027 return ERR_PTR(ret); 4028 return ERR_PTR(ret);
@@ -4076,16 +4077,24 @@ static int btrfs_dentry_delete(const struct dentry *dentry)
4076 return 0; 4077 return 0;
4077} 4078}
4078 4079
4080static void btrfs_dentry_release(struct dentry *dentry)
4081{
4082 if (dentry->d_fsdata)
4083 kfree(dentry->d_fsdata);
4084}
4085
4079static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, 4086static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4080 struct nameidata *nd) 4087 struct nameidata *nd)
4081{ 4088{
4082 struct inode *inode; 4089 struct dentry *ret;
4083
4084 inode = btrfs_lookup_dentry(dir, dentry);
4085 if (IS_ERR(inode))
4086 return ERR_CAST(inode);
4087 4090
4088 return d_splice_alias(inode, dentry); 4091 ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
4092 if (unlikely(d_need_lookup(dentry))) {
4093 spin_lock(&dentry->d_lock);
4094 dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
4095 spin_unlock(&dentry->d_lock);
4096 }
4097 return ret;
4089} 4098}
4090 4099
4091unsigned char btrfs_filetype_table[] = { 4100unsigned char btrfs_filetype_table[] = {
@@ -4104,6 +4113,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4104 struct btrfs_path *path; 4113 struct btrfs_path *path;
4105 struct list_head ins_list; 4114 struct list_head ins_list;
4106 struct list_head del_list; 4115 struct list_head del_list;
4116 struct qstr q;
4107 int ret; 4117 int ret;
4108 struct extent_buffer *leaf; 4118 struct extent_buffer *leaf;
4109 int slot; 4119 int slot;
@@ -4124,7 +4134,8 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4124 4134
4125 /* special case for "." */ 4135 /* special case for "." */
4126 if (filp->f_pos == 0) { 4136 if (filp->f_pos == 0) {
4127 over = filldir(dirent, ".", 1, 1, btrfs_ino(inode), DT_DIR); 4137 over = filldir(dirent, ".", 1,
4138 filp->f_pos, btrfs_ino(inode), DT_DIR);
4128 if (over) 4139 if (over)
4129 return 0; 4140 return 0;
4130 filp->f_pos = 1; 4141 filp->f_pos = 1;
@@ -4133,7 +4144,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4133 if (filp->f_pos == 1) { 4144 if (filp->f_pos == 1) {
4134 u64 pino = parent_ino(filp->f_path.dentry); 4145 u64 pino = parent_ino(filp->f_path.dentry);
4135 over = filldir(dirent, "..", 2, 4146 over = filldir(dirent, "..", 2,
4136 2, pino, DT_DIR); 4147 filp->f_pos, pino, DT_DIR);
4137 if (over) 4148 if (over)
4138 return 0; 4149 return 0;
4139 filp->f_pos = 2; 4150 filp->f_pos = 2;
@@ -4193,6 +4204,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4193 4204
4194 while (di_cur < di_total) { 4205 while (di_cur < di_total) {
4195 struct btrfs_key location; 4206 struct btrfs_key location;
4207 struct dentry *tmp;
4196 4208
4197 if (verify_dir_item(root, leaf, di)) 4209 if (verify_dir_item(root, leaf, di))
4198 break; 4210 break;
@@ -4213,6 +4225,33 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4213 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; 4225 d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
4214 btrfs_dir_item_key_to_cpu(leaf, di, &location); 4226 btrfs_dir_item_key_to_cpu(leaf, di, &location);
4215 4227
4228 q.name = name_ptr;
4229 q.len = name_len;
4230 q.hash = full_name_hash(q.name, q.len);
4231 tmp = d_lookup(filp->f_dentry, &q);
4232 if (!tmp) {
4233 struct btrfs_key *newkey;
4234
4235 newkey = kzalloc(sizeof(struct btrfs_key),
4236 GFP_NOFS);
4237 if (!newkey)
4238 goto no_dentry;
4239 tmp = d_alloc(filp->f_dentry, &q);
4240 if (!tmp) {
4241 kfree(newkey);
4242 dput(tmp);
4243 goto no_dentry;
4244 }
4245 memcpy(newkey, &location,
4246 sizeof(struct btrfs_key));
4247 tmp->d_fsdata = newkey;
4248 tmp->d_flags |= DCACHE_NEED_LOOKUP;
4249 d_rehash(tmp);
4250 dput(tmp);
4251 } else {
4252 dput(tmp);
4253 }
4254no_dentry:
4216 /* is this a reference to our own snapshot? If so 4255 /* is this a reference to our own snapshot? If so
4217 * skip it 4256 * skip it
4218 */ 4257 */
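The readdir path above pre-populates the dcache: for each name it allocates a dentry, stashes the on-disk key in d_fsdata, and sets DCACHE_NEED_LOOKUP so a later btrfs_lookup_dentry() can consume the key instead of searching the tree. A simplified userspace model of that hand-off (all types here are stand-ins, not VFS structures):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct key {
	unsigned long long objectid;
};

struct dentry_model {
	int need_lookup;	/* stands in for DCACHE_NEED_LOOKUP */
	void *fsdata;		/* stands in for dentry->d_fsdata */
};

static void readdir_prepopulate(struct dentry_model *d,
				unsigned long long objectid)
{
	struct key *k = malloc(sizeof(*k));

	if (!k)
		return;		/* a normal lookup would happen later */
	k->objectid = objectid;
	d->fsdata = k;
	d->need_lookup = 1;
}

static unsigned long long lookup(struct dentry_model *d)
{
	if (d->need_lookup && d->fsdata) {
		struct key k;

		memcpy(&k, d->fsdata, sizeof(k));
		free(d->fsdata);
		d->fsdata = NULL;
		d->need_lookup = 0;
		return k.objectid;	/* tree search skipped */
	}
	return 0;			/* would search the b-tree here */
}

int main(void)
{
	struct dentry_model d = { 0, NULL };

	readdir_prepopulate(&d, 257);
	printf("ino %llu\n", lookup(&d));
	return 0;
}
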
@@ -4277,7 +4316,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4277 if (BTRFS_I(inode)->dummy_inode) 4316 if (BTRFS_I(inode)->dummy_inode)
4278 return 0; 4317 return 0;
4279 4318
4280 if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode)) 4319 if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
4281 nolock = true; 4320 nolock = true;
4282 4321
4283 if (wbc->sync_mode == WB_SYNC_ALL) { 4322 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4438,7 +4477,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4438 int owner; 4477 int owner;
4439 4478
4440 path = btrfs_alloc_path(); 4479 path = btrfs_alloc_path();
4441 BUG_ON(!path); 4480 if (!path)
4481 return ERR_PTR(-ENOMEM);
4442 4482
4443 inode = new_inode(root->fs_info->sb); 4483 inode = new_inode(root->fs_info->sb);
4444 if (!inode) { 4484 if (!inode) {
@@ -4473,7 +4513,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4473 inode->i_generation = BTRFS_I(inode)->generation; 4513 inode->i_generation = BTRFS_I(inode)->generation;
4474 btrfs_set_inode_space_info(root, inode); 4514 btrfs_set_inode_space_info(root, inode);
4475 4515
4476 if (mode & S_IFDIR) 4516 if (S_ISDIR(mode))
4477 owner = 0; 4517 owner = 0;
4478 else 4518 else
4479 owner = 1; 4519 owner = 1;
@@ -4518,7 +4558,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4518 4558
4519 btrfs_inherit_iflags(inode, dir); 4559 btrfs_inherit_iflags(inode, dir);
4520 4560
4521 if ((mode & S_IFREG)) { 4561 if (S_ISREG(mode)) {
4522 if (btrfs_test_opt(root, NODATASUM)) 4562 if (btrfs_test_opt(root, NODATASUM))
4523 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4563 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4524 if (btrfs_test_opt(root, NODATACOW) || 4564 if (btrfs_test_opt(root, NODATACOW) ||
@@ -4772,11 +4812,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4772 if (err) { 4812 if (err) {
4773 drop_inode = 1; 4813 drop_inode = 1;
4774 } else { 4814 } else {
4775 struct dentry *parent = dget_parent(dentry); 4815 struct dentry *parent = dentry->d_parent;
4776 err = btrfs_update_inode(trans, root, inode); 4816 err = btrfs_update_inode(trans, root, inode);
4777 BUG_ON(err); 4817 BUG_ON(err);
4778 btrfs_log_new_name(trans, inode, NULL, parent); 4818 btrfs_log_new_name(trans, inode, NULL, parent);
4779 dput(parent);
4780 } 4819 }
4781 4820
4782 nr = trans->blocks_used; 4821 nr = trans->blocks_used;
@@ -5794,7 +5833,7 @@ again:
5794 5833
5795 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5834 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5796 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5835 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5797 if (!ret) 5836 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5798 btrfs_update_inode(trans, root, inode); 5837 btrfs_update_inode(trans, root, inode);
5799 ret = 0; 5838 ret = 0;
5800out_unlock: 5839out_unlock:
@@ -6699,19 +6738,6 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
6699 return 0; 6738 return 0;
6700} 6739}
6701 6740
6702/* helper function for file defrag and space balancing. This
6703 * forces readahead on a given range of bytes in an inode
6704 */
6705unsigned long btrfs_force_ra(struct address_space *mapping,
6706 struct file_ra_state *ra, struct file *file,
6707 pgoff_t offset, pgoff_t last_index)
6708{
6709 pgoff_t req_size = last_index - offset + 1;
6710
6711 page_cache_sync_readahead(mapping, ra, file, offset, req_size);
6712 return offset + req_size;
6713}
6714
6715struct inode *btrfs_alloc_inode(struct super_block *sb) 6741struct inode *btrfs_alloc_inode(struct super_block *sb)
6716{ 6742{
6717 struct btrfs_inode *ei; 6743 struct btrfs_inode *ei;
@@ -6735,8 +6761,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6735 ei->index_cnt = (u64)-1; 6761 ei->index_cnt = (u64)-1;
6736 ei->last_unlink_trans = 0; 6762 ei->last_unlink_trans = 0;
6737 6763
6738 atomic_set(&ei->outstanding_extents, 0); 6764 spin_lock_init(&ei->lock);
6739 atomic_set(&ei->reserved_extents, 0); 6765 ei->outstanding_extents = 0;
6766 ei->reserved_extents = 0;
6740 6767
6741 ei->ordered_data_close = 0; 6768 ei->ordered_data_close = 0;
6742 ei->orphan_meta_reserved = 0; 6769 ei->orphan_meta_reserved = 0;
@@ -6774,8 +6801,8 @@ void btrfs_destroy_inode(struct inode *inode)
6774 6801
6775 WARN_ON(!list_empty(&inode->i_dentry)); 6802 WARN_ON(!list_empty(&inode->i_dentry));
6776 WARN_ON(inode->i_data.nrpages); 6803 WARN_ON(inode->i_data.nrpages);
6777 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); 6804 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6778 WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents)); 6805 WARN_ON(BTRFS_I(inode)->reserved_extents);
6779 6806
6780 /* 6807 /*
6781 * This can happen where we create an inode, but somebody else also 6808 * This can happen where we create an inode, but somebody else also
@@ -6830,7 +6857,7 @@ int btrfs_drop_inode(struct inode *inode)
6830 struct btrfs_root *root = BTRFS_I(inode)->root; 6857 struct btrfs_root *root = BTRFS_I(inode)->root;
6831 6858
6832 if (btrfs_root_refs(&root->root_item) == 0 && 6859 if (btrfs_root_refs(&root->root_item) == 0 &&
6833 !is_free_space_inode(root, inode)) 6860 !btrfs_is_free_space_inode(root, inode))
6834 return 1; 6861 return 1;
6835 else 6862 else
6836 return generic_drop_inode(inode); 6863 return generic_drop_inode(inode);
@@ -6900,7 +6927,7 @@ static int btrfs_getattr(struct vfsmount *mnt,
6900{ 6927{
6901 struct inode *inode = dentry->d_inode; 6928 struct inode *inode = dentry->d_inode;
6902 generic_fillattr(inode, stat); 6929 generic_fillattr(inode, stat);
6903 stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; 6930 stat->dev = BTRFS_I(inode)->root->anon_dev;
6904 stat->blksize = PAGE_CACHE_SIZE; 6931 stat->blksize = PAGE_CACHE_SIZE;
6905 stat->blocks = (inode_get_bytes(inode) + 6932 stat->blocks = (inode_get_bytes(inode) +
6906 BTRFS_I(inode)->delalloc_bytes) >> 9; 6933 BTRFS_I(inode)->delalloc_bytes) >> 9;
@@ -7068,9 +7095,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7068 BUG_ON(ret); 7095 BUG_ON(ret);
7069 7096
7070 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) { 7097 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
7071 struct dentry *parent = dget_parent(new_dentry); 7098 struct dentry *parent = new_dentry->d_parent;
7072 btrfs_log_new_name(trans, old_inode, old_dir, parent); 7099 btrfs_log_new_name(trans, old_inode, old_dir, parent);
7073 dput(parent);
7074 btrfs_end_log_trans(root); 7100 btrfs_end_log_trans(root);
7075 } 7101 }
7076out_fail: 7102out_fail:
@@ -7194,7 +7220,11 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
7194 goto out_unlock; 7220 goto out_unlock;
7195 7221
7196 path = btrfs_alloc_path(); 7222 path = btrfs_alloc_path();
7197 BUG_ON(!path); 7223 if (!path) {
7224 err = -ENOMEM;
7225 drop_inode = 1;
7226 goto out_unlock;
7227 }
7198 key.objectid = btrfs_ino(inode); 7228 key.objectid = btrfs_ino(inode);
7199 key.offset = 0; 7229 key.offset = 0;
7200 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 7230 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
@@ -7331,15 +7361,19 @@ static int btrfs_set_page_dirty(struct page *page)
7331 return __set_page_dirty_nobuffers(page); 7361 return __set_page_dirty_nobuffers(page);
7332} 7362}
7333 7363
7334static int btrfs_permission(struct inode *inode, int mask, unsigned int flags) 7364static int btrfs_permission(struct inode *inode, int mask)
7335{ 7365{
7336 struct btrfs_root *root = BTRFS_I(inode)->root; 7366 struct btrfs_root *root = BTRFS_I(inode)->root;
7367 umode_t mode = inode->i_mode;
7337 7368
7338 if (btrfs_root_readonly(root) && (mask & MAY_WRITE)) 7369 if (mask & MAY_WRITE &&
7339 return -EROFS; 7370 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
7340 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7371 if (btrfs_root_readonly(root))
7341 return -EACCES; 7372 return -EROFS;
7342 return generic_permission(inode, mask, flags, btrfs_check_acl); 7373 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
7374 return -EACCES;
7375 }
7376 return generic_permission(inode, mask);
7343} 7377}
7344 7378
7345static const struct inode_operations btrfs_dir_inode_operations = { 7379static const struct inode_operations btrfs_dir_inode_operations = {
@@ -7359,10 +7393,12 @@ static const struct inode_operations btrfs_dir_inode_operations = {
7359 .listxattr = btrfs_listxattr, 7393 .listxattr = btrfs_listxattr,
7360 .removexattr = btrfs_removexattr, 7394 .removexattr = btrfs_removexattr,
7361 .permission = btrfs_permission, 7395 .permission = btrfs_permission,
7396 .get_acl = btrfs_get_acl,
7362}; 7397};
7363static const struct inode_operations btrfs_dir_ro_inode_operations = { 7398static const struct inode_operations btrfs_dir_ro_inode_operations = {
7364 .lookup = btrfs_lookup, 7399 .lookup = btrfs_lookup,
7365 .permission = btrfs_permission, 7400 .permission = btrfs_permission,
7401 .get_acl = btrfs_get_acl,
7366}; 7402};
7367 7403
7368static const struct file_operations btrfs_dir_file_operations = { 7404static const struct file_operations btrfs_dir_file_operations = {
@@ -7431,6 +7467,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
7431 .removexattr = btrfs_removexattr, 7467 .removexattr = btrfs_removexattr,
7432 .permission = btrfs_permission, 7468 .permission = btrfs_permission,
7433 .fiemap = btrfs_fiemap, 7469 .fiemap = btrfs_fiemap,
7470 .get_acl = btrfs_get_acl,
7434}; 7471};
7435static const struct inode_operations btrfs_special_inode_operations = { 7472static const struct inode_operations btrfs_special_inode_operations = {
7436 .getattr = btrfs_getattr, 7473 .getattr = btrfs_getattr,
@@ -7440,6 +7477,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
7440 .getxattr = btrfs_getxattr, 7477 .getxattr = btrfs_getxattr,
7441 .listxattr = btrfs_listxattr, 7478 .listxattr = btrfs_listxattr,
7442 .removexattr = btrfs_removexattr, 7479 .removexattr = btrfs_removexattr,
7480 .get_acl = btrfs_get_acl,
7443}; 7481};
7444static const struct inode_operations btrfs_symlink_inode_operations = { 7482static const struct inode_operations btrfs_symlink_inode_operations = {
7445 .readlink = generic_readlink, 7483 .readlink = generic_readlink,
@@ -7451,8 +7489,10 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7451 .getxattr = btrfs_getxattr, 7489 .getxattr = btrfs_getxattr,
7452 .listxattr = btrfs_listxattr, 7490 .listxattr = btrfs_listxattr,
7453 .removexattr = btrfs_removexattr, 7491 .removexattr = btrfs_removexattr,
7492 .get_acl = btrfs_get_acl,
7454}; 7493};
7455 7494
7456const struct dentry_operations btrfs_dentry_operations = { 7495const struct dentry_operations btrfs_dentry_operations = {
7457 .d_delete = btrfs_dentry_delete, 7496 .d_delete = btrfs_dentry_delete,
7497 .d_release = btrfs_dentry_release,
7458}; 7498};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a3c4751e07d..dae5dfe41ba 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -323,7 +323,7 @@ static noinline int create_subvol(struct btrfs_root *root,
323 struct btrfs_inode_item *inode_item; 323 struct btrfs_inode_item *inode_item;
324 struct extent_buffer *leaf; 324 struct extent_buffer *leaf;
325 struct btrfs_root *new_root; 325 struct btrfs_root *new_root;
326 struct dentry *parent = dget_parent(dentry); 326 struct dentry *parent = dentry->d_parent;
327 struct inode *dir; 327 struct inode *dir;
328 int ret; 328 int ret;
329 int err; 329 int err;
@@ -332,10 +332,8 @@ static noinline int create_subvol(struct btrfs_root *root,
332 u64 index = 0; 332 u64 index = 0;
333 333
334 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); 334 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
335 if (ret) { 335 if (ret)
336 dput(parent);
337 return ret; 336 return ret;
338 }
339 337
340 dir = parent->d_inode; 338 dir = parent->d_inode;
341 339
@@ -346,10 +344,8 @@ static noinline int create_subvol(struct btrfs_root *root,
346 * 2 - dir items 344 * 2 - dir items
347 */ 345 */
348 trans = btrfs_start_transaction(root, 6); 346 trans = btrfs_start_transaction(root, 6);
349 if (IS_ERR(trans)) { 347 if (IS_ERR(trans))
350 dput(parent);
351 return PTR_ERR(trans); 348 return PTR_ERR(trans);
352 }
353 349
354 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 350 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
355 0, objectid, NULL, 0, 0, 0); 351 0, objectid, NULL, 0, 0, 0);
@@ -439,7 +435,6 @@ static noinline int create_subvol(struct btrfs_root *root,
439 435
440 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 436 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
441fail: 437fail:
442 dput(parent);
443 if (async_transid) { 438 if (async_transid) {
444 *async_transid = trans->transid; 439 *async_transid = trans->transid;
445 err = btrfs_commit_transaction_async(trans, root, 1); 440 err = btrfs_commit_transaction_async(trans, root, 1);
@@ -456,7 +451,6 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
456 bool readonly) 451 bool readonly)
457{ 452{
458 struct inode *inode; 453 struct inode *inode;
459 struct dentry *parent;
460 struct btrfs_pending_snapshot *pending_snapshot; 454 struct btrfs_pending_snapshot *pending_snapshot;
461 struct btrfs_trans_handle *trans; 455 struct btrfs_trans_handle *trans;
462 int ret; 456 int ret;
@@ -504,9 +498,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
504 if (ret) 498 if (ret)
505 goto fail; 499 goto fail;
506 500
507 parent = dget_parent(dentry); 501 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
508 inode = btrfs_lookup_dentry(parent->d_inode, dentry);
509 dput(parent);
510 if (IS_ERR(inode)) { 502 if (IS_ERR(inode)) {
511 ret = PTR_ERR(inode); 503 ret = PTR_ERR(inode);
512 goto fail; 504 goto fail;
@@ -867,8 +859,8 @@ again:
867 /* step one, lock all the pages */ 859 /* step one, lock all the pages */
868 for (i = 0; i < num_pages; i++) { 860 for (i = 0; i < num_pages; i++) {
869 struct page *page; 861 struct page *page;
870 page = grab_cache_page(inode->i_mapping, 862 page = find_or_create_page(inode->i_mapping,
871 start_index + i); 863 start_index + i, GFP_NOFS);
872 if (!page) 864 if (!page)
873 break; 865 break;
874 866
@@ -938,7 +930,9 @@ again:
938 GFP_NOFS); 930 GFP_NOFS);
939 931
940 if (i_done != num_pages) { 932 if (i_done != num_pages) {
941 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 933 spin_lock(&BTRFS_I(inode)->lock);
934 BTRFS_I(inode)->outstanding_extents++;
935 spin_unlock(&BTRFS_I(inode)->lock);
942 btrfs_delalloc_release_space(inode, 936 btrfs_delalloc_release_space(inode,
943 (num_pages - i_done) << PAGE_CACHE_SHIFT); 937 (num_pages - i_done) << PAGE_CACHE_SHIFT);
944 } 938 }
@@ -1053,7 +1047,16 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1053 if (!max_to_defrag) 1047 if (!max_to_defrag)
1054 max_to_defrag = last_index - 1; 1048 max_to_defrag = last_index - 1;
1055 1049
1056 while (i <= last_index && defrag_count < max_to_defrag) { 1050 /*
 1051 * make writeback start from i, so the defrag range can be
1052 * written sequentially.
1053 */
1054 if (i < inode->i_mapping->writeback_index)
1055 inode->i_mapping->writeback_index = i;
1056
1057 while (i <= last_index && defrag_count < max_to_defrag &&
1058 (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
1059 PAGE_CACHE_SHIFT)) {
1057 /* 1060 /*
1058 * make sure we stop running if someone unmounts 1061 * make sure we stop running if someone unmounts
1059 * the FS 1062 * the FS
@@ -1755,11 +1758,10 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1755 key.objectid = key.offset; 1758 key.objectid = key.offset;
1756 key.offset = (u64)-1; 1759 key.offset = (u64)-1;
1757 dirid = key.objectid; 1760 dirid = key.objectid;
1758
1759 } 1761 }
1760 if (ptr < name) 1762 if (ptr < name)
1761 goto out; 1763 goto out;
1762 memcpy(name, ptr, total_len); 1764 memmove(name, ptr, total_len);
1763 name[total_len]='\0'; 1765 name[total_len]='\0';
1764 ret = 0; 1766 ret = 0;
1765out: 1767out:
@@ -2184,6 +2186,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2184 if (!(src_file->f_mode & FMODE_READ)) 2186 if (!(src_file->f_mode & FMODE_READ))
2185 goto out_fput; 2187 goto out_fput;
2186 2188
2189 /* don't make the dst file partly checksummed */
2190 if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
2191 (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
2192 goto out_fput;
2193
2187 ret = -EISDIR; 2194 ret = -EISDIR;
2188 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) 2195 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
2189 goto out_fput; 2196 goto out_fput;
@@ -2227,6 +2234,16 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2227 !IS_ALIGNED(destoff, bs)) 2234 !IS_ALIGNED(destoff, bs))
2228 goto out_unlock; 2235 goto out_unlock;
2229 2236
2237 if (destoff > inode->i_size) {
2238 ret = btrfs_cont_expand(inode, inode->i_size, destoff);
2239 if (ret)
2240 goto out_unlock;
2241 }
2242
2243 /* truncate page cache pages from target inode range */
2244 truncate_inode_pages_range(&inode->i_data, destoff,
2245 PAGE_CACHE_ALIGN(destoff + len) - 1);
2246
2230 /* do any pending delalloc/csum calc on src, one way or 2247 /* do any pending delalloc/csum calc on src, one way or
2231 another, and lock file content */ 2248 another, and lock file content */
2232 while (1) { 2249 while (1) {
@@ -2320,7 +2337,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2320 else 2337 else
2321 new_key.offset = destoff; 2338 new_key.offset = destoff;
2322 2339
2323 trans = btrfs_start_transaction(root, 1); 2340 /*
2341 * 1 - adjusting old extent (we may have to split it)
2342 * 1 - add new extent
2343 * 1 - inode update
2344 */
2345 trans = btrfs_start_transaction(root, 3);
2324 if (IS_ERR(trans)) { 2346 if (IS_ERR(trans)) {
2325 ret = PTR_ERR(trans); 2347 ret = PTR_ERR(trans);
2326 goto out; 2348 goto out;
@@ -2328,14 +2350,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2328 2350
2329 if (type == BTRFS_FILE_EXTENT_REG || 2351 if (type == BTRFS_FILE_EXTENT_REG ||
2330 type == BTRFS_FILE_EXTENT_PREALLOC) { 2352 type == BTRFS_FILE_EXTENT_PREALLOC) {
2353 /*
2354 * a | --- range to clone ---| b
2355 * | ------------- extent ------------- |
2356 */
2357
 2358 /* subtract range b */
2359 if (key.offset + datal > off + len)
2360 datal = off + len - key.offset;
2361
 2362 /* subtract range a */
2331 if (off > key.offset) { 2363 if (off > key.offset) {
2332 datao += off - key.offset; 2364 datao += off - key.offset;
2333 datal -= off - key.offset; 2365 datal -= off - key.offset;
2334 } 2366 }
2335 2367
2336 if (key.offset + datal > off + len)
2337 datal = off + len - key.offset;
2338
2339 ret = btrfs_drop_extents(trans, inode, 2368 ret = btrfs_drop_extents(trans, inode,
2340 new_key.offset, 2369 new_key.offset,
2341 new_key.offset + datal, 2370 new_key.offset + datal,
@@ -2432,7 +2461,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
2432 if (endoff > inode->i_size) 2461 if (endoff > inode->i_size)
2433 btrfs_i_size_write(inode, endoff); 2462 btrfs_i_size_write(inode, endoff);
2434 2463
2435 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
2436 ret = btrfs_update_inode(trans, root, inode); 2464 ret = btrfs_update_inode(trans, root, inode);
2437 BUG_ON(ret); 2465 BUG_ON(ret);
2438 btrfs_end_transaction(trans, root); 2466 btrfs_end_transaction(trans, root);
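
The reordered clamping in the clone ioctl above trims the extent to the clone window (range b) before adjusting for a leading overlap (range a). The old order ran the head adjustment first, and the later tail clamp recomputed datal from the unadjusted key.offset, undoing part of the trim for an extent overhanging both ends of the window. A standalone arithmetic sketch with illustrative values (plain userspace C, not kernel code):

	#include <stdio.h>

	int main(void)
	{
		/* extent [key_offset, key_offset + datal); clone window [off, off + len) */
		unsigned long long key_offset = 0, datal = 16ULL << 20; /* 16 MiB extent */
		unsigned long long off = 4ULL << 20, len = 4ULL << 20;  /* clone [4 MiB, 8 MiB) */
		unsigned long long datao = 0;

		/* subtract range b: clip the tail hanging past the window */
		if (key_offset + datal > off + len)
			datal = off + len - key_offset;

		/* subtract range a: skip the head before the window */
		if (off > key_offset) {
			datao += off - key_offset;
			datal -= off - key_offset;
		}

		printf("datao=%llu datal=%llu\n", datao, datal); /* both 4194304 */
		return 0;
	}

With the clamps in the old order, the same inputs leave datal at 8 MiB, i.e. the clone would run 4 MiB past the requested window.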
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 66fa43dc3f0..d77b67c4b27 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -24,185 +24,197 @@
24#include "extent_io.h" 24#include "extent_io.h"
25#include "locking.h" 25#include "locking.h"
26 26
27static inline void spin_nested(struct extent_buffer *eb) 27void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
28{
29 spin_lock(&eb->lock);
30}
31 28
32/* 29/*
33 * Setting a lock to blocking will drop the spinlock and set the 30 * if we currently have a spinning reader or writer lock
34 * flag that forces other procs who want the lock to wait. After 31 * (indicated by the rw flag) this will bump the count
35 * this you can safely schedule with the lock held. 32 * of blocking holders and drop the spinlock.
36 */ 33 */
37void btrfs_set_lock_blocking(struct extent_buffer *eb) 34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
38{ 35{
39 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 36 if (rw == BTRFS_WRITE_LOCK) {
40 set_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 37 if (atomic_read(&eb->blocking_writers) == 0) {
41 spin_unlock(&eb->lock); 38 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
39 atomic_dec(&eb->spinning_writers);
40 btrfs_assert_tree_locked(eb);
41 atomic_inc(&eb->blocking_writers);
42 write_unlock(&eb->lock);
43 }
44 } else if (rw == BTRFS_READ_LOCK) {
45 btrfs_assert_tree_read_locked(eb);
46 atomic_inc(&eb->blocking_readers);
47 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
48 atomic_dec(&eb->spinning_readers);
49 read_unlock(&eb->lock);
42 } 50 }
43 /* exit with the spin lock released and the bit set */ 51 return;
44} 52}
45 53
46/* 54/*
47 * clearing the blocking flag will take the spinlock again. 55 * if we currently have a blocking lock, take the spinlock
48 * After this you can't safely schedule 56 * and drop our blocking count
49 */ 57 */
50void btrfs_clear_lock_blocking(struct extent_buffer *eb) 58void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
51{ 59{
52 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) { 60 if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
53 spin_nested(eb); 61 BUG_ON(atomic_read(&eb->blocking_writers) != 1);
54 clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags); 62 write_lock(&eb->lock);
55 smp_mb__after_clear_bit(); 63 WARN_ON(atomic_read(&eb->spinning_writers));
64 atomic_inc(&eb->spinning_writers);
65 if (atomic_dec_and_test(&eb->blocking_writers))
66 wake_up(&eb->write_lock_wq);
67 } else if (rw == BTRFS_READ_LOCK_BLOCKING) {
68 BUG_ON(atomic_read(&eb->blocking_readers) == 0);
69 read_lock(&eb->lock);
70 atomic_inc(&eb->spinning_readers);
71 if (atomic_dec_and_test(&eb->blocking_readers))
72 wake_up(&eb->read_lock_wq);
56 } 73 }
57 /* exit with the spin lock held */ 74 return;
58} 75}
59 76
60/* 77/*
61 * unfortunately, many of the places that currently set a lock to blocking 78 * take a spinning read lock. This will wait for any blocking
62 * don't end up blocking for very long, and often they don't block 79 * writers
63 * at all. For a dbench 50 run, if we don't spin on the blocking bit
64 * at all, the context switch rate can jump up to 400,000/sec or more.
65 *
66 * So, we're still stuck with this crummy spin on the blocking bit,
67 * at least until the most common causes of the short blocks
68 * can be dealt with.
69 */ 80 */
70static int btrfs_spin_on_block(struct extent_buffer *eb) 81void btrfs_tree_read_lock(struct extent_buffer *eb)
71{ 82{
72 int i; 83again:
73 84 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
74 for (i = 0; i < 512; i++) { 85 read_lock(&eb->lock);
75 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 86 if (atomic_read(&eb->blocking_writers)) {
76 return 1; 87 read_unlock(&eb->lock);
77 if (need_resched()) 88 wait_event(eb->write_lock_wq,
78 break; 89 atomic_read(&eb->blocking_writers) == 0);
79 cpu_relax(); 90 goto again;
80 } 91 }
81 return 0; 92 atomic_inc(&eb->read_locks);
93 atomic_inc(&eb->spinning_readers);
82} 94}
83 95
84/* 96/*
85 * This is somewhat different from trylock. It will take the 97 * returns 1 if we get the read lock and 0 if we don't
86 * spinlock but if it finds the lock is set to blocking, it will 98 * this won't wait for blocking writers
87 * return without the lock held.
88 *
89 * returns 1 if it was able to take the lock and zero otherwise
90 *
91 * After this call, scheduling is not safe without first calling
92 * btrfs_set_lock_blocking()
93 */ 99 */
94int btrfs_try_spin_lock(struct extent_buffer *eb) 100int btrfs_try_tree_read_lock(struct extent_buffer *eb)
95{ 101{
96 int i; 102 if (atomic_read(&eb->blocking_writers))
103 return 0;
97 104
98 if (btrfs_spin_on_block(eb)) { 105 read_lock(&eb->lock);
99 spin_nested(eb); 106 if (atomic_read(&eb->blocking_writers)) {
100 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 107 read_unlock(&eb->lock);
101 return 1; 108 return 0;
102 spin_unlock(&eb->lock);
103 } 109 }
104 /* spin for a bit on the BLOCKING flag */ 110 atomic_inc(&eb->read_locks);
105 for (i = 0; i < 2; i++) { 111 atomic_inc(&eb->spinning_readers);
106 cpu_relax(); 112 return 1;
107 if (!btrfs_spin_on_block(eb))
108 break;
109
110 spin_nested(eb);
111 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
112 return 1;
113 spin_unlock(&eb->lock);
114 }
115 return 0;
116} 113}
117 114
118/* 115/*
119 * the autoremove wake function will return 0 if it tried to wake up 116 * returns 1 if we get the read lock and 0 if we don't
120 * a process that was already awake, which means that process won't 117 * this won't wait for blocking writers or readers
121 * count as an exclusive wakeup. The waitq code will continue waking
122 * procs until it finds one that was actually sleeping.
123 *
124 * For btrfs, this isn't quite what we want. We want a single proc
125 * to be notified that the lock is ready for taking. If that proc
126 * already happen to be awake, great, it will loop around and try for
127 * the lock.
128 *
129 * So, btrfs_wake_function always returns 1, even when the proc that we
130 * tried to wake up was already awake.
131 */ 118 */
132static int btrfs_wake_function(wait_queue_t *wait, unsigned mode, 119int btrfs_try_tree_write_lock(struct extent_buffer *eb)
133 int sync, void *key)
134{ 120{
135 autoremove_wake_function(wait, mode, sync, key); 121 if (atomic_read(&eb->blocking_writers) ||
122 atomic_read(&eb->blocking_readers))
123 return 0;
124 write_lock(&eb->lock);
125 if (atomic_read(&eb->blocking_writers) ||
126 atomic_read(&eb->blocking_readers)) {
127 write_unlock(&eb->lock);
128 return 0;
129 }
130 atomic_inc(&eb->write_locks);
131 atomic_inc(&eb->spinning_writers);
136 return 1; 132 return 1;
137} 133}
138 134
139/* 135/*
140 * returns with the extent buffer spinlocked. 136 * drop a spinning read lock
141 * 137 */
142 * This will spin and/or wait as required to take the lock, and then 138void btrfs_tree_read_unlock(struct extent_buffer *eb)
143 * return with the spinlock held. 139{
144 * 140 btrfs_assert_tree_read_locked(eb);
145 * After this call, scheduling is not safe without first calling 141 WARN_ON(atomic_read(&eb->spinning_readers) == 0);
146 * btrfs_set_lock_blocking() 142 atomic_dec(&eb->spinning_readers);
143 atomic_dec(&eb->read_locks);
144 read_unlock(&eb->lock);
145}
146
147/*
148 * drop a blocking read lock
149 */
150void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
151{
152 btrfs_assert_tree_read_locked(eb);
153 WARN_ON(atomic_read(&eb->blocking_readers) == 0);
154 if (atomic_dec_and_test(&eb->blocking_readers))
155 wake_up(&eb->read_lock_wq);
156 atomic_dec(&eb->read_locks);
157}
158
159/*
160 * take a spinning write lock. This will wait for both
 161 * blocking readers and writers
147 */ 162 */
148int btrfs_tree_lock(struct extent_buffer *eb) 163int btrfs_tree_lock(struct extent_buffer *eb)
149{ 164{
150 DEFINE_WAIT(wait); 165again:
151 wait.func = btrfs_wake_function; 166 wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
152 167 wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
153 if (!btrfs_spin_on_block(eb)) 168 write_lock(&eb->lock);
154 goto sleep; 169 if (atomic_read(&eb->blocking_readers)) {
155 170 write_unlock(&eb->lock);
156 while(1) { 171 wait_event(eb->read_lock_wq,
157 spin_nested(eb); 172 atomic_read(&eb->blocking_readers) == 0);
158 173 goto again;
159 /* nobody is blocking, exit with the spinlock held */
160 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
161 return 0;
162
163 /*
164 * we have the spinlock, but the real owner is blocking.
165 * wait for them
166 */
167 spin_unlock(&eb->lock);
168
169 /*
170 * spin for a bit, and if the blocking flag goes away,
171 * loop around
172 */
173 cpu_relax();
174 if (btrfs_spin_on_block(eb))
175 continue;
176sleep:
177 prepare_to_wait_exclusive(&eb->lock_wq, &wait,
178 TASK_UNINTERRUPTIBLE);
179
180 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
181 schedule();
182
183 finish_wait(&eb->lock_wq, &wait);
184 } 174 }
175 if (atomic_read(&eb->blocking_writers)) {
176 write_unlock(&eb->lock);
177 wait_event(eb->write_lock_wq,
178 atomic_read(&eb->blocking_writers) == 0);
179 goto again;
180 }
181 WARN_ON(atomic_read(&eb->spinning_writers));
182 atomic_inc(&eb->spinning_writers);
183 atomic_inc(&eb->write_locks);
185 return 0; 184 return 0;
186} 185}
187 186
187/*
188 * drop a spinning or a blocking write lock.
189 */
188int btrfs_tree_unlock(struct extent_buffer *eb) 190int btrfs_tree_unlock(struct extent_buffer *eb)
189{ 191{
190 /* 192 int blockers = atomic_read(&eb->blocking_writers);
191 * if we were a blocking owner, we don't have the spinlock held 193
192 * just clear the bit and look for waiters 194 BUG_ON(blockers > 1);
193 */ 195
194 if (test_and_clear_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 196 btrfs_assert_tree_locked(eb);
195 smp_mb__after_clear_bit(); 197 atomic_dec(&eb->write_locks);
196 else 198
197 spin_unlock(&eb->lock); 199 if (blockers) {
198 200 WARN_ON(atomic_read(&eb->spinning_writers));
199 if (waitqueue_active(&eb->lock_wq)) 201 atomic_dec(&eb->blocking_writers);
200 wake_up(&eb->lock_wq); 202 smp_wmb();
203 wake_up(&eb->write_lock_wq);
204 } else {
205 WARN_ON(atomic_read(&eb->spinning_writers) != 1);
206 atomic_dec(&eb->spinning_writers);
207 write_unlock(&eb->lock);
208 }
201 return 0; 209 return 0;
202} 210}
203 211
204void btrfs_assert_tree_locked(struct extent_buffer *eb) 212void btrfs_assert_tree_locked(struct extent_buffer *eb)
205{ 213{
206 if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) 214 BUG_ON(!atomic_read(&eb->write_locks));
207 assert_spin_locked(&eb->lock); 215}
216
217void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
218{
219 BUG_ON(!atomic_read(&eb->read_locks));
208} 220}
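
For reference, a hedged sketch of how a caller drives the new reader/writer API when it needs to sleep while holding a tree block (kernel context assumed; the work in between is a placeholder):

	btrfs_tree_read_lock(eb);	/* spinning read lock; waits out blocking writers */
	/* ... short, non-sleeping inspection of eb ... */
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
	/* ... work that may schedule, e.g. reading a child block from disk ... */
	btrfs_clear_lock_blocking_rw(eb, BTRFS_READ_LOCK_BLOCKING);
	btrfs_tree_read_unlock(eb);	/* drop the (again spinning) read lock */

A holder that stays blocking to the end drops the lock with btrfs_tree_read_unlock_blocking() instead of clearing first; the write-side calls pair the same way with BTRFS_WRITE_LOCK and BTRFS_WRITE_LOCK_BLOCKING.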
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 5c33a560a2f..17247ddb81a 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -19,11 +19,43 @@
19#ifndef __BTRFS_LOCKING_ 19#ifndef __BTRFS_LOCKING_
20#define __BTRFS_LOCKING_ 20#define __BTRFS_LOCKING_
21 21
22#define BTRFS_WRITE_LOCK 1
23#define BTRFS_READ_LOCK 2
24#define BTRFS_WRITE_LOCK_BLOCKING 3
25#define BTRFS_READ_LOCK_BLOCKING 4
26
22int btrfs_tree_lock(struct extent_buffer *eb); 27int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 28int btrfs_tree_unlock(struct extent_buffer *eb);
24int btrfs_try_spin_lock(struct extent_buffer *eb); 29int btrfs_try_spin_lock(struct extent_buffer *eb);
25 30
26void btrfs_set_lock_blocking(struct extent_buffer *eb); 31void btrfs_tree_read_lock(struct extent_buffer *eb);
27void btrfs_clear_lock_blocking(struct extent_buffer *eb); 32void btrfs_tree_read_unlock(struct extent_buffer *eb);
33void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb);
34void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw);
35void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw);
28void btrfs_assert_tree_locked(struct extent_buffer *eb); 36void btrfs_assert_tree_locked(struct extent_buffer *eb);
37int btrfs_try_tree_read_lock(struct extent_buffer *eb);
38int btrfs_try_tree_write_lock(struct extent_buffer *eb);
39
40static inline void btrfs_tree_unlock_rw(struct extent_buffer *eb, int rw)
41{
42 if (rw == BTRFS_WRITE_LOCK || rw == BTRFS_WRITE_LOCK_BLOCKING)
43 btrfs_tree_unlock(eb);
44 else if (rw == BTRFS_READ_LOCK_BLOCKING)
45 btrfs_tree_read_unlock_blocking(eb);
46 else if (rw == BTRFS_READ_LOCK)
47 btrfs_tree_read_unlock(eb);
48 else
49 BUG();
50}
51
52static inline void btrfs_set_lock_blocking(struct extent_buffer *eb)
53{
54 btrfs_set_lock_blocking_rw(eb, BTRFS_WRITE_LOCK);
55}
56
57static inline void btrfs_clear_lock_blocking(struct extent_buffer *eb)
58{
59 btrfs_clear_lock_blocking_rw(eb, BTRFS_WRITE_LOCK_BLOCKING);
60}
29#endif 61#endif
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
deleted file mode 100644
index 82d569cb626..00000000000
--- a/fs/btrfs/ref-cache.c
+++ /dev/null
@@ -1,68 +0,0 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/slab.h>
21#include <linux/sort.h>
22#include "ctree.h"
23#include "ref-cache.h"
24#include "transaction.h"
25
26static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
27 struct rb_node *node)
28{
29 struct rb_node **p = &root->rb_node;
30 struct rb_node *parent = NULL;
31 struct btrfs_leaf_ref *entry;
32
33 while (*p) {
34 parent = *p;
35 entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node);
36
37 if (bytenr < entry->bytenr)
38 p = &(*p)->rb_left;
39 else if (bytenr > entry->bytenr)
40 p = &(*p)->rb_right;
41 else
42 return parent;
43 }
44
45 entry = rb_entry(node, struct btrfs_leaf_ref, rb_node);
46 rb_link_node(node, parent, p);
47 rb_insert_color(node, root);
48 return NULL;
49}
50
51static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
52{
53 struct rb_node *n = root->rb_node;
54 struct btrfs_leaf_ref *entry;
55
56 while (n) {
57 entry = rb_entry(n, struct btrfs_leaf_ref, rb_node);
58 WARN_ON(!entry->in_tree);
59
60 if (bytenr < entry->bytenr)
61 n = n->rb_left;
62 else if (bytenr > entry->bytenr)
63 n = n->rb_right;
64 else
65 return n;
66 }
67 return NULL;
68}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
deleted file mode 100644
index 24f7001f638..00000000000
--- a/fs/btrfs/ref-cache.h
+++ /dev/null
@@ -1,52 +0,0 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18#ifndef __REFCACHE__
19#define __REFCACHE__
20
21struct btrfs_extent_info {
22 /* bytenr and num_bytes find the extent in the extent allocation tree */
23 u64 bytenr;
24 u64 num_bytes;
25
26 /* objectid and offset find the back reference for the file */
27 u64 objectid;
28 u64 offset;
29};
30
31struct btrfs_leaf_ref {
32 struct rb_node rb_node;
33 struct btrfs_leaf_ref_tree *tree;
34 int in_tree;
35 atomic_t usage;
36
37 u64 root_gen;
38 u64 bytenr;
39 u64 owner;
40 u64 generation;
41 int nritems;
42
43 struct list_head list;
44 struct btrfs_extent_info extents[];
45};
46
47static inline size_t btrfs_leaf_ref_size(int nr_extents)
48{
49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents;
51}
52#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 5e0a3dc79a4..59bb1764273 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2955,7 +2955,8 @@ static int relocate_file_extent_cluster(struct inode *inode,
2955 page_cache_sync_readahead(inode->i_mapping, 2955 page_cache_sync_readahead(inode->i_mapping,
2956 ra, NULL, index, 2956 ra, NULL, index,
2957 last_index + 1 - index); 2957 last_index + 1 - index);
2958 page = grab_cache_page(inode->i_mapping, index); 2958 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS);
2959 if (!page) { 2960 if (!page) {
2960 btrfs_delalloc_release_metadata(inode, 2961 btrfs_delalloc_release_metadata(inode,
2961 PAGE_CACHE_SIZE); 2962 PAGE_CACHE_SIZE);
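
The switch from grab_cache_page() to find_or_create_page() here (and in the defrag hunk earlier) only changes the gfp mask: grab_cache_page() is essentially a wrapper that supplies the mapping's default mask, which can permit filesystem reclaim. Roughly, from include/linux/pagemap.h of this era:

	static inline struct page *grab_cache_page(struct address_space *mapping,
						   pgoff_t index)
	{
		return find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
	}

Passing GFP_NOFS explicitly keeps the page allocation from recursing back into the filesystem while btrfs is holding locks.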
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ebe45443de0..f4099904565 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -71,13 +71,12 @@ out:
71 return ret; 71 return ret;
72} 72}
73 73
74int btrfs_set_root_node(struct btrfs_root_item *item, 74void btrfs_set_root_node(struct btrfs_root_item *item,
75 struct extent_buffer *node) 75 struct extent_buffer *node)
76{ 76{
77 btrfs_set_root_bytenr(item, node->start); 77 btrfs_set_root_bytenr(item, node->start);
78 btrfs_set_root_level(item, btrfs_header_level(node)); 78 btrfs_set_root_level(item, btrfs_header_level(node));
79 btrfs_set_root_generation(item, btrfs_header_generation(node)); 79 btrfs_set_root_generation(item, btrfs_header_generation(node));
80 return 0;
81} 80}
82 81
83/* 82/*
diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
index c0f7ecaf1e7..bc1f6ad1844 100644
--- a/fs/btrfs/struct-funcs.c
+++ b/fs/btrfs/struct-funcs.c
@@ -50,36 +50,22 @@ u##bits btrfs_##name(struct extent_buffer *eb, \
50 unsigned long part_offset = (unsigned long)s; \ 50 unsigned long part_offset = (unsigned long)s; \
51 unsigned long offset = part_offset + offsetof(type, member); \ 51 unsigned long offset = part_offset + offsetof(type, member); \
52 type *p; \ 52 type *p; \
53 /* ugly, but we want the fast path here */ \ 53 int err; \
54 if (eb->map_token && offset >= eb->map_start && \ 54 char *kaddr; \
55 offset + sizeof(((type *)0)->member) <= eb->map_start + \ 55 unsigned long map_start; \
56 eb->map_len) { \ 56 unsigned long map_len; \
57 p = (type *)(eb->kaddr + part_offset - eb->map_start); \ 57 u##bits res; \
58 return le##bits##_to_cpu(p->member); \ 58 err = map_private_extent_buffer(eb, offset, \
59 } \ 59 sizeof(((type *)0)->member), \
60 { \ 60 &kaddr, &map_start, &map_len); \
61 int err; \ 61 if (err) { \
62 char *map_token; \ 62 __le##bits leres; \
63 char *kaddr; \ 63 read_eb_member(eb, s, type, member, &leres); \
64 int unmap_on_exit = (eb->map_token == NULL); \ 64 return le##bits##_to_cpu(leres); \
65 unsigned long map_start; \ 65 } \
66 unsigned long map_len; \ 66 p = (type *)(kaddr + part_offset - map_start); \
67 u##bits res; \ 67 res = le##bits##_to_cpu(p->member); \
68 err = map_extent_buffer(eb, offset, \ 68 return res; \
69 sizeof(((type *)0)->member), \
70 &map_token, &kaddr, \
71 &map_start, &map_len, KM_USER1); \
72 if (err) { \
73 __le##bits leres; \
74 read_eb_member(eb, s, type, member, &leres); \
75 return le##bits##_to_cpu(leres); \
76 } \
77 p = (type *)(kaddr + part_offset - map_start); \
78 res = le##bits##_to_cpu(p->member); \
79 if (unmap_on_exit) \
80 unmap_extent_buffer(eb, map_token, KM_USER1); \
81 return res; \
82 } \
83} \ 69} \
84void btrfs_set_##name(struct extent_buffer *eb, \ 70void btrfs_set_##name(struct extent_buffer *eb, \
85 type *s, u##bits val) \ 71 type *s, u##bits val) \
@@ -87,36 +73,21 @@ void btrfs_set_##name(struct extent_buffer *eb, \
87 unsigned long part_offset = (unsigned long)s; \ 73 unsigned long part_offset = (unsigned long)s; \
88 unsigned long offset = part_offset + offsetof(type, member); \ 74 unsigned long offset = part_offset + offsetof(type, member); \
89 type *p; \ 75 type *p; \
90 /* ugly, but we want the fast path here */ \ 76 int err; \
91 if (eb->map_token && offset >= eb->map_start && \ 77 char *kaddr; \
92 offset + sizeof(((type *)0)->member) <= eb->map_start + \ 78 unsigned long map_start; \
93 eb->map_len) { \ 79 unsigned long map_len; \
94 p = (type *)(eb->kaddr + part_offset - eb->map_start); \ 80 err = map_private_extent_buffer(eb, offset, \
95 p->member = cpu_to_le##bits(val); \ 81 sizeof(((type *)0)->member), \
96 return; \ 82 &kaddr, &map_start, &map_len); \
97 } \ 83 if (err) { \
98 { \ 84 __le##bits val2; \
99 int err; \ 85 val2 = cpu_to_le##bits(val); \
100 char *map_token; \ 86 write_eb_member(eb, s, type, member, &val2); \
101 char *kaddr; \ 87 return; \
102 int unmap_on_exit = (eb->map_token == NULL); \ 88 } \
103 unsigned long map_start; \ 89 p = (type *)(kaddr + part_offset - map_start); \
104 unsigned long map_len; \ 90 p->member = cpu_to_le##bits(val); \
105 err = map_extent_buffer(eb, offset, \
106 sizeof(((type *)0)->member), \
107 &map_token, &kaddr, \
108 &map_start, &map_len, KM_USER1); \
109 if (err) { \
110 __le##bits val2; \
111 val2 = cpu_to_le##bits(val); \
112 write_eb_member(eb, s, type, member, &val2); \
113 return; \
114 } \
115 p = (type *)(kaddr + part_offset - map_start); \
116 p->member = cpu_to_le##bits(val); \
117 if (unmap_on_exit) \
118 unmap_extent_buffer(eb, map_token, KM_USER1); \
119 } \
120} 91}
121 92
122#include "ctree.h" 93#include "ctree.h"
@@ -125,15 +96,6 @@ void btrfs_node_key(struct extent_buffer *eb,
125 struct btrfs_disk_key *disk_key, int nr) 96 struct btrfs_disk_key *disk_key, int nr)
126{ 97{
127 unsigned long ptr = btrfs_node_key_ptr_offset(nr); 98 unsigned long ptr = btrfs_node_key_ptr_offset(nr);
128 if (eb->map_token && ptr >= eb->map_start &&
129 ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) {
130 memcpy(disk_key, eb->kaddr + ptr - eb->map_start,
131 sizeof(*disk_key));
132 return;
133 } else if (eb->map_token) {
134 unmap_extent_buffer(eb, eb->map_token, KM_USER1);
135 eb->map_token = NULL;
136 }
137 read_eb_member(eb, (struct btrfs_key_ptr *)ptr, 99 read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
138 struct btrfs_key_ptr, key, disk_key); 100 struct btrfs_key_ptr, key, disk_key);
139} 101}
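
With the map_token fast path gone, every generated accessor funnels through map_private_extent_buffer(). For a hypothetical u64 member, the getter the macro now emits looks roughly like this (illustrative expansion only; struct btrfs_foo and its bar member are invented for the example):

	u64 btrfs_foo_bar(struct extent_buffer *eb, struct btrfs_foo *s)
	{
		unsigned long part_offset = (unsigned long)s;
		unsigned long offset = part_offset + offsetof(struct btrfs_foo, bar);
		unsigned long map_start, map_len;
		struct btrfs_foo *p;
		char *kaddr;
		int err;

		err = map_private_extent_buffer(eb, offset,
						sizeof(((struct btrfs_foo *)0)->bar),
						&kaddr, &map_start, &map_len);
		if (err) {	/* member straddles a page: fall back to a copy */
			__le64 leres;

			read_eb_member(eb, s, struct btrfs_foo, bar, &leres);
			return le64_to_cpu(leres);
		}
		p = (struct btrfs_foo *)(kaddr + part_offset - map_start);
		return le64_to_cpu(p->bar);
	}

The setter is symmetric, using write_eb_member() and cpu_to_le64() on the fallback path.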
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 51dcec86757..e24b7964a15 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -216,17 +216,11 @@ static void wait_current_trans(struct btrfs_root *root)
216 spin_lock(&root->fs_info->trans_lock); 216 spin_lock(&root->fs_info->trans_lock);
217 cur_trans = root->fs_info->running_transaction; 217 cur_trans = root->fs_info->running_transaction;
218 if (cur_trans && cur_trans->blocked) { 218 if (cur_trans && cur_trans->blocked) {
219 DEFINE_WAIT(wait);
220 atomic_inc(&cur_trans->use_count); 219 atomic_inc(&cur_trans->use_count);
221 spin_unlock(&root->fs_info->trans_lock); 220 spin_unlock(&root->fs_info->trans_lock);
222 while (1) { 221
223 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 222 wait_event(root->fs_info->transaction_wait,
224 TASK_UNINTERRUPTIBLE); 223 !cur_trans->blocked);
225 if (!cur_trans->blocked)
226 break;
227 schedule();
228 }
229 finish_wait(&root->fs_info->transaction_wait, &wait);
230 put_transaction(cur_trans); 224 put_transaction(cur_trans);
231 } else { 225 } else {
232 spin_unlock(&root->fs_info->trans_lock); 226 spin_unlock(&root->fs_info->trans_lock);
@@ -260,7 +254,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
260{ 254{
261 struct btrfs_trans_handle *h; 255 struct btrfs_trans_handle *h;
262 struct btrfs_transaction *cur_trans; 256 struct btrfs_transaction *cur_trans;
263 int retries = 0; 257 u64 num_bytes = 0;
264 int ret; 258 int ret;
265 259
266 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) 260 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
@@ -274,6 +268,19 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
274 h->block_rsv = NULL; 268 h->block_rsv = NULL;
275 goto got_it; 269 goto got_it;
276 } 270 }
271
272 /*
273 * Do the reservation before we join the transaction so we can do all
274 * the appropriate flushing if need be.
275 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root,
279 &root->fs_info->trans_block_rsv,
280 num_bytes);
281 if (ret)
282 return ERR_PTR(ret);
283 }
277again: 284again:
278 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 285 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
279 if (!h) 286 if (!h)
@@ -310,24 +317,9 @@ again:
310 goto again; 317 goto again;
311 } 318 }
312 319
313 if (num_items > 0) { 320 if (num_bytes) {
314 ret = btrfs_trans_reserve_metadata(h, root, num_items); 321 h->block_rsv = &root->fs_info->trans_block_rsv;
315 if (ret == -EAGAIN && !retries) { 322 h->bytes_reserved = num_bytes;
316 retries++;
317 btrfs_commit_transaction(h, root);
318 goto again;
319 } else if (ret == -EAGAIN) {
320 /*
321 * We have already retried and got EAGAIN, so really we
322 * don't have space, so set ret to -ENOSPC.
323 */
324 ret = -ENOSPC;
325 }
326
327 if (ret < 0) {
328 btrfs_end_transaction(h, root);
329 return ERR_PTR(ret);
330 }
331 } 323 }
332 324
333got_it: 325got_it:
@@ -359,19 +351,10 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
359} 351}
360 352
361/* wait for a transaction commit to be fully complete */ 353/* wait for a transaction commit to be fully complete */
362static noinline int wait_for_commit(struct btrfs_root *root, 354static noinline void wait_for_commit(struct btrfs_root *root,
363 struct btrfs_transaction *commit) 355 struct btrfs_transaction *commit)
364{ 356{
365 DEFINE_WAIT(wait); 357 wait_event(commit->commit_wait, commit->commit_done);
366 while (!commit->commit_done) {
367 prepare_to_wait(&commit->commit_wait, &wait,
368 TASK_UNINTERRUPTIBLE);
369 if (commit->commit_done)
370 break;
371 schedule();
372 }
373 finish_wait(&commit->commit_wait, &wait);
374 return 0;
375} 358}
376 359
377int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) 360int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
@@ -499,10 +482,17 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
499 } 482 }
500 483
501 if (lock && cur_trans->blocked && !cur_trans->in_commit) { 484 if (lock && cur_trans->blocked && !cur_trans->in_commit) {
502 if (throttle) 485 if (throttle) {
486 /*
487 * We may race with somebody else here so end up having
488 * to call end_transaction on ourselves again, so inc
489 * our use_count.
490 */
491 trans->use_count++;
503 return btrfs_commit_transaction(trans, root); 492 return btrfs_commit_transaction(trans, root);
504 else 493 } else {
505 wake_up_process(info->transaction_kthread); 494 wake_up_process(info->transaction_kthread);
495 }
506 } 496 }
507 497
508 WARN_ON(cur_trans != info->running_transaction); 498 WARN_ON(cur_trans != info->running_transaction);
@@ -894,6 +884,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
894 struct btrfs_root *tree_root = fs_info->tree_root; 884 struct btrfs_root *tree_root = fs_info->tree_root;
895 struct btrfs_root *root = pending->root; 885 struct btrfs_root *root = pending->root;
896 struct btrfs_root *parent_root; 886 struct btrfs_root *parent_root;
887 struct btrfs_block_rsv *rsv;
897 struct inode *parent_inode; 888 struct inode *parent_inode;
898 struct dentry *parent; 889 struct dentry *parent;
899 struct dentry *dentry; 890 struct dentry *dentry;
@@ -905,6 +896,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
905 u64 objectid; 896 u64 objectid;
906 u64 root_flags; 897 u64 root_flags;
907 898
899 rsv = trans->block_rsv;
900
908 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 901 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
909 if (!new_root_item) { 902 if (!new_root_item) {
910 pending->error = -ENOMEM; 903 pending->error = -ENOMEM;
@@ -1012,6 +1005,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1012 btrfs_orphan_post_snapshot(trans, pending); 1005 btrfs_orphan_post_snapshot(trans, pending);
1013fail: 1006fail:
1014 kfree(new_root_item); 1007 kfree(new_root_item);
1008 trans->block_rsv = rsv;
1015 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); 1009 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1016 return 0; 1010 return 0;
1017} 1011}
@@ -1080,22 +1074,7 @@ int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1080static void wait_current_trans_commit_start(struct btrfs_root *root, 1074static void wait_current_trans_commit_start(struct btrfs_root *root,
1081 struct btrfs_transaction *trans) 1075 struct btrfs_transaction *trans)
1082{ 1076{
1083 DEFINE_WAIT(wait); 1077 wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
1084
1085 if (trans->in_commit)
1086 return;
1087
1088 while (1) {
1089 prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
1090 TASK_UNINTERRUPTIBLE);
1091 if (trans->in_commit) {
1092 finish_wait(&root->fs_info->transaction_blocked_wait,
1093 &wait);
1094 break;
1095 }
1096 schedule();
1097 finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
1098 }
1099} 1078}
1100 1079
1101/* 1080/*
@@ -1105,24 +1084,8 @@ static void wait_current_trans_commit_start(struct btrfs_root *root,
1105static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, 1084static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1106 struct btrfs_transaction *trans) 1085 struct btrfs_transaction *trans)
1107{ 1086{
1108 DEFINE_WAIT(wait); 1087 wait_event(root->fs_info->transaction_wait,
1109 1088 trans->commit_done || (trans->in_commit && !trans->blocked));
1110 if (trans->commit_done || (trans->in_commit && !trans->blocked))
1111 return;
1112
1113 while (1) {
1114 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
1115 TASK_UNINTERRUPTIBLE);
1116 if (trans->commit_done ||
1117 (trans->in_commit && !trans->blocked)) {
1118 finish_wait(&root->fs_info->transaction_wait,
1119 &wait);
1120 break;
1121 }
1122 schedule();
1123 finish_wait(&root->fs_info->transaction_wait,
1124 &wait);
1125 }
1126} 1089}
1127 1090
1128/* 1091/*
@@ -1229,8 +1192,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1229 atomic_inc(&cur_trans->use_count); 1192 atomic_inc(&cur_trans->use_count);
1230 btrfs_end_transaction(trans, root); 1193 btrfs_end_transaction(trans, root);
1231 1194
1232 ret = wait_for_commit(root, cur_trans); 1195 wait_for_commit(root, cur_trans);
1233 BUG_ON(ret);
1234 1196
1235 put_transaction(cur_trans); 1197 put_transaction(cur_trans);
1236 1198
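
Every transaction.c conversion above has the same shape: an open-coded prepare_to_wait()/schedule()/finish_wait() loop collapses into one wait_event() call. The macro expands to essentially the loop being deleted (simplified sketch; the real definition in <linux/wait.h> adds a fast path and differs in detail):

	/* roughly what wait_event(wq, cond) does */
	#define sketch_wait_event(wq, cond)					\
	do {									\
		DEFINE_WAIT(__wait);						\
		while (!(cond)) {						\
			prepare_to_wait(&(wq), &__wait, TASK_UNINTERRUPTIBLE);	\
			if (cond)						\
				break;						\
			schedule();						\
		}								\
		finish_wait(&(wq), &__wait);					\
	} while (0)

The lost-wakeup-safe ordering (queue first, re-check the condition, then sleep) is preserved while each call site drops a dozen lines.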
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4ce8a9f41d1..786639fca06 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -799,14 +799,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
799 struct extent_buffer *eb, int slot, 799 struct extent_buffer *eb, int slot,
800 struct btrfs_key *key) 800 struct btrfs_key *key)
801{ 801{
802 struct inode *dir;
803 int ret;
804 struct btrfs_inode_ref *ref; 802 struct btrfs_inode_ref *ref;
803 struct btrfs_dir_item *di;
804 struct inode *dir;
805 struct inode *inode; 805 struct inode *inode;
806 char *name;
807 int namelen;
808 unsigned long ref_ptr; 806 unsigned long ref_ptr;
809 unsigned long ref_end; 807 unsigned long ref_end;
808 char *name;
809 int namelen;
810 int ret;
810 int search_done = 0; 811 int search_done = 0;
811 812
812 /* 813 /*
@@ -909,6 +910,25 @@ again:
909 } 910 }
910 btrfs_release_path(path); 911 btrfs_release_path(path);
911 912
913 /* look for a conflicting sequence number */
914 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
915 btrfs_inode_ref_index(eb, ref),
916 name, namelen, 0);
917 if (di && !IS_ERR(di)) {
918 ret = drop_one_dir_item(trans, root, path, dir, di);
919 BUG_ON(ret);
920 }
921 btrfs_release_path(path);
922
 923 /* look for a conflicting name */
924 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
925 name, namelen, 0);
926 if (di && !IS_ERR(di)) {
927 ret = drop_one_dir_item(trans, root, path, dir, di);
928 BUG_ON(ret);
929 }
930 btrfs_release_path(path);
931
912insert: 932insert:
913 /* insert our name */ 933 /* insert our name */
914 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 934 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
@@ -1617,7 +1637,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1617 return 0; 1637 return 0;
1618 1638
1619 path = btrfs_alloc_path(); 1639 path = btrfs_alloc_path();
1620 BUG_ON(!path); 1640 if (!path)
1641 return -ENOMEM;
1621 1642
1622 nritems = btrfs_header_nritems(eb); 1643 nritems = btrfs_header_nritems(eb);
1623 for (i = 0; i < nritems; i++) { 1644 for (i = 0; i < nritems; i++) {
@@ -1723,15 +1744,17 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1723 return -ENOMEM; 1744 return -ENOMEM;
1724 1745
1725 if (*level == 1) { 1746 if (*level == 1) {
1726 wc->process_func(root, next, wc, ptr_gen); 1747 ret = wc->process_func(root, next, wc, ptr_gen);
1748 if (ret)
1749 return ret;
1727 1750
1728 path->slots[*level]++; 1751 path->slots[*level]++;
1729 if (wc->free) { 1752 if (wc->free) {
1730 btrfs_read_buffer(next, ptr_gen); 1753 btrfs_read_buffer(next, ptr_gen);
1731 1754
1732 btrfs_tree_lock(next); 1755 btrfs_tree_lock(next);
1733 clean_tree_block(trans, root, next);
1734 btrfs_set_lock_blocking(next); 1756 btrfs_set_lock_blocking(next);
1757 clean_tree_block(trans, root, next);
1735 btrfs_wait_tree_block_writeback(next); 1758 btrfs_wait_tree_block_writeback(next);
1736 btrfs_tree_unlock(next); 1759 btrfs_tree_unlock(next);
1737 1760
@@ -1788,16 +1811,19 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1788 parent = path->nodes[*level + 1]; 1811 parent = path->nodes[*level + 1];
1789 1812
1790 root_owner = btrfs_header_owner(parent); 1813 root_owner = btrfs_header_owner(parent);
1791 wc->process_func(root, path->nodes[*level], wc, 1814 ret = wc->process_func(root, path->nodes[*level], wc,
1792 btrfs_header_generation(path->nodes[*level])); 1815 btrfs_header_generation(path->nodes[*level]));
1816 if (ret)
1817 return ret;
1818
1793 if (wc->free) { 1819 if (wc->free) {
1794 struct extent_buffer *next; 1820 struct extent_buffer *next;
1795 1821
1796 next = path->nodes[*level]; 1822 next = path->nodes[*level];
1797 1823
1798 btrfs_tree_lock(next); 1824 btrfs_tree_lock(next);
1799 clean_tree_block(trans, root, next);
1800 btrfs_set_lock_blocking(next); 1825 btrfs_set_lock_blocking(next);
1826 clean_tree_block(trans, root, next);
1801 btrfs_wait_tree_block_writeback(next); 1827 btrfs_wait_tree_block_writeback(next);
1802 btrfs_tree_unlock(next); 1828 btrfs_tree_unlock(next);
1803 1829
@@ -1864,8 +1890,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1864 next = path->nodes[orig_level]; 1890 next = path->nodes[orig_level];
1865 1891
1866 btrfs_tree_lock(next); 1892 btrfs_tree_lock(next);
1867 clean_tree_block(trans, log, next);
1868 btrfs_set_lock_blocking(next); 1893 btrfs_set_lock_blocking(next);
1894 clean_tree_block(trans, log, next);
1869 btrfs_wait_tree_block_writeback(next); 1895 btrfs_wait_tree_block_writeback(next);
1870 btrfs_tree_unlock(next); 1896 btrfs_tree_unlock(next);
1871 1897
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 19450bc5363..f2a4cc79da6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -142,6 +142,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
142 unsigned long limit; 142 unsigned long limit;
143 unsigned long last_waited = 0; 143 unsigned long last_waited = 0;
144 int force_reg = 0; 144 int force_reg = 0;
145 int sync_pending = 0;
145 struct blk_plug plug; 146 struct blk_plug plug;
146 147
147 /* 148 /*
@@ -229,6 +230,22 @@ loop_lock:
229 230
230 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 231 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
231 232
233 /*
234 * if we're doing the sync list, record that our
235 * plug has some sync requests on it
236 *
237 * If we're doing the regular list and there are
238 * sync requests sitting around, unplug before
239 * we add more
240 */
241 if (pending_bios == &device->pending_sync_bios) {
242 sync_pending = 1;
243 } else if (sync_pending) {
244 blk_finish_plug(&plug);
245 blk_start_plug(&plug);
246 sync_pending = 0;
247 }
248
232 submit_bio(cur->bi_rw, cur); 249 submit_bio(cur->bi_rw, cur);
233 num_run++; 250 num_run++;
234 batch_run++; 251 batch_run++;
@@ -500,6 +517,9 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
500 fs_devices->rw_devices--; 517 fs_devices->rw_devices--;
501 } 518 }
502 519
520 if (device->can_discard)
521 fs_devices->num_can_discard--;
522
503 new_device = kmalloc(sizeof(*new_device), GFP_NOFS); 523 new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
504 BUG_ON(!new_device); 524 BUG_ON(!new_device);
505 memcpy(new_device, device, sizeof(*new_device)); 525 memcpy(new_device, device, sizeof(*new_device));
@@ -508,6 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
508 new_device->bdev = NULL; 528 new_device->bdev = NULL;
509 new_device->writeable = 0; 529 new_device->writeable = 0;
510 new_device->in_fs_metadata = 0; 530 new_device->in_fs_metadata = 0;
531 new_device->can_discard = 0;
511 list_replace_rcu(&device->dev_list, &new_device->dev_list); 532 list_replace_rcu(&device->dev_list, &new_device->dev_list);
512 533
513 call_rcu(&device->rcu, free_device); 534 call_rcu(&device->rcu, free_device);
@@ -547,6 +568,7 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
547static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 568static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
548 fmode_t flags, void *holder) 569 fmode_t flags, void *holder)
549{ 570{
571 struct request_queue *q;
550 struct block_device *bdev; 572 struct block_device *bdev;
551 struct list_head *head = &fs_devices->devices; 573 struct list_head *head = &fs_devices->devices;
552 struct btrfs_device *device; 574 struct btrfs_device *device;
@@ -603,6 +625,12 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
603 seeding = 0; 625 seeding = 0;
604 } 626 }
605 627
628 q = bdev_get_queue(bdev);
629 if (blk_queue_discard(q)) {
630 device->can_discard = 1;
631 fs_devices->num_can_discard++;
632 }
633
606 device->bdev = bdev; 634 device->bdev = bdev;
607 device->in_fs_metadata = 0; 635 device->in_fs_metadata = 0;
608 device->mode = flags; 636 device->mode = flags;
@@ -835,6 +863,7 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
835 863
836 max_hole_start = search_start; 864 max_hole_start = search_start;
837 max_hole_size = 0; 865 max_hole_size = 0;
866 hole_size = 0;
838 867
839 if (search_start >= search_end) { 868 if (search_start >= search_end) {
840 ret = -ENOSPC; 869 ret = -ENOSPC;
@@ -917,7 +946,14 @@ next:
917 cond_resched(); 946 cond_resched();
918 } 947 }
919 948
920 hole_size = search_end- search_start; 949 /*
950 * At this point, search_start should be the end of
951 * allocated dev extents, and when shrinking the device,
952 * search_end may be smaller than search_start.
953 */
954 if (search_end > search_start)
955 hole_size = search_end - search_start;
956
921 if (hole_size > max_hole_size) { 957 if (hole_size > max_hole_size) {
922 max_hole_start = search_start; 958 max_hole_start = search_start;
923 max_hole_size = hole_size; 959 max_hole_size = hole_size;
@@ -1037,7 +1073,8 @@ static noinline int find_next_chunk(struct btrfs_root *root,
1037 struct btrfs_key found_key; 1073 struct btrfs_key found_key;
1038 1074
1039 path = btrfs_alloc_path(); 1075 path = btrfs_alloc_path();
1040 BUG_ON(!path); 1076 if (!path)
1077 return -ENOMEM;
1041 1078
1042 key.objectid = objectid; 1079 key.objectid = objectid;
1043 key.offset = (u64)-1; 1080 key.offset = (u64)-1;
@@ -1542,6 +1579,7 @@ error:
1542 1579
1543int btrfs_init_new_device(struct btrfs_root *root, char *device_path) 1580int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1544{ 1581{
1582 struct request_queue *q;
1545 struct btrfs_trans_handle *trans; 1583 struct btrfs_trans_handle *trans;
1546 struct btrfs_device *device; 1584 struct btrfs_device *device;
1547 struct block_device *bdev; 1585 struct block_device *bdev;
@@ -1611,6 +1649,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1611 1649
1612 lock_chunks(root); 1650 lock_chunks(root);
1613 1651
1652 q = bdev_get_queue(bdev);
1653 if (blk_queue_discard(q))
1654 device->can_discard = 1;
1614 device->writeable = 1; 1655 device->writeable = 1;
1615 device->work.func = pending_bios_fn; 1656 device->work.func = pending_bios_fn;
1616 generate_random_uuid(device->uuid); 1657 generate_random_uuid(device->uuid);
@@ -1646,6 +1687,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1646 root->fs_info->fs_devices->num_devices++; 1687 root->fs_info->fs_devices->num_devices++;
1647 root->fs_info->fs_devices->open_devices++; 1688 root->fs_info->fs_devices->open_devices++;
1648 root->fs_info->fs_devices->rw_devices++; 1689 root->fs_info->fs_devices->rw_devices++;
1690 if (device->can_discard)
1691 root->fs_info->fs_devices->num_can_discard++;
1649 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1650 1693
1651 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1694 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
@@ -2061,8 +2104,10 @@ int btrfs_balance(struct btrfs_root *dev_root)
2061 2104
2062 /* step two, relocate all the chunks */ 2105 /* step two, relocate all the chunks */
2063 path = btrfs_alloc_path(); 2106 path = btrfs_alloc_path();
2064 BUG_ON(!path); 2107 if (!path) {
2065 2108 ret = -ENOMEM;
2109 goto error;
2110 }
2066 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; 2111 key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2067 key.offset = (u64)-1; 2112 key.offset = (u64)-1;
2068 key.type = BTRFS_CHUNK_ITEM_KEY; 2113 key.type = BTRFS_CHUNK_ITEM_KEY;
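
This hunk repeats the conversion made in find_next_chunk above: a failed btrfs_alloc_path() now reports -ENOMEM through the function's error path instead of crashing via BUG_ON(!path). A hedged userspace sketch of the pattern, with illustrative names rather than btrfs API:

#include <errno.h>
#include <stdlib.h>

struct path_like { int slots; };

static struct path_like *alloc_path(void)
{
        return calloc(1, sizeof(struct path_like));
}

static int balance_like(void)
{
        struct path_like *path;
        int ret = 0;

        path = alloc_path();
        if (!path) {
                ret = -ENOMEM;          /* report the failure, don't crash */
                goto error;
        }
        /* ... step two: walk the chunk tree and relocate chunks ... */
error:
        free(path);                     /* free(NULL) is a harmless no-op */
        return ret;
}

int main(void)
{
        return balance_like() ? 1 : 0;
}
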
@@ -2410,9 +2455,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2410 total_avail = device->total_bytes - device->bytes_used; 2455 total_avail = device->total_bytes - device->bytes_used;
2411 else 2456 else
2412 total_avail = 0; 2457 total_avail = 0;
2413 /* avail is off by max(alloc_start, 1MB), but that is the same 2458
2414 * for all devices, so it doesn't hurt the sorting later on 2459 /* If there is no space on this device, skip it. */
2415 */ 2460 if (total_avail == 0)
2461 continue;
2416 2462
2417 ret = find_free_dev_extent(trans, device, 2463 ret = find_free_dev_extent(trans, device,
2418 max_stripe_size * dev_stripes, 2464 max_stripe_size * dev_stripes,
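
With the skip in place, a fully used device simply drops out of the stripe candidates instead of relying on the removed comment's claim that the bias was uniform across devices. A small illustrative loop over simplified device records (not the btrfs structures):

#include <stdio.h>

struct dev_like { unsigned long long total_bytes, bytes_used; };

int main(void)
{
        struct dev_like devs[] = { { 100, 40 }, { 50, 50 }, { 80, 10 } };
        size_t i;

        for (i = 0; i < sizeof(devs) / sizeof(devs[0]); i++) {
                unsigned long long avail = 0;

                if (devs[i].total_bytes > devs[i].bytes_used)
                        avail = devs[i].total_bytes - devs[i].bytes_used;
                if (avail == 0)
                        continue;       /* exhausted device: not a candidate */
                printf("dev %zu: %llu bytes available\n", i, avail);
        }
        return 0;
}
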
@@ -2661,7 +2707,8 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
2661 2707
2662 ret = find_next_chunk(fs_info->chunk_root, 2708 ret = find_next_chunk(fs_info->chunk_root,
2663 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); 2709 BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
2664 BUG_ON(ret); 2710 if (ret)
2711 return ret;
2665 2712
2666 alloc_profile = BTRFS_BLOCK_GROUP_METADATA | 2713 alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
2667 (fs_info->metadata_alloc_profile & 2714 (fs_info->metadata_alloc_profile &
@@ -3595,7 +3642,7 @@ int btrfs_read_sys_array(struct btrfs_root *root)
3595 if (!sb) 3642 if (!sb)
3596 return -ENOMEM; 3643 return -ENOMEM;
3597 btrfs_set_buffer_uptodate(sb); 3644 btrfs_set_buffer_uptodate(sb);
3598 btrfs_set_buffer_lockdep_class(sb, 0); 3645 btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
3599 3646
3600 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 3647 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
3601 array_size = btrfs_super_sys_array_size(super_copy); 3648 array_size = btrfs_super_sys_array_size(super_copy);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 7c12d61ae7a..6d866db4e17 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -48,6 +48,7 @@ struct btrfs_device {
48 int writeable; 48 int writeable;
49 int in_fs_metadata; 49 int in_fs_metadata;
50 int missing; 50 int missing;
51 int can_discard;
51 52
52 spinlock_t io_lock; 53 spinlock_t io_lock;
53 54
@@ -104,6 +105,7 @@ struct btrfs_fs_devices {
104 u64 rw_devices; 105 u64 rw_devices;
105 u64 missing_devices; 106 u64 missing_devices;
106 u64 total_rw_bytes; 107 u64 total_rw_bytes;
108 u64 num_can_discard;
107 struct block_device *latest_bdev; 109 struct block_device *latest_bdev;
108 110
109 /* all of the devices in the FS, protected by a mutex 111 /* all of the devices in the FS, protected by a mutex
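
volumes.h gains the per-device can_discard flag plus the filesystem-wide num_can_discard counter that the volumes.c hunks keep in step. One plausible consumer, sketched under the assumption that discards are only issued when every device supports them (hypothetical helper and simplified struct, not the real btrfs_fs_devices):

#include <stdio.h>

struct fs_devices_like {
        unsigned long long num_devices;
        unsigned long long num_can_discard;
};

/* Hypothetical helper, not in the patch: discard only if all devices agree. */
static int fs_can_discard(const struct fs_devices_like *fs)
{
        return fs->num_devices && fs->num_can_discard == fs->num_devices;
}

int main(void)
{
        struct fs_devices_like fs = { .num_devices = 2, .num_can_discard = 1 };
        printf("discard enabled: %d\n", fs_can_discard(&fs));
        return 0;
}
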
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 5366fe452ab..69565e5fc6a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -102,48 +102,71 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
102 if (!path) 102 if (!path)
103 return -ENOMEM; 103 return -ENOMEM;
104 104
105 /* first lets see if we already have this xattr */ 105 if (flags & XATTR_REPLACE) {
106 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name, 106 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
107 strlen(name), -1); 107 name_len, -1);
108 if (IS_ERR(di)) { 108 if (IS_ERR(di)) {
109 ret = PTR_ERR(di); 109 ret = PTR_ERR(di);
110 goto out; 110 goto out;
111 } 111 } else if (!di) {
112 112 ret = -ENODATA;
113 /* ok we already have this xattr, lets remove it */
114 if (di) {
115 /* if we want create only exit */
116 if (flags & XATTR_CREATE) {
117 ret = -EEXIST;
118 goto out; 113 goto out;
119 } 114 }
120
121 ret = btrfs_delete_one_dir_name(trans, root, path, di); 115 ret = btrfs_delete_one_dir_name(trans, root, path, di);
122 BUG_ON(ret); 116 if (ret)
117 goto out;
123 btrfs_release_path(path); 118 btrfs_release_path(path);
124 119
125 /* if we don't have a value then we are removing the xattr */ 120 /*
121 * remove the attribute
122 */
126 if (!value) 123 if (!value)
127 goto out; 124 goto out;
128 } else { 125 }
126
127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size);
130 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE)
132 goto out;
133 /*
134 * We can't use the path we already have since we won't have the
135 * proper locking for a delete, so release the path and
136 * re-lookup to delete the thing.
137 */
129 btrfs_release_path(path); 138 btrfs_release_path(path);
139 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
140 name, name_len, -1);
141 if (IS_ERR(di)) {
142 ret = PTR_ERR(di);
143 goto out;
144 } else if (!di) {
145 /* Shouldn't happen but just in case... */
146 btrfs_release_path(path);
147 goto again;
148 }
130 149
131 if (flags & XATTR_REPLACE) { 150 ret = btrfs_delete_one_dir_name(trans, root, path, di);
132 /* we couldn't find the attr to replace */ 151 if (ret)
133 ret = -ENODATA;
134 goto out; 152 goto out;
153
154 /*
155 * We have a value to set, so go back and try to insert it now.
156 */
157 if (value) {
158 btrfs_release_path(path);
159 goto again;
135 } 160 }
136 } 161 }
137
138 /* ok we have to create a completely new xattr */
139 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
140 name, name_len, value, size);
141 BUG_ON(ret);
142out: 162out:
143 btrfs_free_path(path); 163 btrfs_free_path(path);
144 return ret; 164 return ret;
145} 165}
146 166
167/*
168 * @value: "" sets the attribute to an empty value, NULL removes it
169 */
147int __btrfs_setxattr(struct btrfs_trans_handle *trans, 170int __btrfs_setxattr(struct btrfs_trans_handle *trans,
148 struct inode *inode, const char *name, 171 struct inode *inode, const char *name,
149 const void *value, size_t size, int flags) 172 const void *value, size_t size, int flags)
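
The restructured do_setxattr preserves the setxattr(2) contract visible in the hunk: XATTR_CREATE fails with EEXIST when the attribute already exists, XATTR_REPLACE fails with ENODATA when it does not, and a NULL value removes the attribute. A small userspace demonstration of the flag semantics, assuming a writable test.txt on an xattr-capable filesystem:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
        const char *f = "test.txt", *name = "user.demo";
        FILE *fp = fopen(f, "w");       /* ensure the file exists */

        if (fp)
                fclose(fp);
        removexattr(f, name);           /* start clean; ignore the result */

        /* XATTR_REPLACE on a missing attribute fails with ENODATA. */
        if (setxattr(f, name, "v1", 2, XATTR_REPLACE) < 0)
                printf("replace-missing: %s\n", strerror(errno));

        /* First XATTR_CREATE succeeds; the second fails with EEXIST. */
        if (setxattr(f, name, "v1", 2, XATTR_CREATE) == 0 &&
            setxattr(f, name, "v2", 2, XATTR_CREATE) < 0)
                printf("create-existing: %s\n", strerror(errno));

        return 0;
}
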