aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorMichal Marek <mmarek@suse.cz>2011-03-09 10:15:44 -0500
committerMichal Marek <mmarek@suse.cz>2011-03-09 10:15:44 -0500
commit2d8ad8719591fa803b0d589ed057fa46f49b7155 (patch)
tree4ae051577dad1161c91dafbf4207bb10a9dc91bb /fs/btrfs
parent9b4ce7bce5f30712fd926ab4599a803314a07719 (diff)
parentc56eb8fb6dccb83d9fe62fd4dc00c834de9bc470 (diff)
Merge commit 'v2.6.38-rc1' into kbuild/packaging
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig2
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/acl.c36
-rw-r--r--fs/btrfs/async-thread.c2
-rw-r--r--fs/btrfs/btrfs_inode.h8
-rw-r--r--fs/btrfs/compression.c369
-rw-r--r--fs/btrfs/compression.h72
-rw-r--r--fs/btrfs/ctree.c308
-rw-r--r--fs/btrfs/ctree.h321
-rw-r--r--fs/btrfs/delayed-ref.c102
-rw-r--r--fs/btrfs/delayed-ref.h3
-rw-r--r--fs/btrfs/dir-item.c2
-rw-r--r--fs/btrfs/disk-io.c733
-rw-r--r--fs/btrfs/disk-io.h5
-rw-r--r--fs/btrfs/export.c90
-rw-r--r--fs/btrfs/extent-tree.c2805
-rw-r--r--fs/btrfs/extent_io.c439
-rw-r--r--fs/btrfs/extent_io.h48
-rw-r--r--fs/btrfs/extent_map.c9
-rw-r--r--fs/btrfs/extent_map.h3
-rw-r--r--fs/btrfs/file-item.c29
-rw-r--r--fs/btrfs/file.c406
-rw-r--r--fs/btrfs/free-space-cache.c758
-rw-r--r--fs/btrfs/free-space-cache.h18
-rw-r--r--fs/btrfs/inode-item.c27
-rw-r--r--fs/btrfs/inode.c2552
-rw-r--r--fs/btrfs/ioctl.c1358
-rw-r--r--fs/btrfs/ioctl.h138
-rw-r--r--fs/btrfs/locking.c1
-rw-r--r--fs/btrfs/lzo.c420
-rw-r--r--fs/btrfs/ordered-data.c209
-rw-r--r--fs/btrfs/ordered-data.h29
-rw-r--r--fs/btrfs/orphan.c6
-rw-r--r--fs/btrfs/ref-cache.c1
-rw-r--r--fs/btrfs/ref-cache.h2
-rw-r--r--fs/btrfs/relocation.c2067
-rw-r--r--fs/btrfs/root-tree.c28
-rw-r--r--fs/btrfs/super.c599
-rw-r--r--fs/btrfs/sysfs.c4
-rw-r--r--fs/btrfs/transaction.c550
-rw-r--r--fs/btrfs/transaction.h33
-rw-r--r--fs/btrfs/tree-defrag.c9
-rw-r--r--fs/btrfs/tree-log.c282
-rw-r--r--fs/btrfs/tree-log.h2
-rw-r--r--fs/btrfs/volumes.c764
-rw-r--r--fs/btrfs/volumes.h32
-rw-r--r--fs/btrfs/xattr.c34
-rw-r--r--fs/btrfs/xattr.h6
-rw-r--r--fs/btrfs/zlib.c374
49 files changed, 11815 insertions, 4282 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e570..ecb9fd3be143 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
4 select LIBCRC32C 4 select LIBCRC32C
5 select ZLIB_INFLATE 5 select ZLIB_INFLATE
6 select ZLIB_DEFLATE 6 select ZLIB_DEFLATE
7 select LZO_COMPRESS
8 select LZO_DECOMPRESS
7 help 9 help
8 Btrfs is a new filesystem with extents, writable snapshotting, 10 Btrfs is a new filesystem with extents, writable snapshotting,
9 support for multiple devices and many more features. 11 support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32fd..31610ea73aec 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
6 transaction.o inode.o file.o tree-defrag.o \ 6 transaction.o inode.o file.o tree-defrag.o \
7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o acl.o free-space-cache.o zlib.o \ 9 export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o 10 compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 6df6d6ed74fd..15b5ca2a2606 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -22,6 +22,7 @@
22#include <linux/posix_acl_xattr.h> 22#include <linux/posix_acl_xattr.h>
23#include <linux/posix_acl.h> 23#include <linux/posix_acl.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/slab.h>
25 26
26#include "ctree.h" 27#include "ctree.h"
27#include "btrfs_inode.h" 28#include "btrfs_inode.h"
@@ -59,6 +60,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
59 size = __btrfs_getxattr(inode, name, value, size); 60 size = __btrfs_getxattr(inode, name, value, size);
60 if (size > 0) { 61 if (size > 0) {
61 acl = posix_acl_from_xattr(value, size); 62 acl = posix_acl_from_xattr(value, size);
63 if (IS_ERR(acl)) {
64 kfree(value);
65 return acl;
66 }
62 set_cached_acl(inode, type, acl); 67 set_cached_acl(inode, type, acl);
63 } 68 }
64 kfree(value); 69 kfree(value);
@@ -159,6 +164,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
159 int ret; 164 int ret;
160 struct posix_acl *acl = NULL; 165 struct posix_acl *acl = NULL;
161 166
167 if (!is_owner_or_cap(dentry->d_inode))
168 return -EPERM;
169
170 if (!IS_POSIXACL(dentry->d_inode))
171 return -EOPNOTSUPP;
172
162 if (value) { 173 if (value) {
163 acl = posix_acl_from_xattr(value, size); 174 acl = posix_acl_from_xattr(value, size);
164 if (acl == NULL) { 175 if (acl == NULL) {
@@ -176,18 +187,23 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
176 return ret; 187 return ret;
177} 188}
178 189
179int btrfs_check_acl(struct inode *inode, int mask) 190int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
180{ 191{
181 struct posix_acl *acl;
182 int error = -EAGAIN; 192 int error = -EAGAIN;
183 193
184 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); 194 if (flags & IPERM_FLAG_RCU) {
195 if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
196 error = -ECHILD;
185 197
186 if (IS_ERR(acl)) 198 } else {
187 return PTR_ERR(acl); 199 struct posix_acl *acl;
188 if (acl) { 200 acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
189 error = posix_acl_permission(inode, acl, mask); 201 if (IS_ERR(acl))
190 posix_acl_release(acl); 202 return PTR_ERR(acl);
203 if (acl) {
204 error = posix_acl_permission(inode, acl, mask);
205 posix_acl_release(acl);
206 }
191 } 207 }
192 208
193 return error; 209 return error;
@@ -281,14 +297,14 @@ int btrfs_acl_chmod(struct inode *inode)
281 return ret; 297 return ret;
282} 298}
283 299
284struct xattr_handler btrfs_xattr_acl_default_handler = { 300const struct xattr_handler btrfs_xattr_acl_default_handler = {
285 .prefix = POSIX_ACL_XATTR_DEFAULT, 301 .prefix = POSIX_ACL_XATTR_DEFAULT,
286 .flags = ACL_TYPE_DEFAULT, 302 .flags = ACL_TYPE_DEFAULT,
287 .get = btrfs_xattr_acl_get, 303 .get = btrfs_xattr_acl_get,
288 .set = btrfs_xattr_acl_set, 304 .set = btrfs_xattr_acl_set,
289}; 305};
290 306
291struct xattr_handler btrfs_xattr_acl_access_handler = { 307const struct xattr_handler btrfs_xattr_acl_access_handler = {
292 .prefix = POSIX_ACL_XATTR_ACCESS, 308 .prefix = POSIX_ACL_XATTR_ACCESS,
293 .flags = ACL_TYPE_ACCESS, 309 .flags = ACL_TYPE_ACCESS,
294 .get = btrfs_xattr_acl_get, 310 .get = btrfs_xattr_acl_get,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index c0861e781cdb..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/kthread.h> 19#include <linux/kthread.h>
20#include <linux/slab.h>
20#include <linux/list.h> 21#include <linux/list.h>
21#include <linux/spinlock.h> 22#include <linux/spinlock.h>
22#include <linux/freezer.h> 23#include <linux/freezer.h>
@@ -376,6 +377,7 @@ again:
376 if (!list_empty(&worker->pending) || 377 if (!list_empty(&worker->pending) ||
377 !list_empty(&worker->prio_pending)) { 378 !list_empty(&worker->prio_pending)) {
378 spin_unlock_irq(&worker->lock); 379 spin_unlock_irq(&worker->lock);
380 set_current_state(TASK_RUNNING);
379 goto again; 381 goto again;
380 } 382 }
381 383
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3f1f50d9d916..ccc991c542df 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
137 * of extent items we've reserved metadata for. 137 * of extent items we've reserved metadata for.
138 */ 138 */
139 spinlock_t accounting_lock; 139 spinlock_t accounting_lock;
140 atomic_t outstanding_extents;
140 int reserved_extents; 141 int reserved_extents;
141 int outstanding_extents;
142 142
143 /* 143 /*
144 * ordered_data_close is set by truncate when a file that used 144 * ordered_data_close is set by truncate when a file that used
@@ -151,8 +151,14 @@ struct btrfs_inode {
151 * of these. 151 * of these.
152 */ 152 */
153 unsigned ordered_data_close:1; 153 unsigned ordered_data_close:1;
154 unsigned orphan_meta_reserved:1;
154 unsigned dummy_inode:1; 155 unsigned dummy_inode:1;
155 156
157 /*
158 * always compress this one file
159 */
160 unsigned force_compress:4;
161
156 struct inode vfs_inode; 162 struct inode vfs_inode;
157}; 163};
158 164
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a11a32058b50..f745287fbf2e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -31,7 +31,7 @@
31#include <linux/swap.h> 31#include <linux/swap.h>
32#include <linux/writeback.h> 32#include <linux/writeback.h>
33#include <linux/bit_spinlock.h> 33#include <linux/bit_spinlock.h>
34#include <linux/pagevec.h> 34#include <linux/slab.h>
35#include "compat.h" 35#include "compat.h"
36#include "ctree.h" 36#include "ctree.h"
37#include "disk-io.h" 37#include "disk-io.h"
@@ -62,6 +62,9 @@ struct compressed_bio {
62 /* number of bytes on disk */ 62 /* number of bytes on disk */
63 unsigned long compressed_len; 63 unsigned long compressed_len;
64 64
65 /* the compression algorithm for this bio */
66 int compress_type;
67
65 /* number of compressed pages in the array */ 68 /* number of compressed pages in the array */
66 unsigned long nr_pages; 69 unsigned long nr_pages;
67 70
@@ -91,23 +94,10 @@ static inline int compressed_bio_size(struct btrfs_root *root,
91static struct bio *compressed_bio_alloc(struct block_device *bdev, 94static struct bio *compressed_bio_alloc(struct block_device *bdev,
92 u64 first_byte, gfp_t gfp_flags) 95 u64 first_byte, gfp_t gfp_flags)
93{ 96{
94 struct bio *bio;
95 int nr_vecs; 97 int nr_vecs;
96 98
97 nr_vecs = bio_get_nr_vecs(bdev); 99 nr_vecs = bio_get_nr_vecs(bdev);
98 bio = bio_alloc(gfp_flags, nr_vecs); 100 return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
99
100 if (bio == NULL && (current->flags & PF_MEMALLOC)) {
101 while (!bio && (nr_vecs /= 2))
102 bio = bio_alloc(gfp_flags, nr_vecs);
103 }
104
105 if (bio) {
106 bio->bi_size = 0;
107 bio->bi_bdev = bdev;
108 bio->bi_sector = first_byte >> 9;
109 }
110 return bio;
111} 101}
112 102
113static int check_compressed_csum(struct inode *inode, 103static int check_compressed_csum(struct inode *inode,
@@ -163,7 +153,6 @@ fail:
163 */ 153 */
164static void end_compressed_bio_read(struct bio *bio, int err) 154static void end_compressed_bio_read(struct bio *bio, int err)
165{ 155{
166 struct extent_io_tree *tree;
167 struct compressed_bio *cb = bio->bi_private; 156 struct compressed_bio *cb = bio->bi_private;
168 struct inode *inode; 157 struct inode *inode;
169 struct page *page; 158 struct page *page;
@@ -187,12 +176,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
187 /* ok, we're the last bio for this extent, lets start 176 /* ok, we're the last bio for this extent, lets start
188 * the decompression. 177 * the decompression.
189 */ 178 */
190 tree = &BTRFS_I(inode)->io_tree; 179 ret = btrfs_decompress_biovec(cb->compress_type,
191 ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, 180 cb->compressed_pages,
192 cb->start, 181 cb->start,
193 cb->orig_bio->bi_io_vec, 182 cb->orig_bio->bi_io_vec,
194 cb->orig_bio->bi_vcnt, 183 cb->orig_bio->bi_vcnt,
195 cb->compressed_len); 184 cb->compressed_len);
196csum_failed: 185csum_failed:
197 if (ret) 186 if (ret)
198 cb->errors = 1; 187 cb->errors = 1;
@@ -445,7 +434,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
445 unsigned long nr_pages = 0; 434 unsigned long nr_pages = 0;
446 struct extent_map *em; 435 struct extent_map *em;
447 struct address_space *mapping = inode->i_mapping; 436 struct address_space *mapping = inode->i_mapping;
448 struct pagevec pvec;
449 struct extent_map_tree *em_tree; 437 struct extent_map_tree *em_tree;
450 struct extent_io_tree *tree; 438 struct extent_io_tree *tree;
451 u64 end; 439 u64 end;
@@ -461,7 +449,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
461 449
462 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; 450 end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
463 451
464 pagevec_init(&pvec, 0);
465 while (last_offset < compressed_end) { 452 while (last_offset < compressed_end) {
466 page_index = last_offset >> PAGE_CACHE_SHIFT; 453 page_index = last_offset >> PAGE_CACHE_SHIFT;
467 454
@@ -478,26 +465,17 @@ static noinline int add_ra_bio_pages(struct inode *inode,
478 goto next; 465 goto next;
479 } 466 }
480 467
481 page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS); 468 page = __page_cache_alloc(mapping_gfp_mask(mapping) &
469 ~__GFP_FS);
482 if (!page) 470 if (!page)
483 break; 471 break;
484 472
485 page->index = page_index; 473 if (add_to_page_cache_lru(page, mapping, page_index,
486 /* 474 GFP_NOFS)) {
487 * what we want to do here is call add_to_page_cache_lru,
488 * but that isn't exported, so we reproduce it here
489 */
490 if (add_to_page_cache(page, mapping,
491 page->index, GFP_NOFS)) {
492 page_cache_release(page); 475 page_cache_release(page);
493 goto next; 476 goto next;
494 } 477 }
495 478
496 /* open coding of lru_cache_add, also not exported */
497 page_cache_get(page);
498 if (!pagevec_add(&pvec, page))
499 __pagevec_lru_add_file(&pvec);
500
501 end = last_offset + PAGE_CACHE_SIZE - 1; 479 end = last_offset + PAGE_CACHE_SIZE - 1;
502 /* 480 /*
503 * at this point, we have a locked page in the page cache 481 * at this point, we have a locked page in the page cache
@@ -551,8 +529,6 @@ static noinline int add_ra_bio_pages(struct inode *inode,
551next: 529next:
552 last_offset += PAGE_CACHE_SIZE; 530 last_offset += PAGE_CACHE_SIZE;
553 } 531 }
554 if (pagevec_count(&pvec))
555 __pagevec_lru_add_file(&pvec);
556 return 0; 532 return 0;
557} 533}
558 534
@@ -616,6 +592,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
616 592
617 cb->len = uncompressed_len; 593 cb->len = uncompressed_len;
618 cb->compressed_len = compressed_len; 594 cb->compressed_len = compressed_len;
595 cb->compress_type = extent_compress_type(bio_flags);
619 cb->orig_bio = bio; 596 cb->orig_bio = bio;
620 597
621 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / 598 nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
@@ -705,3 +682,317 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
705 bio_put(comp_bio); 682 bio_put(comp_bio);
706 return 0; 683 return 0;
707} 684}
685
686static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
687static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
688static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
689static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
690static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
691
692struct btrfs_compress_op *btrfs_compress_op[] = {
693 &btrfs_zlib_compress,
694 &btrfs_lzo_compress,
695};
696
697int __init btrfs_init_compress(void)
698{
699 int i;
700
701 for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
702 INIT_LIST_HEAD(&comp_idle_workspace[i]);
703 spin_lock_init(&comp_workspace_lock[i]);
704 atomic_set(&comp_alloc_workspace[i], 0);
705 init_waitqueue_head(&comp_workspace_wait[i]);
706 }
707 return 0;
708}
709
710/*
711 * this finds an available workspace or allocates a new one
712 * ERR_PTR is returned if things go bad.
713 */
714static struct list_head *find_workspace(int type)
715{
716 struct list_head *workspace;
717 int cpus = num_online_cpus();
718 int idx = type - 1;
719
720 struct list_head *idle_workspace = &comp_idle_workspace[idx];
721 spinlock_t *workspace_lock = &comp_workspace_lock[idx];
722 atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
723 wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
724 int *num_workspace = &comp_num_workspace[idx];
725again:
726 spin_lock(workspace_lock);
727 if (!list_empty(idle_workspace)) {
728 workspace = idle_workspace->next;
729 list_del(workspace);
730 (*num_workspace)--;
731 spin_unlock(workspace_lock);
732 return workspace;
733
734 }
735 if (atomic_read(alloc_workspace) > cpus) {
736 DEFINE_WAIT(wait);
737
738 spin_unlock(workspace_lock);
739 prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
740 if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
741 schedule();
742 finish_wait(workspace_wait, &wait);
743 goto again;
744 }
745 atomic_inc(alloc_workspace);
746 spin_unlock(workspace_lock);
747
748 workspace = btrfs_compress_op[idx]->alloc_workspace();
749 if (IS_ERR(workspace)) {
750 atomic_dec(alloc_workspace);
751 wake_up(workspace_wait);
752 }
753 return workspace;
754}
755
756/*
757 * put a workspace struct back on the list or free it if we have enough
758 * idle ones sitting around
759 */
760static void free_workspace(int type, struct list_head *workspace)
761{
762 int idx = type - 1;
763 struct list_head *idle_workspace = &comp_idle_workspace[idx];
764 spinlock_t *workspace_lock = &comp_workspace_lock[idx];
765 atomic_t *alloc_workspace = &comp_alloc_workspace[idx];
766 wait_queue_head_t *workspace_wait = &comp_workspace_wait[idx];
767 int *num_workspace = &comp_num_workspace[idx];
768
769 spin_lock(workspace_lock);
770 if (*num_workspace < num_online_cpus()) {
771 list_add_tail(workspace, idle_workspace);
772 (*num_workspace)++;
773 spin_unlock(workspace_lock);
774 goto wake;
775 }
776 spin_unlock(workspace_lock);
777
778 btrfs_compress_op[idx]->free_workspace(workspace);
779 atomic_dec(alloc_workspace);
780wake:
781 if (waitqueue_active(workspace_wait))
782 wake_up(workspace_wait);
783}
784
785/*
786 * cleanup function for module exit
787 */
788static void free_workspaces(void)
789{
790 struct list_head *workspace;
791 int i;
792
793 for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
794 while (!list_empty(&comp_idle_workspace[i])) {
795 workspace = comp_idle_workspace[i].next;
796 list_del(workspace);
797 btrfs_compress_op[i]->free_workspace(workspace);
798 atomic_dec(&comp_alloc_workspace[i]);
799 }
800 }
801}
802
803/*
804 * given an address space and start/len, compress the bytes.
805 *
806 * pages are allocated to hold the compressed result and stored
807 * in 'pages'
808 *
809 * out_pages is used to return the number of pages allocated. There
810 * may be pages allocated even if we return an error
811 *
812 * total_in is used to return the number of bytes actually read. It
813 * may be smaller then len if we had to exit early because we
814 * ran out of room in the pages array or because we cross the
815 * max_out threshold.
816 *
817 * total_out is used to return the total number of compressed bytes
818 *
819 * max_out tells us the max number of bytes that we're allowed to
820 * stuff into pages
821 */
822int btrfs_compress_pages(int type, struct address_space *mapping,
823 u64 start, unsigned long len,
824 struct page **pages,
825 unsigned long nr_dest_pages,
826 unsigned long *out_pages,
827 unsigned long *total_in,
828 unsigned long *total_out,
829 unsigned long max_out)
830{
831 struct list_head *workspace;
832 int ret;
833
834 workspace = find_workspace(type);
835 if (IS_ERR(workspace))
836 return -1;
837
838 ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
839 start, len, pages,
840 nr_dest_pages, out_pages,
841 total_in, total_out,
842 max_out);
843 free_workspace(type, workspace);
844 return ret;
845}
846
847/*
848 * pages_in is an array of pages with compressed data.
849 *
850 * disk_start is the starting logical offset of this array in the file
851 *
852 * bvec is a bio_vec of pages from the file that we want to decompress into
853 *
854 * vcnt is the count of pages in the biovec
855 *
856 * srclen is the number of bytes in pages_in
857 *
858 * The basic idea is that we have a bio that was created by readpages.
859 * The pages in the bio are for the uncompressed data, and they may not
860 * be contiguous. They all correspond to the range of bytes covered by
861 * the compressed extent.
862 */
863int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
864 struct bio_vec *bvec, int vcnt, size_t srclen)
865{
866 struct list_head *workspace;
867 int ret;
868
869 workspace = find_workspace(type);
870 if (IS_ERR(workspace))
871 return -ENOMEM;
872
873 ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
874 disk_start,
875 bvec, vcnt, srclen);
876 free_workspace(type, workspace);
877 return ret;
878}
879
880/*
881 * a less complex decompression routine. Our compressed data fits in a
882 * single page, and we want to read a single page out of it.
883 * start_byte tells us the offset into the compressed data we're interested in
884 */
885int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
886 unsigned long start_byte, size_t srclen, size_t destlen)
887{
888 struct list_head *workspace;
889 int ret;
890
891 workspace = find_workspace(type);
892 if (IS_ERR(workspace))
893 return -ENOMEM;
894
895 ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
896 dest_page, start_byte,
897 srclen, destlen);
898
899 free_workspace(type, workspace);
900 return ret;
901}
902
903void __exit btrfs_exit_compress(void)
904{
905 free_workspaces();
906}
907
908/*
909 * Copy uncompressed data from working buffer to pages.
910 *
911 * buf_start is the byte offset we're of the start of our workspace buffer.
912 *
913 * total_out is the last byte of the buffer
914 */
915int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
916 unsigned long total_out, u64 disk_start,
917 struct bio_vec *bvec, int vcnt,
918 unsigned long *page_index,
919 unsigned long *pg_offset)
920{
921 unsigned long buf_offset;
922 unsigned long current_buf_start;
923 unsigned long start_byte;
924 unsigned long working_bytes = total_out - buf_start;
925 unsigned long bytes;
926 char *kaddr;
927 struct page *page_out = bvec[*page_index].bv_page;
928
929 /*
930 * start byte is the first byte of the page we're currently
931 * copying into relative to the start of the compressed data.
932 */
933 start_byte = page_offset(page_out) - disk_start;
934
935 /* we haven't yet hit data corresponding to this page */
936 if (total_out <= start_byte)
937 return 1;
938
939 /*
940 * the start of the data we care about is offset into
941 * the middle of our working buffer
942 */
943 if (total_out > start_byte && buf_start < start_byte) {
944 buf_offset = start_byte - buf_start;
945 working_bytes -= buf_offset;
946 } else {
947 buf_offset = 0;
948 }
949 current_buf_start = buf_start;
950
951 /* copy bytes from the working buffer into the pages */
952 while (working_bytes > 0) {
953 bytes = min(PAGE_CACHE_SIZE - *pg_offset,
954 PAGE_CACHE_SIZE - buf_offset);
955 bytes = min(bytes, working_bytes);
956 kaddr = kmap_atomic(page_out, KM_USER0);
957 memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
958 kunmap_atomic(kaddr, KM_USER0);
959 flush_dcache_page(page_out);
960
961 *pg_offset += bytes;
962 buf_offset += bytes;
963 working_bytes -= bytes;
964 current_buf_start += bytes;
965
966 /* check if we need to pick another page */
967 if (*pg_offset == PAGE_CACHE_SIZE) {
968 (*page_index)++;
969 if (*page_index >= vcnt)
970 return 0;
971
972 page_out = bvec[*page_index].bv_page;
973 *pg_offset = 0;
974 start_byte = page_offset(page_out) - disk_start;
975
976 /*
977 * make sure our new page is covered by this
978 * working buffer
979 */
980 if (total_out <= start_byte)
981 return 1;
982
983 /*
984 * the next page in the biovec might not be adjacent
985 * to the last page, but it might still be found
986 * inside this working buffer. bump our offset pointer
987 */
988 if (total_out > start_byte &&
989 current_buf_start < start_byte) {
990 buf_offset = start_byte - buf_start;
991 working_bytes = total_out - start_byte;
992 current_buf_start = buf_start + buf_offset;
993 }
994 }
995 }
996
997 return 1;
998}
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa715..51000174b9d7 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,27 @@
19#ifndef __BTRFS_COMPRESSION_ 19#ifndef __BTRFS_COMPRESSION_
20#define __BTRFS_COMPRESSION_ 20#define __BTRFS_COMPRESSION_
21 21
22int btrfs_zlib_decompress(unsigned char *data_in, 22int btrfs_init_compress(void);
23 struct page *dest_page, 23void btrfs_exit_compress(void);
24 unsigned long start_byte, 24
25 size_t srclen, size_t destlen); 25int btrfs_compress_pages(int type, struct address_space *mapping,
26int btrfs_zlib_compress_pages(struct address_space *mapping, 26 u64 start, unsigned long len,
27 u64 start, unsigned long len, 27 struct page **pages,
28 struct page **pages, 28 unsigned long nr_dest_pages,
29 unsigned long nr_dest_pages, 29 unsigned long *out_pages,
30 unsigned long *out_pages, 30 unsigned long *total_in,
31 unsigned long *total_in, 31 unsigned long *total_out,
32 unsigned long *total_out, 32 unsigned long max_out);
33 unsigned long max_out); 33int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
34int btrfs_zlib_decompress_biovec(struct page **pages_in, 34 struct bio_vec *bvec, int vcnt, size_t srclen);
35 u64 disk_start, 35int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
36 struct bio_vec *bvec, 36 unsigned long start_byte, size_t srclen, size_t destlen);
37 int vcnt, 37int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
38 size_t srclen); 38 unsigned long total_out, u64 disk_start,
39void btrfs_zlib_exit(void); 39 struct bio_vec *bvec, int vcnt,
40 unsigned long *page_index,
41 unsigned long *pg_offset);
42
40int btrfs_submit_compressed_write(struct inode *inode, u64 start, 43int btrfs_submit_compressed_write(struct inode *inode, u64 start,
41 unsigned long len, u64 disk_start, 44 unsigned long len, u64 disk_start,
42 unsigned long compressed_len, 45 unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
44 unsigned long nr_pages); 47 unsigned long nr_pages);
45int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, 48int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
46 int mirror_num, unsigned long bio_flags); 49 int mirror_num, unsigned long bio_flags);
50
51struct btrfs_compress_op {
52 struct list_head *(*alloc_workspace)(void);
53
54 void (*free_workspace)(struct list_head *workspace);
55
56 int (*compress_pages)(struct list_head *workspace,
57 struct address_space *mapping,
58 u64 start, unsigned long len,
59 struct page **pages,
60 unsigned long nr_dest_pages,
61 unsigned long *out_pages,
62 unsigned long *total_in,
63 unsigned long *total_out,
64 unsigned long max_out);
65
66 int (*decompress_biovec)(struct list_head *workspace,
67 struct page **pages_in,
68 u64 disk_start,
69 struct bio_vec *bvec,
70 int vcnt,
71 size_t srclen);
72
73 int (*decompress)(struct list_head *workspace,
74 unsigned char *data_in,
75 struct page *dest_page,
76 unsigned long start_byte,
77 size_t srclen, size_t destlen);
78};
79
80extern struct btrfs_compress_op btrfs_zlib_compress;
81extern struct btrfs_compress_op btrfs_lzo_compress;
82
47#endif 83#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c4bc570a396e..b5baff0dccfe 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "disk-io.h" 22#include "disk-io.h"
22#include "transaction.h" 23#include "transaction.h"
@@ -104,6 +105,8 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
104/* this also releases the path */ 105/* this also releases the path */
105void btrfs_free_path(struct btrfs_path *p) 106void btrfs_free_path(struct btrfs_path *p)
106{ 107{
108 if (!p)
109 return;
107 btrfs_release_path(NULL, p); 110 btrfs_release_path(NULL, p);
108 kmem_cache_free(btrfs_path_cachep, p); 111 kmem_cache_free(btrfs_path_cachep, p);
109} 112}
@@ -199,7 +202,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
199 struct extent_buffer **cow_ret, u64 new_root_objectid) 202 struct extent_buffer **cow_ret, u64 new_root_objectid)
200{ 203{
201 struct extent_buffer *cow; 204 struct extent_buffer *cow;
202 u32 nritems;
203 int ret = 0; 205 int ret = 0;
204 int level; 206 int level;
205 struct btrfs_disk_key disk_key; 207 struct btrfs_disk_key disk_key;
@@ -209,7 +211,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
209 WARN_ON(root->ref_cows && trans->transid != root->last_trans); 211 WARN_ON(root->ref_cows && trans->transid != root->last_trans);
210 212
211 level = btrfs_header_level(buf); 213 level = btrfs_header_level(buf);
212 nritems = btrfs_header_nritems(buf);
213 if (level == 0) 214 if (level == 0)
214 btrfs_item_key(buf, &disk_key, 0); 215 btrfs_item_key(buf, &disk_key, 0);
215 else 216 else
@@ -279,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
279static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, 280static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
280 struct btrfs_root *root, 281 struct btrfs_root *root,
281 struct extent_buffer *buf, 282 struct extent_buffer *buf,
282 struct extent_buffer *cow) 283 struct extent_buffer *cow,
284 int *last_ref)
283{ 285{
284 u64 refs; 286 u64 refs;
285 u64 owner; 287 u64 owner;
@@ -365,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
365 BUG_ON(ret); 367 BUG_ON(ret);
366 } 368 }
367 clean_tree_block(trans, root, buf); 369 clean_tree_block(trans, root, buf);
370 *last_ref = 1;
368 } 371 }
369 return 0; 372 return 0;
370} 373}
@@ -391,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
391 struct btrfs_disk_key disk_key; 394 struct btrfs_disk_key disk_key;
392 struct extent_buffer *cow; 395 struct extent_buffer *cow;
393 int level; 396 int level;
397 int last_ref = 0;
394 int unlock_orig = 0; 398 int unlock_orig = 0;
395 u64 parent_start; 399 u64 parent_start;
396 400
@@ -441,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
441 (unsigned long)btrfs_header_fsid(cow), 445 (unsigned long)btrfs_header_fsid(cow),
442 BTRFS_FSID_SIZE); 446 BTRFS_FSID_SIZE);
443 447
444 update_ref_for_cow(trans, root, buf, cow); 448 update_ref_for_cow(trans, root, buf, cow, &last_ref);
449
450 if (root->ref_cows)
451 btrfs_reloc_cow_block(trans, root, buf, cow);
445 452
446 if (buf == root->node) { 453 if (buf == root->node) {
447 WARN_ON(parent && parent != buf); 454 WARN_ON(parent && parent != buf);
@@ -456,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
456 extent_buffer_get(cow); 463 extent_buffer_get(cow);
457 spin_unlock(&root->node_lock); 464 spin_unlock(&root->node_lock);
458 465
459 btrfs_free_tree_block(trans, root, buf->start, buf->len, 466 btrfs_free_tree_block(trans, root, buf, parent_start,
460 parent_start, root->root_key.objectid, level); 467 last_ref);
461 free_extent_buffer(buf); 468 free_extent_buffer(buf);
462 add_root_to_dirty_list(root); 469 add_root_to_dirty_list(root);
463 } else { 470 } else {
@@ -472,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
472 btrfs_set_node_ptr_generation(parent, parent_slot, 479 btrfs_set_node_ptr_generation(parent, parent_slot,
473 trans->transid); 480 trans->transid);
474 btrfs_mark_buffer_dirty(parent); 481 btrfs_mark_buffer_dirty(parent);
475 btrfs_free_tree_block(trans, root, buf->start, buf->len, 482 btrfs_free_tree_block(trans, root, buf, parent_start,
476 parent_start, root->root_key.objectid, level); 483 last_ref);
477 } 484 }
478 if (unlock_orig) 485 if (unlock_orig)
479 btrfs_tree_unlock(buf); 486 btrfs_tree_unlock(buf);
@@ -948,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
948 return bin_search(eb, key, level, slot); 955 return bin_search(eb, key, level, slot);
949} 956}
950 957
958static void root_add_used(struct btrfs_root *root, u32 size)
959{
960 spin_lock(&root->accounting_lock);
961 btrfs_set_root_used(&root->root_item,
962 btrfs_root_used(&root->root_item) + size);
963 spin_unlock(&root->accounting_lock);
964}
965
966static void root_sub_used(struct btrfs_root *root, u32 size)
967{
968 spin_lock(&root->accounting_lock);
969 btrfs_set_root_used(&root->root_item,
970 btrfs_root_used(&root->root_item) - size);
971 spin_unlock(&root->accounting_lock);
972}
973
951/* given a node and slot number, this reads the blocks it points to. The 974/* given a node and slot number, this reads the blocks it points to. The
952 * extent buffer is returned with a reference taken (but unlocked). 975 * extent buffer is returned with a reference taken (but unlocked).
953 * NULL is returned on error. 976 * NULL is returned on error.
@@ -985,7 +1008,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
985 int wret; 1008 int wret;
986 int pslot; 1009 int pslot;
987 int orig_slot = path->slots[level]; 1010 int orig_slot = path->slots[level];
988 int err_on_enospc = 0;
989 u64 orig_ptr; 1011 u64 orig_ptr;
990 1012
991 if (level == 0) 1013 if (level == 0)
@@ -1018,7 +1040,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1018 btrfs_tree_lock(child); 1040 btrfs_tree_lock(child);
1019 btrfs_set_lock_blocking(child); 1041 btrfs_set_lock_blocking(child);
1020 ret = btrfs_cow_block(trans, root, child, mid, 0, &child); 1042 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1021 BUG_ON(ret); 1043 if (ret) {
1044 btrfs_tree_unlock(child);
1045 free_extent_buffer(child);
1046 goto enospc;
1047 }
1022 1048
1023 spin_lock(&root->node_lock); 1049 spin_lock(&root->node_lock);
1024 root->node = child; 1050 root->node = child;
@@ -1033,18 +1059,18 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1033 btrfs_tree_unlock(mid); 1059 btrfs_tree_unlock(mid);
1034 /* once for the path */ 1060 /* once for the path */
1035 free_extent_buffer(mid); 1061 free_extent_buffer(mid);
1036 ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, 1062
1037 0, root->root_key.objectid, level); 1063 root_sub_used(root, mid->len);
1064 btrfs_free_tree_block(trans, root, mid, 0, 1);
1038 /* once for the root ptr */ 1065 /* once for the root ptr */
1039 free_extent_buffer(mid); 1066 free_extent_buffer(mid);
1040 return ret; 1067 return 0;
1041 } 1068 }
1042 if (btrfs_header_nritems(mid) > 1069 if (btrfs_header_nritems(mid) >
1043 BTRFS_NODEPTRS_PER_BLOCK(root) / 4) 1070 BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1044 return 0; 1071 return 0;
1045 1072
1046 if (btrfs_header_nritems(mid) < 2) 1073 btrfs_header_nritems(mid);
1047 err_on_enospc = 1;
1048 1074
1049 left = read_node_slot(root, parent, pslot - 1); 1075 left = read_node_slot(root, parent, pslot - 1);
1050 if (left) { 1076 if (left) {
@@ -1075,8 +1101,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1075 wret = push_node_left(trans, root, left, mid, 1); 1101 wret = push_node_left(trans, root, left, mid, 1);
1076 if (wret < 0) 1102 if (wret < 0)
1077 ret = wret; 1103 ret = wret;
1078 if (btrfs_header_nritems(mid) < 2) 1104 btrfs_header_nritems(mid);
1079 err_on_enospc = 1;
1080 } 1105 }
1081 1106
1082 /* 1107 /*
@@ -1087,23 +1112,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1087 if (wret < 0 && wret != -ENOSPC) 1112 if (wret < 0 && wret != -ENOSPC)
1088 ret = wret; 1113 ret = wret;
1089 if (btrfs_header_nritems(right) == 0) { 1114 if (btrfs_header_nritems(right) == 0) {
1090 u64 bytenr = right->start;
1091 u32 blocksize = right->len;
1092
1093 clean_tree_block(trans, root, right); 1115 clean_tree_block(trans, root, right);
1094 btrfs_tree_unlock(right); 1116 btrfs_tree_unlock(right);
1095 free_extent_buffer(right);
1096 right = NULL;
1097 wret = del_ptr(trans, root, path, level + 1, pslot + 1117 wret = del_ptr(trans, root, path, level + 1, pslot +
1098 1); 1118 1);
1099 if (wret) 1119 if (wret)
1100 ret = wret; 1120 ret = wret;
1101 wret = btrfs_free_tree_block(trans, root, 1121 root_sub_used(root, right->len);
1102 bytenr, blocksize, 0, 1122 btrfs_free_tree_block(trans, root, right, 0, 1);
1103 root->root_key.objectid, 1123 free_extent_buffer(right);
1104 level); 1124 right = NULL;
1105 if (wret)
1106 ret = wret;
1107 } else { 1125 } else {
1108 struct btrfs_disk_key right_key; 1126 struct btrfs_disk_key right_key;
1109 btrfs_node_key(right, &right_key, 0); 1127 btrfs_node_key(right, &right_key, 0);
@@ -1135,21 +1153,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
1135 BUG_ON(wret == 1); 1153 BUG_ON(wret == 1);
1136 } 1154 }
1137 if (btrfs_header_nritems(mid) == 0) { 1155 if (btrfs_header_nritems(mid) == 0) {
1138 /* we've managed to empty the middle node, drop it */
1139 u64 bytenr = mid->start;
1140 u32 blocksize = mid->len;
1141
1142 clean_tree_block(trans, root, mid); 1156 clean_tree_block(trans, root, mid);
1143 btrfs_tree_unlock(mid); 1157 btrfs_tree_unlock(mid);
1144 free_extent_buffer(mid);
1145 mid = NULL;
1146 wret = del_ptr(trans, root, path, level + 1, pslot); 1158 wret = del_ptr(trans, root, path, level + 1, pslot);
1147 if (wret) 1159 if (wret)
1148 ret = wret; 1160 ret = wret;
1149 wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, 1161 root_sub_used(root, mid->len);
1150 0, root->root_key.objectid, level); 1162 btrfs_free_tree_block(trans, root, mid, 0, 1);
1151 if (wret) 1163 free_extent_buffer(mid);
1152 ret = wret; 1164 mid = NULL;
1153 } else { 1165 } else {
1154 /* update the parent key to reflect our changes */ 1166 /* update the parent key to reflect our changes */
1155 struct btrfs_disk_key mid_key; 1167 struct btrfs_disk_key mid_key;
@@ -1209,14 +1221,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1209 int wret; 1221 int wret;
1210 int pslot; 1222 int pslot;
1211 int orig_slot = path->slots[level]; 1223 int orig_slot = path->slots[level];
1212 u64 orig_ptr;
1213 1224
1214 if (level == 0) 1225 if (level == 0)
1215 return 1; 1226 return 1;
1216 1227
1217 mid = path->nodes[level]; 1228 mid = path->nodes[level];
1218 WARN_ON(btrfs_header_generation(mid) != trans->transid); 1229 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1219 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1220 1230
1221 if (level < BTRFS_MAX_LEVEL - 1) 1231 if (level < BTRFS_MAX_LEVEL - 1)
1222 parent = path->nodes[level + 1]; 1232 parent = path->nodes[level + 1];
@@ -1562,13 +1572,33 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1562 blocksize = btrfs_level_size(root, level - 1); 1572 blocksize = btrfs_level_size(root, level - 1);
1563 1573
1564 tmp = btrfs_find_tree_block(root, blocknr, blocksize); 1574 tmp = btrfs_find_tree_block(root, blocknr, blocksize);
1565 if (tmp && btrfs_buffer_uptodate(tmp, gen)) { 1575 if (tmp) {
1566 /* 1576 if (btrfs_buffer_uptodate(tmp, 0)) {
1567 * we found an up to date block without sleeping, return 1577 if (btrfs_buffer_uptodate(tmp, gen)) {
1568 * right away 1578 /*
1569 */ 1579 * we found an up to date block without
1570 *eb_ret = tmp; 1580 * sleeping, return
1571 return 0; 1581 * right away
1582 */
1583 *eb_ret = tmp;
1584 return 0;
1585 }
1586 /* the pages were up to date, but we failed
1587 * the generation number check. Do a full
1588 * read for the generation number that is correct.
1589 * We must do this without dropping locks so
1590 * we can trust our generation number
1591 */
1592 free_extent_buffer(tmp);
1593 tmp = read_tree_block(root, blocknr, blocksize, gen);
1594 if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
1595 *eb_ret = tmp;
1596 return 0;
1597 }
1598 free_extent_buffer(tmp);
1599 btrfs_release_path(NULL, p);
1600 return -EIO;
1601 }
1572 } 1602 }
1573 1603
1574 /* 1604 /*
@@ -1581,15 +1611,14 @@ read_block_for_search(struct btrfs_trans_handle *trans,
1581 btrfs_unlock_up_safe(p, level + 1); 1611 btrfs_unlock_up_safe(p, level + 1);
1582 btrfs_set_path_blocking(p); 1612 btrfs_set_path_blocking(p);
1583 1613
1584 if (tmp) 1614 free_extent_buffer(tmp);
1585 free_extent_buffer(tmp);
1586 if (p->reada) 1615 if (p->reada)
1587 reada_for_search(root, p, level, slot, key->objectid); 1616 reada_for_search(root, p, level, slot, key->objectid);
1588 1617
1589 btrfs_release_path(NULL, p); 1618 btrfs_release_path(NULL, p);
1590 1619
1591 ret = -EAGAIN; 1620 ret = -EAGAIN;
1592 tmp = read_tree_block(root, blocknr, blocksize, gen); 1621 tmp = read_tree_block(root, blocknr, blocksize, 0);
1593 if (tmp) { 1622 if (tmp) {
1594 /* 1623 /*
1595 * If the read above didn't mark this buffer up to date, 1624 * If the read above didn't mark this buffer up to date,
@@ -1739,7 +1768,6 @@ again:
1739 p->nodes[level + 1], 1768 p->nodes[level + 1],
1740 p->slots[level + 1], &b); 1769 p->slots[level + 1], &b);
1741 if (err) { 1770 if (err) {
1742 free_extent_buffer(b);
1743 ret = err; 1771 ret = err;
1744 goto done; 1772 goto done;
1745 } 1773 }
@@ -2075,6 +2103,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2075 if (IS_ERR(c)) 2103 if (IS_ERR(c))
2076 return PTR_ERR(c); 2104 return PTR_ERR(c);
2077 2105
2106 root_add_used(root, root->nodesize);
2107
2078 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header)); 2108 memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
2079 btrfs_set_header_nritems(c, 1); 2109 btrfs_set_header_nritems(c, 1);
2080 btrfs_set_header_level(c, level); 2110 btrfs_set_header_level(c, level);
@@ -2133,6 +2163,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
2133 int nritems; 2163 int nritems;
2134 2164
2135 BUG_ON(!path->nodes[level]); 2165 BUG_ON(!path->nodes[level]);
2166 btrfs_assert_tree_locked(path->nodes[level]);
2136 lower = path->nodes[level]; 2167 lower = path->nodes[level];
2137 nritems = btrfs_header_nritems(lower); 2168 nritems = btrfs_header_nritems(lower);
2138 BUG_ON(slot > nritems); 2169 BUG_ON(slot > nritems);
@@ -2201,6 +2232,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
2201 if (IS_ERR(split)) 2232 if (IS_ERR(split))
2202 return PTR_ERR(split); 2233 return PTR_ERR(split);
2203 2234
2235 root_add_used(root, root->nodesize);
2236
2204 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header)); 2237 memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
2205 btrfs_set_header_level(split, btrfs_header_level(c)); 2238 btrfs_set_header_level(split, btrfs_header_level(c));
2206 btrfs_set_header_bytenr(split, split->start); 2239 btrfs_set_header_bytenr(split, split->start);
@@ -2285,12 +2318,17 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
2285 return ret; 2318 return ret;
2286} 2319}
2287 2320
2321/*
2322 * min slot controls the lowest index we're willing to push to the
2323 * right. We'll push up to and including min_slot, but no lower
2324 */
2288static noinline int __push_leaf_right(struct btrfs_trans_handle *trans, 2325static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2289 struct btrfs_root *root, 2326 struct btrfs_root *root,
2290 struct btrfs_path *path, 2327 struct btrfs_path *path,
2291 int data_size, int empty, 2328 int data_size, int empty,
2292 struct extent_buffer *right, 2329 struct extent_buffer *right,
2293 int free_space, u32 left_nritems) 2330 int free_space, u32 left_nritems,
2331 u32 min_slot)
2294{ 2332{
2295 struct extent_buffer *left = path->nodes[0]; 2333 struct extent_buffer *left = path->nodes[0];
2296 struct extent_buffer *upper = path->nodes[1]; 2334 struct extent_buffer *upper = path->nodes[1];
@@ -2308,7 +2346,7 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2308 if (empty) 2346 if (empty)
2309 nr = 0; 2347 nr = 0;
2310 else 2348 else
2311 nr = 1; 2349 nr = max_t(u32, 1, min_slot);
2312 2350
2313 if (path->slots[0] >= left_nritems) 2351 if (path->slots[0] >= left_nritems)
2314 push_space += data_size; 2352 push_space += data_size;
@@ -2414,6 +2452,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
2414 2452
2415 if (left_nritems) 2453 if (left_nritems)
2416 btrfs_mark_buffer_dirty(left); 2454 btrfs_mark_buffer_dirty(left);
2455 else
2456 clean_tree_block(trans, root, left);
2457
2417 btrfs_mark_buffer_dirty(right); 2458 btrfs_mark_buffer_dirty(right);
2418 2459
2419 btrfs_item_key(right, &disk_key, 0); 2460 btrfs_item_key(right, &disk_key, 0);
@@ -2447,10 +2488,14 @@ out_unlock:
2447 * 2488 *
2448 * returns 1 if the push failed because the other node didn't have enough 2489 * returns 1 if the push failed because the other node didn't have enough
2449 * room, 0 if everything worked out and < 0 if there were major errors. 2490 * room, 0 if everything worked out and < 0 if there were major errors.
2491 *
2492 * this will push starting from min_slot to the end of the leaf. It won't
2493 * push any slot lower than min_slot
2450 */ 2494 */
2451static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root 2495static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2452 *root, struct btrfs_path *path, int data_size, 2496 *root, struct btrfs_path *path,
2453 int empty) 2497 int min_data_size, int data_size,
2498 int empty, u32 min_slot)
2454{ 2499{
2455 struct extent_buffer *left = path->nodes[0]; 2500 struct extent_buffer *left = path->nodes[0];
2456 struct extent_buffer *right; 2501 struct extent_buffer *right;
@@ -2471,6 +2516,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2471 btrfs_assert_tree_locked(path->nodes[1]); 2516 btrfs_assert_tree_locked(path->nodes[1]);
2472 2517
2473 right = read_node_slot(root, upper, slot + 1); 2518 right = read_node_slot(root, upper, slot + 1);
2519 if (right == NULL)
2520 return 1;
2521
2474 btrfs_tree_lock(right); 2522 btrfs_tree_lock(right);
2475 btrfs_set_lock_blocking(right); 2523 btrfs_set_lock_blocking(right);
2476 2524
@@ -2492,8 +2540,8 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
2492 if (left_nritems == 0) 2540 if (left_nritems == 0)
2493 goto out_unlock; 2541 goto out_unlock;
2494 2542
2495 return __push_leaf_right(trans, root, path, data_size, empty, 2543 return __push_leaf_right(trans, root, path, min_data_size, empty,
2496 right, free_space, left_nritems); 2544 right, free_space, left_nritems, min_slot);
2497out_unlock: 2545out_unlock:
2498 btrfs_tree_unlock(right); 2546 btrfs_tree_unlock(right);
2499 free_extent_buffer(right); 2547 free_extent_buffer(right);
@@ -2503,16 +2551,20 @@ out_unlock:
2503/* 2551/*
2504 * push some data in the path leaf to the left, trying to free up at 2552 * push some data in the path leaf to the left, trying to free up at
2505 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2553 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2554 *
2555 * max_slot can put a limit on how far into the leaf we'll push items. The
2556 * item at 'max_slot' won't be touched. Use (u32)-1 to make us do all the
2557 * items
2506 */ 2558 */
2507static noinline int __push_leaf_left(struct btrfs_trans_handle *trans, 2559static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2508 struct btrfs_root *root, 2560 struct btrfs_root *root,
2509 struct btrfs_path *path, int data_size, 2561 struct btrfs_path *path, int data_size,
2510 int empty, struct extent_buffer *left, 2562 int empty, struct extent_buffer *left,
2511 int free_space, int right_nritems) 2563 int free_space, u32 right_nritems,
2564 u32 max_slot)
2512{ 2565{
2513 struct btrfs_disk_key disk_key; 2566 struct btrfs_disk_key disk_key;
2514 struct extent_buffer *right = path->nodes[0]; 2567 struct extent_buffer *right = path->nodes[0];
2515 int slot;
2516 int i; 2568 int i;
2517 int push_space = 0; 2569 int push_space = 0;
2518 int push_items = 0; 2570 int push_items = 0;
@@ -2524,12 +2576,10 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2524 u32 this_item_size; 2576 u32 this_item_size;
2525 u32 old_left_item_size; 2577 u32 old_left_item_size;
2526 2578
2527 slot = path->slots[1];
2528
2529 if (empty) 2579 if (empty)
2530 nr = right_nritems; 2580 nr = min(right_nritems, max_slot);
2531 else 2581 else
2532 nr = right_nritems - 1; 2582 nr = min(right_nritems - 1, max_slot);
2533 2583
2534 for (i = 0; i < nr; i++) { 2584 for (i = 0; i < nr; i++) {
2535 item = btrfs_item_nr(right, i); 2585 item = btrfs_item_nr(right, i);
@@ -2659,6 +2709,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2659 btrfs_mark_buffer_dirty(left); 2709 btrfs_mark_buffer_dirty(left);
2660 if (right_nritems) 2710 if (right_nritems)
2661 btrfs_mark_buffer_dirty(right); 2711 btrfs_mark_buffer_dirty(right);
2712 else
2713 clean_tree_block(trans, root, right);
2662 2714
2663 btrfs_item_key(right, &disk_key, 0); 2715 btrfs_item_key(right, &disk_key, 0);
2664 wret = fixup_low_keys(trans, root, path, &disk_key, 1); 2716 wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2668,8 +2720,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
2668 /* then fixup the leaf pointer in the path */ 2720 /* then fixup the leaf pointer in the path */
2669 if (path->slots[0] < push_items) { 2721 if (path->slots[0] < push_items) {
2670 path->slots[0] += old_left_nritems; 2722 path->slots[0] += old_left_nritems;
2671 if (btrfs_header_nritems(path->nodes[0]) == 0)
2672 clean_tree_block(trans, root, path->nodes[0]);
2673 btrfs_tree_unlock(path->nodes[0]); 2723 btrfs_tree_unlock(path->nodes[0]);
2674 free_extent_buffer(path->nodes[0]); 2724 free_extent_buffer(path->nodes[0]);
2675 path->nodes[0] = left; 2725 path->nodes[0] = left;
@@ -2690,10 +2740,14 @@ out:
2690/* 2740/*
2691 * push some data in the path leaf to the left, trying to free up at 2741 * push some data in the path leaf to the left, trying to free up at
2692 * least data_size bytes. returns zero if the push worked, nonzero otherwise 2742 * least data_size bytes. returns zero if the push worked, nonzero otherwise
2743 *
2744 * max_slot can put a limit on how far into the leaf we'll push items. The
2745 * item at 'max_slot' won't be touched. Use (u32)-1 to make us push all the
2746 * items
2693 */ 2747 */
2694static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root 2748static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2695 *root, struct btrfs_path *path, int data_size, 2749 *root, struct btrfs_path *path, int min_data_size,
2696 int empty) 2750 int data_size, int empty, u32 max_slot)
2697{ 2751{
2698 struct extent_buffer *right = path->nodes[0]; 2752 struct extent_buffer *right = path->nodes[0];
2699 struct extent_buffer *left; 2753 struct extent_buffer *left;
@@ -2715,6 +2769,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2715 btrfs_assert_tree_locked(path->nodes[1]); 2769 btrfs_assert_tree_locked(path->nodes[1]);
2716 2770
2717 left = read_node_slot(root, path->nodes[1], slot - 1); 2771 left = read_node_slot(root, path->nodes[1], slot - 1);
2772 if (left == NULL)
2773 return 1;
2774
2718 btrfs_tree_lock(left); 2775 btrfs_tree_lock(left);
2719 btrfs_set_lock_blocking(left); 2776 btrfs_set_lock_blocking(left);
2720 2777
@@ -2739,8 +2796,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
2739 goto out; 2796 goto out;
2740 } 2797 }
2741 2798
2742 return __push_leaf_left(trans, root, path, data_size, 2799 return __push_leaf_left(trans, root, path, min_data_size,
2743 empty, left, free_space, right_nritems); 2800 empty, left, free_space, right_nritems,
2801 max_slot);
2744out: 2802out:
2745 btrfs_tree_unlock(left); 2803 btrfs_tree_unlock(left);
2746 free_extent_buffer(left); 2804 free_extent_buffer(left);
@@ -2833,6 +2891,64 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
2833} 2891}
2834 2892
2835/* 2893/*
2894 * double splits happen when we need to insert a big item in the middle
2895 * of a leaf. A double split can leave us with 3 mostly empty leaves:
2896 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
2897 * A B C
2898 *
2899 * We avoid this by trying to push the items on either side of our target
2900 * into the adjacent leaves. If all goes well we can avoid the double split
2901 * completely.
2902 */
2903static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
2904 struct btrfs_root *root,
2905 struct btrfs_path *path,
2906 int data_size)
2907{
2908 int ret;
2909 int progress = 0;
2910 int slot;
2911 u32 nritems;
2912
2913 slot = path->slots[0];
2914
2915 /*
2916 * try to push all the items after our slot into the
2917 * right leaf
2918 */
2919 ret = push_leaf_right(trans, root, path, 1, data_size, 0, slot);
2920 if (ret < 0)
2921 return ret;
2922
2923 if (ret == 0)
2924 progress++;
2925
2926 nritems = btrfs_header_nritems(path->nodes[0]);
2927 /*
2928 * our goal is to get our slot at the start or end of a leaf. If
2929 * we've done so we're done
2930 */
2931 if (path->slots[0] == 0 || path->slots[0] == nritems)
2932 return 0;
2933
2934 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
2935 return 0;
2936
2937 /* try to push all the items before our slot into the next leaf */
2938 slot = path->slots[0];
2939 ret = push_leaf_left(trans, root, path, 1, data_size, 0, slot);
2940 if (ret < 0)
2941 return ret;
2942
2943 if (ret == 0)
2944 progress++;
2945
2946 if (progress)
2947 return 0;
2948 return 1;
2949}
2950
2951/*
2836 * split the path's leaf in two, making sure there is at least data_size 2952 * split the path's leaf in two, making sure there is at least data_size
2837 * available for the resulting leaf level of the path. 2953 * available for the resulting leaf level of the path.
2838 * 2954 *
@@ -2854,6 +2970,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2854 int wret; 2970 int wret;
2855 int split; 2971 int split;
2856 int num_doubles = 0; 2972 int num_doubles = 0;
2973 int tried_avoid_double = 0;
2857 2974
2858 l = path->nodes[0]; 2975 l = path->nodes[0];
2859 slot = path->slots[0]; 2976 slot = path->slots[0];
@@ -2862,12 +2979,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
2862 return -EOVERFLOW; 2979 return -EOVERFLOW;
2863 2980
2864 /* first try to make some room by pushing left and right */ 2981 /* first try to make some room by pushing left and right */
2865 if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { 2982 if (data_size) {
2866 wret = push_leaf_right(trans, root, path, data_size, 0); 2983 wret = push_leaf_right(trans, root, path, data_size,
2984 data_size, 0, 0);
2867 if (wret < 0) 2985 if (wret < 0)
2868 return wret; 2986 return wret;
2869 if (wret) { 2987 if (wret) {
2870 wret = push_leaf_left(trans, root, path, data_size, 0); 2988 wret = push_leaf_left(trans, root, path, data_size,
2989 data_size, 0, (u32)-1);
2871 if (wret < 0) 2990 if (wret < 0)
2872 return wret; 2991 return wret;
2873 } 2992 }
@@ -2901,6 +3020,8 @@ again:
2901 if (mid != nritems && 3020 if (mid != nritems &&
2902 leaf_space_used(l, mid, nritems - mid) + 3021 leaf_space_used(l, mid, nritems - mid) +
2903 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3022 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3023 if (data_size && !tried_avoid_double)
3024 goto push_for_double;
2904 split = 2; 3025 split = 2;
2905 } 3026 }
2906 } 3027 }
@@ -2917,6 +3038,8 @@ again:
2917 if (mid != nritems && 3038 if (mid != nritems &&
2918 leaf_space_used(l, mid, nritems - mid) + 3039 leaf_space_used(l, mid, nritems - mid) +
2919 data_size > BTRFS_LEAF_DATA_SIZE(root)) { 3040 data_size > BTRFS_LEAF_DATA_SIZE(root)) {
3041 if (data_size && !tried_avoid_double)
3042 goto push_for_double;
2920 split = 2 ; 3043 split = 2 ;
2921 } 3044 }
2922 } 3045 }
@@ -2931,10 +3054,10 @@ again:
2931 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, 3054 right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
2932 root->root_key.objectid, 3055 root->root_key.objectid,
2933 &disk_key, 0, l->start, 0); 3056 &disk_key, 0, l->start, 0);
2934 if (IS_ERR(right)) { 3057 if (IS_ERR(right))
2935 BUG_ON(1);
2936 return PTR_ERR(right); 3058 return PTR_ERR(right);
2937 } 3059
3060 root_add_used(root, root->leafsize);
2938 3061
2939 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); 3062 memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
2940 btrfs_set_header_bytenr(right, right->start); 3063 btrfs_set_header_bytenr(right, right->start);
@@ -2997,6 +3120,13 @@ again:
2997 } 3120 }
2998 3121
2999 return ret; 3122 return ret;
3123
3124push_for_double:
3125 push_for_double_split(trans, root, path, data_size);
3126 tried_avoid_double = 1;
3127 if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
3128 return 0;
3129 goto again;
3000} 3130}
3001 3131
3002static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, 3132static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
@@ -3040,6 +3170,10 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3040 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) 3170 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0]))
3041 goto err; 3171 goto err;
3042 3172
3173 /* the leaf has changed, it now has room. return now */
3174 if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
3175 goto err;
3176
3043 if (key.type == BTRFS_EXTENT_DATA_KEY) { 3177 if (key.type == BTRFS_EXTENT_DATA_KEY) {
3044 fi = btrfs_item_ptr(leaf, path->slots[0], 3178 fi = btrfs_item_ptr(leaf, path->slots[0],
3045 struct btrfs_file_extent_item); 3179 struct btrfs_file_extent_item);
@@ -3049,7 +3183,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3049 3183
3050 btrfs_set_path_blocking(path); 3184 btrfs_set_path_blocking(path);
3051 ret = split_leaf(trans, root, &key, path, ins_len, 1); 3185 ret = split_leaf(trans, root, &key, path, ins_len, 1);
3052 BUG_ON(ret); 3186 if (ret)
3187 goto err;
3053 3188
3054 path->keep_locks = 0; 3189 path->keep_locks = 0;
3055 btrfs_unlock_up_safe(path, 1); 3190 btrfs_unlock_up_safe(path, 1);
@@ -3212,7 +3347,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
3212{ 3347{
3213 int ret = 0; 3348 int ret = 0;
3214 int slot; 3349 int slot;
3215 int slot_orig;
3216 struct extent_buffer *leaf; 3350 struct extent_buffer *leaf;
3217 struct btrfs_item *item; 3351 struct btrfs_item *item;
3218 u32 nritems; 3352 u32 nritems;
@@ -3222,7 +3356,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
3222 unsigned int size_diff; 3356 unsigned int size_diff;
3223 int i; 3357 int i;
3224 3358
3225 slot_orig = path->slots[0];
3226 leaf = path->nodes[0]; 3359 leaf = path->nodes[0];
3227 slot = path->slots[0]; 3360 slot = path->slots[0];
3228 3361
@@ -3327,7 +3460,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
3327{ 3460{
3328 int ret = 0; 3461 int ret = 0;
3329 int slot; 3462 int slot;
3330 int slot_orig;
3331 struct extent_buffer *leaf; 3463 struct extent_buffer *leaf;
3332 struct btrfs_item *item; 3464 struct btrfs_item *item;
3333 u32 nritems; 3465 u32 nritems;
@@ -3336,7 +3468,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
3336 unsigned int old_size; 3468 unsigned int old_size;
3337 int i; 3469 int i;
3338 3470
3339 slot_orig = path->slots[0];
3340 leaf = path->nodes[0]; 3471 leaf = path->nodes[0];
3341 3472
3342 nritems = btrfs_header_nritems(leaf); 3473 nritems = btrfs_header_nritems(leaf);
@@ -3669,7 +3800,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3669 struct btrfs_key *cpu_key, u32 *data_size, 3800 struct btrfs_key *cpu_key, u32 *data_size,
3670 int nr) 3801 int nr)
3671{ 3802{
3672 struct extent_buffer *leaf;
3673 int ret = 0; 3803 int ret = 0;
3674 int slot; 3804 int slot;
3675 int i; 3805 int i;
@@ -3686,7 +3816,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3686 if (ret < 0) 3816 if (ret < 0)
3687 goto out; 3817 goto out;
3688 3818
3689 leaf = path->nodes[0];
3690 slot = path->slots[0]; 3819 slot = path->slots[0];
3691 BUG_ON(slot < 0); 3820 BUG_ON(slot < 0);
3692 3821
@@ -3791,9 +3920,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
3791 */ 3920 */
3792 btrfs_unlock_up_safe(path, 0); 3921 btrfs_unlock_up_safe(path, 0);
3793 3922
3794 ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, 3923 root_sub_used(root, leaf->len);
3795 0, root->root_key.objectid, 0); 3924
3796 return ret; 3925 btrfs_free_tree_block(trans, root, leaf, 0, 1);
3926 return 0;
3797} 3927}
3798/* 3928/*
3799 * delete the item at the leaf level in path. If that empties 3929 * delete the item at the leaf level in path. If that empties
@@ -3860,6 +3990,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3860 if (leaf == root->node) { 3990 if (leaf == root->node) {
3861 btrfs_set_header_level(leaf, 0); 3991 btrfs_set_header_level(leaf, 0);
3862 } else { 3992 } else {
3993 btrfs_set_path_blocking(path);
3994 clean_tree_block(trans, root, leaf);
3863 ret = btrfs_del_leaf(trans, root, path, leaf); 3995 ret = btrfs_del_leaf(trans, root, path, leaf);
3864 BUG_ON(ret); 3996 BUG_ON(ret);
3865 } 3997 }
@@ -3885,13 +4017,15 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3885 extent_buffer_get(leaf); 4017 extent_buffer_get(leaf);
3886 4018
3887 btrfs_set_path_blocking(path); 4019 btrfs_set_path_blocking(path);
3888 wret = push_leaf_left(trans, root, path, 1, 1); 4020 wret = push_leaf_left(trans, root, path, 1, 1,
4021 1, (u32)-1);
3889 if (wret < 0 && wret != -ENOSPC) 4022 if (wret < 0 && wret != -ENOSPC)
3890 ret = wret; 4023 ret = wret;
3891 4024
3892 if (path->nodes[0] == leaf && 4025 if (path->nodes[0] == leaf &&
3893 btrfs_header_nritems(leaf)) { 4026 btrfs_header_nritems(leaf)) {
3894 wret = push_leaf_right(trans, root, path, 1, 1); 4027 wret = push_leaf_right(trans, root, path, 1,
4028 1, 1, 0);
3895 if (wret < 0 && wret != -ENOSPC) 4029 if (wret < 0 && wret != -ENOSPC)
3896 ret = wret; 4030 ret = wret;
3897 } 4031 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2aa8ec6a0981..2c98b3af6052 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -26,6 +26,8 @@
26#include <linux/completion.h> 26#include <linux/completion.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h>
30#include <linux/kobject.h>
29#include <asm/kmap_types.h> 31#include <asm/kmap_types.h>
30#include "extent_io.h" 32#include "extent_io.h"
31#include "extent_map.h" 33#include "extent_map.h"
@@ -33,6 +35,7 @@
33 35
34struct btrfs_trans_handle; 36struct btrfs_trans_handle;
35struct btrfs_transaction; 37struct btrfs_transaction;
38struct btrfs_pending_snapshot;
36extern struct kmem_cache *btrfs_trans_handle_cachep; 39extern struct kmem_cache *btrfs_trans_handle_cachep;
37extern struct kmem_cache *btrfs_transaction_cachep; 40extern struct kmem_cache *btrfs_transaction_cachep;
38extern struct kmem_cache *btrfs_bit_radix_cachep; 41extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -97,6 +100,9 @@ struct btrfs_ordered_sum;
97 */ 100 */
98#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL 101#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
99 102
103/* For storing free space cache */
104#define BTRFS_FREE_SPACE_OBJECTID -11ULL
105
100/* dummy objectid represents multiple objectids */ 106/* dummy objectid represents multiple objectids */
101#define BTRFS_MULTIPLE_OBJECTIDS -255ULL 107#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
102 108
@@ -263,6 +269,22 @@ struct btrfs_chunk {
263 /* additional stripes go here */ 269 /* additional stripes go here */
264} __attribute__ ((__packed__)); 270} __attribute__ ((__packed__));
265 271
272#define BTRFS_FREE_SPACE_EXTENT 1
273#define BTRFS_FREE_SPACE_BITMAP 2
274
275struct btrfs_free_space_entry {
276 __le64 offset;
277 __le64 bytes;
278 u8 type;
279} __attribute__ ((__packed__));
280
281struct btrfs_free_space_header {
282 struct btrfs_disk_key location;
283 __le64 generation;
284 __le64 num_entries;
285 __le64 num_bitmaps;
286} __attribute__ ((__packed__));
287
266static inline unsigned long btrfs_chunk_item_size(int num_stripes) 288static inline unsigned long btrfs_chunk_item_size(int num_stripes)
267{ 289{
268 BUG_ON(num_stripes == 0); 290 BUG_ON(num_stripes == 0);
@@ -273,6 +295,14 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
273#define BTRFS_FSID_SIZE 16 295#define BTRFS_FSID_SIZE 16
274#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) 296#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
275#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) 297#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
298
299/*
300 * File system states
301 */
302
303/* Errors detected */
304#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
305
276#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) 306#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
277#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) 307#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
278 308
@@ -363,8 +393,10 @@ struct btrfs_super_block {
363 393
364 char label[BTRFS_LABEL_SIZE]; 394 char label[BTRFS_LABEL_SIZE];
365 395
396 __le64 cache_generation;
397
366 /* future expansion */ 398 /* future expansion */
367 __le64 reserved[32]; 399 __le64 reserved[31];
368 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; 400 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
369} __attribute__ ((__packed__)); 401} __attribute__ ((__packed__));
370 402
@@ -373,11 +405,17 @@ struct btrfs_super_block {
373 * ones specified below then we will fail to mount 405 * ones specified below then we will fail to mount
374 */ 406 */
375#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) 407#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
408#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
409#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
410#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
376 411
377#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 412#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
378#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 413#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
379#define BTRFS_FEATURE_INCOMPAT_SUPP \ 414#define BTRFS_FEATURE_INCOMPAT_SUPP \
380 BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF 415 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
416 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
417 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
418 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
381 419
382/* 420/*
383 * A leaf is full of items. offset and size tell us where to find 421 * A leaf is full of items. offset and size tell us where to find
@@ -524,9 +562,11 @@ struct btrfs_timespec {
524} __attribute__ ((__packed__)); 562} __attribute__ ((__packed__));
525 563
526enum btrfs_compression_type { 564enum btrfs_compression_type {
527 BTRFS_COMPRESS_NONE = 0, 565 BTRFS_COMPRESS_NONE = 0,
528 BTRFS_COMPRESS_ZLIB = 1, 566 BTRFS_COMPRESS_ZLIB = 1,
529 BTRFS_COMPRESS_LAST = 2, 567 BTRFS_COMPRESS_LZO = 2,
568 BTRFS_COMPRESS_TYPES = 2,
569 BTRFS_COMPRESS_LAST = 3,
530}; 570};
531 571
532struct btrfs_inode_item { 572struct btrfs_inode_item {
@@ -570,6 +610,8 @@ struct btrfs_dir_item {
570 u8 type; 610 u8 type;
571} __attribute__ ((__packed__)); 611} __attribute__ ((__packed__));
572 612
613#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
614
573struct btrfs_root_item { 615struct btrfs_root_item {
574 struct btrfs_inode_item inode; 616 struct btrfs_inode_item inode;
575 __le64 generation; 617 __le64 generation;
@@ -660,6 +702,7 @@ struct btrfs_csum_item {
660#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) 702#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4)
661#define BTRFS_BLOCK_GROUP_DUP (1 << 5) 703#define BTRFS_BLOCK_GROUP_DUP (1 << 5)
662#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) 704#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6)
705#define BTRFS_NR_RAID_TYPES 5
663 706
664struct btrfs_block_group_item { 707struct btrfs_block_group_item {
665 __le64 used; 708 __le64 used;
@@ -670,43 +713,50 @@ struct btrfs_block_group_item {
670struct btrfs_space_info { 713struct btrfs_space_info {
671 u64 flags; 714 u64 flags;
672 715
673 u64 total_bytes; /* total bytes in the space */ 716 u64 total_bytes; /* total bytes in the space,
674 u64 bytes_used; /* total bytes used on disk */ 717 this doesn't take mirrors into account */
718 u64 bytes_used; /* total bytes used,
719 this does't take mirrors into account */
675 u64 bytes_pinned; /* total bytes pinned, will be freed when the 720 u64 bytes_pinned; /* total bytes pinned, will be freed when the
676 transaction finishes */ 721 transaction finishes */
677 u64 bytes_reserved; /* total bytes the allocator has reserved for 722 u64 bytes_reserved; /* total bytes the allocator has reserved for
678 current allocations */ 723 current allocations */
679 u64 bytes_readonly; /* total bytes that are read only */ 724 u64 bytes_readonly; /* total bytes that are read only */
680 u64 bytes_super; /* total bytes reserved for the super blocks */ 725
681 u64 bytes_root; /* the number of bytes needed to commit a
682 transaction */
683 u64 bytes_may_use; /* number of bytes that may be used for 726 u64 bytes_may_use; /* number of bytes that may be used for
684 delalloc/allocations */ 727 delalloc/allocations */
685 u64 bytes_delalloc; /* number of bytes currently reserved for 728 u64 disk_used; /* total bytes used on disk */
686 delayed allocation */ 729 u64 disk_total; /* total bytes on disk, takes mirrors into
730 account */
687 731
688 int full; /* indicates that we cannot allocate any more 732 int full; /* indicates that we cannot allocate any more
689 chunks for this space */ 733 chunks for this space */
690 int force_alloc; /* set if we need to force a chunk alloc for 734 int force_alloc; /* set if we need to force a chunk alloc for
691 this space */ 735 this space */
692 int force_delalloc; /* make people start doing filemap_flush until
693 we're under a threshold */
694 736
695 struct list_head list; 737 struct list_head list;
696 738
697 /* for controlling how we free up space for allocations */
698 wait_queue_head_t allocate_wait;
699 wait_queue_head_t flush_wait;
700 int allocating_chunk;
701 int flushing;
702
703 /* for block groups in our same type */ 739 /* for block groups in our same type */
704 struct list_head block_groups; 740 struct list_head block_groups[BTRFS_NR_RAID_TYPES];
705 spinlock_t lock; 741 spinlock_t lock;
706 struct rw_semaphore groups_sem; 742 struct rw_semaphore groups_sem;
707 atomic_t caching_threads; 743 atomic_t caching_threads;
708}; 744};
709 745
746struct btrfs_block_rsv {
747 u64 size;
748 u64 reserved;
749 u64 freed[2];
750 struct btrfs_space_info *space_info;
751 struct list_head list;
752 spinlock_t lock;
753 atomic_t usage;
754 unsigned int priority:8;
755 unsigned int durable:1;
756 unsigned int refill_used:1;
757 unsigned int full:1;
758};
759
710/* 760/*
711 * free clusters are used to claim free space in relatively large chunks, 761 * free clusters are used to claim free space in relatively large chunks,
712 * allowing us to do less seeky writes. They are used for all metadata 762 * allowing us to do less seeky writes. They are used for all metadata
@@ -741,6 +791,14 @@ enum btrfs_caching_type {
741 BTRFS_CACHE_FINISHED = 2, 791 BTRFS_CACHE_FINISHED = 2,
742}; 792};
743 793
794enum btrfs_disk_cache_state {
795 BTRFS_DC_WRITTEN = 0,
796 BTRFS_DC_ERROR = 1,
797 BTRFS_DC_CLEAR = 2,
798 BTRFS_DC_SETUP = 3,
799 BTRFS_DC_NEED_WRITE = 4,
800};
801
744struct btrfs_caching_control { 802struct btrfs_caching_control {
745 struct list_head list; 803 struct list_head list;
746 struct mutex mutex; 804 struct mutex mutex;
@@ -754,17 +812,22 @@ struct btrfs_block_group_cache {
754 struct btrfs_key key; 812 struct btrfs_key key;
755 struct btrfs_block_group_item item; 813 struct btrfs_block_group_item item;
756 struct btrfs_fs_info *fs_info; 814 struct btrfs_fs_info *fs_info;
815 struct inode *inode;
757 spinlock_t lock; 816 spinlock_t lock;
758 u64 pinned; 817 u64 pinned;
759 u64 reserved; 818 u64 reserved;
819 u64 reserved_pinned;
760 u64 bytes_super; 820 u64 bytes_super;
761 u64 flags; 821 u64 flags;
762 u64 sectorsize; 822 u64 sectorsize;
763 int extents_thresh; 823 int extents_thresh;
764 int free_extents; 824 int free_extents;
765 int total_bitmaps; 825 int total_bitmaps;
766 int ro; 826 unsigned int ro:1;
767 int dirty; 827 unsigned int dirty:1;
828 unsigned int iref:1;
829
830 int disk_cache_state;
768 831
769 /* cache tracking stuff */ 832 /* cache tracking stuff */
770 int cached; 833 int cached;
@@ -822,6 +885,22 @@ struct btrfs_fs_info {
822 /* logical->physical extent mapping */ 885 /* logical->physical extent mapping */
823 struct btrfs_mapping_tree mapping_tree; 886 struct btrfs_mapping_tree mapping_tree;
824 887
888 /* block reservation for extent, checksum and root tree */
889 struct btrfs_block_rsv global_block_rsv;
890 /* block reservation for delay allocation */
891 struct btrfs_block_rsv delalloc_block_rsv;
892 /* block reservation for metadata operations */
893 struct btrfs_block_rsv trans_block_rsv;
894 /* block reservation for chunk tree */
895 struct btrfs_block_rsv chunk_block_rsv;
896
897 struct btrfs_block_rsv empty_block_rsv;
898
899 /* list of block reservations that cross multiple transactions */
900 struct list_head durable_block_rsv_list;
901
902 struct mutex durable_block_rsv_mutex;
903
825 u64 generation; 904 u64 generation;
826 u64 last_trans_committed; 905 u64 last_trans_committed;
827 906
@@ -831,13 +910,14 @@ struct btrfs_fs_info {
831 */ 910 */
832 u64 last_trans_log_full_commit; 911 u64 last_trans_log_full_commit;
833 u64 open_ioctl_trans; 912 u64 open_ioctl_trans;
834 unsigned long mount_opt; 913 unsigned long mount_opt:20;
835 u64 max_extent; 914 unsigned long compress_type:4;
836 u64 max_inline; 915 u64 max_inline;
837 u64 alloc_start; 916 u64 alloc_start;
838 struct btrfs_transaction *running_transaction; 917 struct btrfs_transaction *running_transaction;
839 wait_queue_head_t transaction_throttle; 918 wait_queue_head_t transaction_throttle;
840 wait_queue_head_t transaction_wait; 919 wait_queue_head_t transaction_wait;
920 wait_queue_head_t transaction_blocked_wait;
841 wait_queue_head_t async_submit_wait; 921 wait_queue_head_t async_submit_wait;
842 922
843 struct btrfs_super_block super_copy; 923 struct btrfs_super_block super_copy;
@@ -924,8 +1004,8 @@ struct btrfs_fs_info {
924 struct btrfs_workers endio_meta_workers; 1004 struct btrfs_workers endio_meta_workers;
925 struct btrfs_workers endio_meta_write_workers; 1005 struct btrfs_workers endio_meta_write_workers;
926 struct btrfs_workers endio_write_workers; 1006 struct btrfs_workers endio_write_workers;
1007 struct btrfs_workers endio_freespace_worker;
927 struct btrfs_workers submit_workers; 1008 struct btrfs_workers submit_workers;
928 struct btrfs_workers enospc_workers;
929 /* 1009 /*
930 * fixup workers take dirty pages that didn't properly go through 1010 * fixup workers take dirty pages that didn't properly go through
931 * the cow mechanism and make them safe to write. It happens 1011 * the cow mechanism and make them safe to write. It happens
@@ -941,6 +1021,7 @@ struct btrfs_fs_info {
941 int do_barriers; 1021 int do_barriers;
942 int closing; 1022 int closing;
943 int log_root_recovering; 1023 int log_root_recovering;
1024 int enospc_unlink;
944 1025
945 u64 total_pinned; 1026 u64 total_pinned;
946 1027
@@ -985,6 +1066,9 @@ struct btrfs_fs_info {
985 unsigned metadata_ratio; 1066 unsigned metadata_ratio;
986 1067
987 void *bdev_holder; 1068 void *bdev_holder;
1069
1070 /* filesystem state */
1071 u64 fs_state;
988}; 1072};
989 1073
990/* 1074/*
@@ -1010,6 +1094,9 @@ struct btrfs_root {
1010 struct completion kobj_unregister; 1094 struct completion kobj_unregister;
1011 struct mutex objectid_mutex; 1095 struct mutex objectid_mutex;
1012 1096
1097 spinlock_t accounting_lock;
1098 struct btrfs_block_rsv *block_rsv;
1099
1013 struct mutex log_mutex; 1100 struct mutex log_mutex;
1014 wait_queue_head_t log_writer_wait; 1101 wait_queue_head_t log_writer_wait;
1015 wait_queue_head_t log_commit_wait[2]; 1102 wait_queue_head_t log_commit_wait[2];
@@ -1041,7 +1128,6 @@ struct btrfs_root {
1041 int ref_cows; 1128 int ref_cows;
1042 int track_dirty; 1129 int track_dirty;
1043 int in_radix; 1130 int in_radix;
1044 int clean_orphans;
1045 1131
1046 u64 defrag_trans_start; 1132 u64 defrag_trans_start;
1047 struct btrfs_key defrag_progress; 1133 struct btrfs_key defrag_progress;
@@ -1055,8 +1141,11 @@ struct btrfs_root {
1055 1141
1056 struct list_head root_list; 1142 struct list_head root_list;
1057 1143
1058 spinlock_t list_lock; 1144 spinlock_t orphan_lock;
1059 struct list_head orphan_list; 1145 struct list_head orphan_list;
1146 struct btrfs_block_rsv *orphan_block_rsv;
1147 int orphan_item_inserted;
1148 int orphan_cleanup_state;
1060 1149
1061 spinlock_t inode_lock; 1150 spinlock_t inode_lock;
1062 /* red-black tree that keeps track of in-memory inodes */ 1151 /* red-black tree that keeps track of in-memory inodes */
@@ -1162,6 +1251,9 @@ struct btrfs_root {
1162#define BTRFS_MOUNT_NOSSD (1 << 9) 1251#define BTRFS_MOUNT_NOSSD (1 << 9)
1163#define BTRFS_MOUNT_DISCARD (1 << 10) 1252#define BTRFS_MOUNT_DISCARD (1 << 10)
1164#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) 1253#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
1254#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
1255#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
1256#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
1165 1257
1166#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1258#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1167#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1259#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1182,7 +1274,6 @@ struct btrfs_root {
1182#define BTRFS_INODE_NOATIME (1 << 9) 1274#define BTRFS_INODE_NOATIME (1 << 9)
1183#define BTRFS_INODE_DIRSYNC (1 << 10) 1275#define BTRFS_INODE_DIRSYNC (1 << 10)
1184 1276
1185
1186/* some macros to generate set/get funcs for the struct fields. This 1277/* some macros to generate set/get funcs for the struct fields. This
1187 * assumes there is a lefoo_to_cpu for every type, so lets make a simple 1278 * assumes there is a lefoo_to_cpu for every type, so lets make a simple
1188 * one for u8: 1279 * one for u8:
@@ -1636,6 +1727,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1636 write_eb_member(eb, item, struct btrfs_dir_item, location, key); 1727 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1637} 1728}
1638 1729
1730BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
1731 num_entries, 64);
1732BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
1733 num_bitmaps, 64);
1734BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
1735 generation, 64);
1736
1737static inline void btrfs_free_space_key(struct extent_buffer *eb,
1738 struct btrfs_free_space_header *h,
1739 struct btrfs_disk_key *key)
1740{
1741 read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
1742}
1743
1744static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
1745 struct btrfs_free_space_header *h,
1746 struct btrfs_disk_key *key)
1747{
1748 write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
1749}
1750
1639/* struct btrfs_disk_key */ 1751/* struct btrfs_disk_key */
1640BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, 1752BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1641 objectid, 64); 1753 objectid, 64);
@@ -1800,6 +1912,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1800BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, 1912BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
1801 last_snapshot, 64); 1913 last_snapshot, 64);
1802 1914
1915static inline bool btrfs_root_readonly(struct btrfs_root *root)
1916{
1917 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
1918}
1919
1803/* struct btrfs_super_block */ 1920/* struct btrfs_super_block */
1804 1921
1805BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 1922BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -1842,11 +1959,13 @@ BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
1842BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, 1959BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
1843 compat_flags, 64); 1960 compat_flags, 64);
1844BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, 1961BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
1845 compat_flags, 64); 1962 compat_ro_flags, 64);
1846BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, 1963BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
1847 incompat_flags, 64); 1964 incompat_flags, 64);
1848BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, 1965BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
1849 csum_type, 16); 1966 csum_type, 16);
1967BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
1968 cache_generation, 64);
1850 1969
1851static inline int btrfs_super_csum_size(struct btrfs_super_block *s) 1970static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
1852{ 1971{
@@ -1959,11 +2078,20 @@ static inline struct dentry *fdentry(struct file *file)
1959 return file->f_path.dentry; 2078 return file->f_path.dentry;
1960} 2079}
1961 2080
2081static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2082{
2083 return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2084 (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
2085}
2086
1962/* extent-tree.c */ 2087/* extent-tree.c */
1963void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 2088void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1964int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2089int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1965 struct btrfs_root *root, unsigned long count); 2090 struct btrfs_root *root, unsigned long count);
1966int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); 2091int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
2092int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2093 struct btrfs_root *root, u64 bytenr,
2094 u64 num_bytes, u64 *refs, u64 *flags);
1967int btrfs_pin_extent(struct btrfs_root *root, 2095int btrfs_pin_extent(struct btrfs_root *root,
1968 u64 bytenr, u64 num, int reserved); 2096 u64 bytenr, u64 num, int reserved);
1969int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, 2097int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1983,10 +2111,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
1983 u64 parent, u64 root_objectid, 2111 u64 parent, u64 root_objectid,
1984 struct btrfs_disk_key *key, int level, 2112 struct btrfs_disk_key *key, int level,
1985 u64 hint, u64 empty_size); 2113 u64 hint, u64 empty_size);
1986int btrfs_free_tree_block(struct btrfs_trans_handle *trans, 2114void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
1987 struct btrfs_root *root, 2115 struct btrfs_root *root,
1988 u64 bytenr, u32 blocksize, 2116 struct extent_buffer *buf,
1989 u64 parent, u64 root_objectid, int level); 2117 u64 parent, int last_ref);
1990struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 2118struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
1991 struct btrfs_root *root, 2119 struct btrfs_root *root,
1992 u64 bytenr, u32 blocksize, 2120 u64 bytenr, u32 blocksize,
@@ -2040,27 +2168,57 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2040 u64 size); 2168 u64 size);
2041int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2169int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2042 struct btrfs_root *root, u64 group_start); 2170 struct btrfs_root *root, u64 group_start);
2043int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
2044 struct btrfs_block_group_cache *group);
2045
2046u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2171u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2172u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2047void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2173void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2048void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2174void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2175int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2176void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2177int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2178 struct btrfs_root *root,
2179 int num_items);
2180void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2181 struct btrfs_root *root);
2182int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
2183 struct inode *inode);
2184void btrfs_orphan_release_metadata(struct inode *inode);
2185int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
2186 struct btrfs_pending_snapshot *pending);
2187int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
2188void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
2189int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
2190void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
2191void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2192struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2193void btrfs_free_block_rsv(struct btrfs_root *root,
2194 struct btrfs_block_rsv *rsv);
2195void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
2196 struct btrfs_block_rsv *rsv);
2197int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2198 struct btrfs_root *root,
2199 struct btrfs_block_rsv *block_rsv,
2200 u64 num_bytes);
2201int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
2202 struct btrfs_root *root,
2203 struct btrfs_block_rsv *block_rsv,
2204 u64 min_reserved, int min_factor);
2205int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2206 struct btrfs_block_rsv *dst_rsv,
2207 u64 num_bytes);
2208void btrfs_block_rsv_release(struct btrfs_root *root,
2209 struct btrfs_block_rsv *block_rsv,
2210 u64 num_bytes);
2211int btrfs_set_block_group_ro(struct btrfs_root *root,
2212 struct btrfs_block_group_cache *cache);
2213int btrfs_set_block_group_rw(struct btrfs_root *root,
2214 struct btrfs_block_group_cache *cache);
2215void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
2216u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
2217int btrfs_error_unpin_extent_range(struct btrfs_root *root,
2218 u64 start, u64 end);
2219int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
2220 u64 num_bytes);
2049 2221
2050int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
2051int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
2052int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2053 struct inode *inode, int num_items);
2054int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
2055 struct inode *inode, int num_items);
2056int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2057 u64 bytes);
2058void btrfs_free_reserved_data_space(struct btrfs_root *root,
2059 struct inode *inode, u64 bytes);
2060void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
2061 u64 bytes);
2062void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2063 u64 bytes);
2064/* ctree.c */ 2222/* ctree.c */
2065int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2223int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2066 int level, int *slot); 2224 int level, int *slot);
@@ -2151,7 +2309,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
2151int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); 2309int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
2152int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); 2310int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
2153int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); 2311int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
2154int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref); 2312int btrfs_drop_snapshot(struct btrfs_root *root,
2313 struct btrfs_block_rsv *block_rsv, int update_ref);
2155int btrfs_drop_subtree(struct btrfs_trans_handle *trans, 2314int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2156 struct btrfs_root *root, 2315 struct btrfs_root *root,
2157 struct extent_buffer *node, 2316 struct extent_buffer *node,
@@ -2244,6 +2403,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
2244 struct btrfs_root *root, 2403 struct btrfs_root *root,
2245 const char *name, int name_len, 2404 const char *name, int name_len,
2246 u64 inode_objectid, u64 ref_objectid, u64 *index); 2405 u64 inode_objectid, u64 ref_objectid, u64 *index);
2406struct btrfs_inode_ref *
2407btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
2408 struct btrfs_root *root,
2409 struct btrfs_path *path,
2410 const char *name, int name_len,
2411 u64 inode_objectid, u64 ref_objectid, int mod);
2247int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, 2412int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
2248 struct btrfs_root *root, 2413 struct btrfs_root *root,
2249 struct btrfs_path *path, u64 objectid); 2414 struct btrfs_path *path, u64 objectid);
@@ -2256,6 +2421,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
2256 struct btrfs_root *root, u64 bytenr, u64 len); 2421 struct btrfs_root *root, u64 bytenr, u64 len);
2257int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 2422int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
2258 struct bio *bio, u32 *dst); 2423 struct bio *bio, u32 *dst);
2424int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
2425 struct bio *bio, u64 logical_offset, u32 *dst);
2259int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, 2426int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
2260 struct btrfs_root *root, 2427 struct btrfs_root *root,
2261 u64 objectid, u64 pos, 2428 u64 objectid, u64 pos,
@@ -2310,7 +2477,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2310 u32 min_type); 2477 u32 min_type);
2311 2478
2312int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2479int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2313int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); 2480int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
2481 int sync);
2482int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2483 struct extent_state **cached_state);
2314int btrfs_writepages(struct address_space *mapping, 2484int btrfs_writepages(struct address_space *mapping,
2315 struct writeback_control *wbc); 2485 struct writeback_control *wbc);
2316int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 2486int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2324,18 +2494,18 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
2324 pgoff_t offset, pgoff_t last_index); 2494 pgoff_t offset, pgoff_t last_index);
2325int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2495int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2326int btrfs_readpage(struct file *file, struct page *page); 2496int btrfs_readpage(struct file *file, struct page *page);
2327void btrfs_delete_inode(struct inode *inode); 2497void btrfs_evict_inode(struct inode *inode);
2328void btrfs_put_inode(struct inode *inode); 2498void btrfs_put_inode(struct inode *inode);
2329int btrfs_write_inode(struct inode *inode, int wait); 2499int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2330void btrfs_dirty_inode(struct inode *inode); 2500void btrfs_dirty_inode(struct inode *inode);
2331struct inode *btrfs_alloc_inode(struct super_block *sb); 2501struct inode *btrfs_alloc_inode(struct super_block *sb);
2332void btrfs_destroy_inode(struct inode *inode); 2502void btrfs_destroy_inode(struct inode *inode);
2333void btrfs_drop_inode(struct inode *inode); 2503int btrfs_drop_inode(struct inode *inode);
2334int btrfs_init_cachep(void); 2504int btrfs_init_cachep(void);
2335void btrfs_destroy_cachep(void); 2505void btrfs_destroy_cachep(void);
2336long btrfs_ioctl_trans_end(struct file *file); 2506long btrfs_ioctl_trans_end(struct file *file);
2337struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 2507struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2338 struct btrfs_root *root); 2508 struct btrfs_root *root, int *was_new);
2339int btrfs_commit_write(struct file *file, struct page *page, 2509int btrfs_commit_write(struct file *file, struct page *page,
2340 unsigned from, unsigned to); 2510 unsigned from, unsigned to);
2341struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 2511struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2347,10 +2517,24 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2347int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2517int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2348int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2518int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2349void btrfs_orphan_cleanup(struct btrfs_root *root); 2519void btrfs_orphan_cleanup(struct btrfs_root *root);
2520void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2521 struct btrfs_pending_snapshot *pending,
2522 u64 *bytes_to_reserve);
2523void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2524 struct btrfs_pending_snapshot *pending);
2525void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2526 struct btrfs_root *root);
2350int btrfs_cont_expand(struct inode *inode, loff_t size); 2527int btrfs_cont_expand(struct inode *inode, loff_t size);
2351int btrfs_invalidate_inodes(struct btrfs_root *root); 2528int btrfs_invalidate_inodes(struct btrfs_root *root);
2352void btrfs_add_delayed_iput(struct inode *inode); 2529void btrfs_add_delayed_iput(struct inode *inode);
2353void btrfs_run_delayed_iputs(struct btrfs_root *root); 2530void btrfs_run_delayed_iputs(struct btrfs_root *root);
2531int btrfs_prealloc_file_range(struct inode *inode, int mode,
2532 u64 start, u64 num_bytes, u64 min_size,
2533 loff_t actual_len, u64 *alloc_hint);
2534int btrfs_prealloc_file_range_trans(struct inode *inode,
2535 struct btrfs_trans_handle *trans, int mode,
2536 u64 start, u64 num_bytes, u64 min_size,
2537 loff_t actual_len, u64 *alloc_hint);
2354extern const struct dentry_operations btrfs_dentry_operations; 2538extern const struct dentry_operations btrfs_dentry_operations;
2355 2539
2356/* ioctl.c */ 2540/* ioctl.c */
@@ -2359,7 +2543,7 @@ void btrfs_update_iflags(struct inode *inode);
2359void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 2543void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
2360 2544
2361/* file.c */ 2545/* file.c */
2362int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); 2546int btrfs_sync_file(struct file *file, int datasync);
2363int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2547int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2364 int skip_pinned); 2548 int skip_pinned);
2365int btrfs_check_file(struct btrfs_root *root, struct inode *inode); 2549int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2386,13 +2570,20 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2386ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 2570ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2387 2571
2388/* super.c */ 2572/* super.c */
2389u64 btrfs_parse_size(char *str);
2390int btrfs_parse_options(struct btrfs_root *root, char *options); 2573int btrfs_parse_options(struct btrfs_root *root, char *options);
2391int btrfs_sync_fs(struct super_block *sb, int wait); 2574int btrfs_sync_fs(struct super_block *sb, int wait);
2575void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
2576 unsigned int line, int errno);
2577
2578#define btrfs_std_error(fs_info, errno) \
2579do { \
2580 if ((errno)) \
2581 __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
2582} while (0)
2392 2583
2393/* acl.c */ 2584/* acl.c */
2394#ifdef CONFIG_BTRFS_FS_POSIX_ACL 2585#ifdef CONFIG_BTRFS_FS_POSIX_ACL
2395int btrfs_check_acl(struct inode *inode, int mask); 2586int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
2396#else 2587#else
2397#define btrfs_check_acl NULL 2588#define btrfs_check_acl NULL
2398#endif 2589#endif
@@ -2408,4 +2599,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
2408 struct btrfs_root *root); 2599 struct btrfs_root *root);
2409int btrfs_recover_relocation(struct btrfs_root *root); 2600int btrfs_recover_relocation(struct btrfs_root *root);
2410int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); 2601int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
2602void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
2603 struct btrfs_root *root, struct extent_buffer *buf,
2604 struct extent_buffer *cow);
2605void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
2606 struct btrfs_pending_snapshot *pending,
2607 u64 *bytes_to_reserve);
2608void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
2609 struct btrfs_pending_snapshot *pending);
2411#endif 2610#endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 84e6781413b1..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "delayed-ref.h" 23#include "delayed-ref.h"
@@ -318,107 +319,6 @@ out:
318} 319}
319 320
320/* 321/*
321 * helper function to lookup reference count and flags of extent.
322 *
323 * the head node for delayed ref is used to store the sum of all the
324 * reference count modifications queued up in the rbtree. the head
325 * node may also store the extent flags to set. This way you can check
326 * to see what the reference count and extent flags would be if all of
327 * the delayed refs are not processed.
328 */
329int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
330 struct btrfs_root *root, u64 bytenr,
331 u64 num_bytes, u64 *refs, u64 *flags)
332{
333 struct btrfs_delayed_ref_node *ref;
334 struct btrfs_delayed_ref_head *head;
335 struct btrfs_delayed_ref_root *delayed_refs;
336 struct btrfs_path *path;
337 struct btrfs_extent_item *ei;
338 struct extent_buffer *leaf;
339 struct btrfs_key key;
340 u32 item_size;
341 u64 num_refs;
342 u64 extent_flags;
343 int ret;
344
345 path = btrfs_alloc_path();
346 if (!path)
347 return -ENOMEM;
348
349 key.objectid = bytenr;
350 key.type = BTRFS_EXTENT_ITEM_KEY;
351 key.offset = num_bytes;
352 delayed_refs = &trans->transaction->delayed_refs;
353again:
354 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
355 &key, path, 0, 0);
356 if (ret < 0)
357 goto out;
358
359 if (ret == 0) {
360 leaf = path->nodes[0];
361 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
362 if (item_size >= sizeof(*ei)) {
363 ei = btrfs_item_ptr(leaf, path->slots[0],
364 struct btrfs_extent_item);
365 num_refs = btrfs_extent_refs(leaf, ei);
366 extent_flags = btrfs_extent_flags(leaf, ei);
367 } else {
368#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
369 struct btrfs_extent_item_v0 *ei0;
370 BUG_ON(item_size != sizeof(*ei0));
371 ei0 = btrfs_item_ptr(leaf, path->slots[0],
372 struct btrfs_extent_item_v0);
373 num_refs = btrfs_extent_refs_v0(leaf, ei0);
374 /* FIXME: this isn't correct for data */
375 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
376#else
377 BUG();
378#endif
379 }
380 BUG_ON(num_refs == 0);
381 } else {
382 num_refs = 0;
383 extent_flags = 0;
384 ret = 0;
385 }
386
387 spin_lock(&delayed_refs->lock);
388 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
389 if (ref) {
390 head = btrfs_delayed_node_to_head(ref);
391 if (!mutex_trylock(&head->mutex)) {
392 atomic_inc(&ref->refs);
393 spin_unlock(&delayed_refs->lock);
394
395 btrfs_release_path(root->fs_info->extent_root, path);
396
397 mutex_lock(&head->mutex);
398 mutex_unlock(&head->mutex);
399 btrfs_put_delayed_ref(ref);
400 goto again;
401 }
402 if (head->extent_op && head->extent_op->update_flags)
403 extent_flags |= head->extent_op->flags_to_set;
404 else
405 BUG_ON(num_refs == 0);
406
407 num_refs += ref->ref_mod;
408 mutex_unlock(&head->mutex);
409 }
410 WARN_ON(num_refs == 0);
411 if (refs)
412 *refs = num_refs;
413 if (flags)
414 *flags = extent_flags;
415out:
416 spin_unlock(&delayed_refs->lock);
417 btrfs_free_path(path);
418 return ret;
419}
420
421/*
422 * helper function to update an extent delayed ref in the 322 * helper function to update an extent delayed ref in the
423 * rbtree. existing and update must both have the same 323 * rbtree. existing and update must both have the same
424 * bytenr and parent 324 * bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
167struct btrfs_delayed_ref_head * 167struct btrfs_delayed_ref_head *
168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr); 169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
170int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
171 struct btrfs_root *root, u64 bytenr,
172 u64 num_bytes, u64 *refs, u64 *flags);
173int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans, 170int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
174 u64 bytenr, u64 num_bytes, u64 orig_parent, 171 u64 bytenr, u64 num_bytes, u64 orig_parent,
175 u64 parent, u64 orig_ref_root, u64 ref_root, 172 u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index e9103b3baa49..f0cad5ae5be7 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -427,5 +427,5 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
427 ret = btrfs_truncate_item(trans, root, path, 427 ret = btrfs_truncate_item(trans, root, path,
428 item_len - sub_item_len, 1); 428 item_len - sub_item_len, 1);
429 } 429 }
430 return 0; 430 return ret;
431} 431}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 87b25543d7d1..b531c36455d8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -27,6 +27,8 @@
27#include <linux/kthread.h> 27#include <linux/kthread.h>
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h>
31#include <linux/migrate.h>
30#include "compat.h" 32#include "compat.h"
31#include "ctree.h" 33#include "ctree.h"
32#include "disk-io.h" 34#include "disk-io.h"
@@ -42,8 +44,20 @@
42static struct extent_io_ops btree_extent_io_ops; 44static struct extent_io_ops btree_extent_io_ops;
43static void end_workqueue_fn(struct btrfs_work *work); 45static void end_workqueue_fn(struct btrfs_work *work);
44static void free_fs_root(struct btrfs_root *root); 46static void free_fs_root(struct btrfs_root *root);
45 47static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
46static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); 48 int read_only);
49static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
50static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
51static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
52 struct btrfs_root *root);
53static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
54static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
55static int btrfs_destroy_marked_extents(struct btrfs_root *root,
56 struct extent_io_tree *dirty_pages,
57 int mark);
58static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
59 struct extent_io_tree *pinned_extents);
60static int btrfs_cleanup_transaction(struct btrfs_root *root);
47 61
48/* 62/*
49 * end_io_wq structs are used to do processing in task context when an IO is 63 * end_io_wq structs are used to do processing in task context when an IO is
@@ -75,6 +89,11 @@ struct async_submit_bio {
75 int rw; 89 int rw;
76 int mirror_num; 90 int mirror_num;
77 unsigned long bio_flags; 91 unsigned long bio_flags;
92 /*
93 * bio_offset is optional, can be used if the pages in the bio
94 * can't tell us where in the file the bio should go
95 */
96 u64 bio_offset;
78 struct btrfs_work work; 97 struct btrfs_work work;
79}; 98};
80 99
@@ -263,13 +282,15 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
263static int verify_parent_transid(struct extent_io_tree *io_tree, 282static int verify_parent_transid(struct extent_io_tree *io_tree,
264 struct extent_buffer *eb, u64 parent_transid) 283 struct extent_buffer *eb, u64 parent_transid)
265{ 284{
285 struct extent_state *cached_state = NULL;
266 int ret; 286 int ret;
267 287
268 if (!parent_transid || btrfs_header_generation(eb) == parent_transid) 288 if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
269 return 0; 289 return 0;
270 290
271 lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); 291 lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
272 if (extent_buffer_uptodate(io_tree, eb) && 292 0, &cached_state, GFP_NOFS);
293 if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
273 btrfs_header_generation(eb) == parent_transid) { 294 btrfs_header_generation(eb) == parent_transid) {
274 ret = 0; 295 ret = 0;
275 goto out; 296 goto out;
@@ -282,10 +303,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
282 (unsigned long long)btrfs_header_generation(eb)); 303 (unsigned long long)btrfs_header_generation(eb));
283 } 304 }
284 ret = 1; 305 ret = 1;
285 clear_extent_buffer_uptodate(io_tree, eb); 306 clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
286out: 307out:
287 unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, 308 unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
288 GFP_NOFS); 309 &cached_state, GFP_NOFS);
289 return ret; 310 return ret;
290} 311}
291 312
@@ -332,7 +353,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
332 struct extent_io_tree *tree; 353 struct extent_io_tree *tree;
333 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 354 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
334 u64 found_start; 355 u64 found_start;
335 int found_level;
336 unsigned long len; 356 unsigned long len;
337 struct extent_buffer *eb; 357 struct extent_buffer *eb;
338 int ret; 358 int ret;
@@ -347,9 +367,15 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
347 WARN_ON(len == 0); 367 WARN_ON(len == 0);
348 368
349 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 369 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
370 if (eb == NULL) {
371 WARN_ON(1);
372 goto out;
373 }
350 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, 374 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
351 btrfs_header_generation(eb)); 375 btrfs_header_generation(eb));
352 BUG_ON(ret); 376 BUG_ON(ret);
377 WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
378
353 found_start = btrfs_header_bytenr(eb); 379 found_start = btrfs_header_bytenr(eb);
354 if (found_start != start) { 380 if (found_start != start) {
355 WARN_ON(1); 381 WARN_ON(1);
@@ -363,8 +389,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
363 WARN_ON(1); 389 WARN_ON(1);
364 goto err; 390 goto err;
365 } 391 }
366 found_level = btrfs_header_level(eb);
367
368 csum_tree_block(root, eb, 0); 392 csum_tree_block(root, eb, 0);
369err: 393err:
370 free_extent_buffer(eb); 394 free_extent_buffer(eb);
@@ -421,6 +445,10 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
421 WARN_ON(len == 0); 445 WARN_ON(len == 0);
422 446
423 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 447 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS);
448 if (eb == NULL) {
449 ret = -EIO;
450 goto out;
451 }
424 452
425 found_start = btrfs_header_bytenr(eb); 453 found_start = btrfs_header_bytenr(eb);
426 if (found_start != start) { 454 if (found_start != start) {
@@ -474,10 +502,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
474 end_io_wq->work.func = end_workqueue_fn; 502 end_io_wq->work.func = end_workqueue_fn;
475 end_io_wq->work.flags = 0; 503 end_io_wq->work.flags = 0;
476 504
477 if (bio->bi_rw & (1 << BIO_RW)) { 505 if (bio->bi_rw & REQ_WRITE) {
478 if (end_io_wq->metadata) 506 if (end_io_wq->metadata == 1)
479 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 507 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
480 &end_io_wq->work); 508 &end_io_wq->work);
509 else if (end_io_wq->metadata == 2)
510 btrfs_queue_worker(&fs_info->endio_freespace_worker,
511 &end_io_wq->work);
481 else 512 else
482 btrfs_queue_worker(&fs_info->endio_write_workers, 513 btrfs_queue_worker(&fs_info->endio_write_workers,
483 &end_io_wq->work); 514 &end_io_wq->work);
@@ -491,6 +522,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
491 } 522 }
492} 523}
493 524
525/*
526 * For the metadata arg you want
527 *
528 * 0 - if data
529 * 1 - if normal metadta
530 * 2 - if writing to the free space cache area
531 */
494int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 532int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
495 int metadata) 533 int metadata)
496{ 534{
@@ -527,13 +565,12 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
527 565
528static void run_one_async_start(struct btrfs_work *work) 566static void run_one_async_start(struct btrfs_work *work)
529{ 567{
530 struct btrfs_fs_info *fs_info;
531 struct async_submit_bio *async; 568 struct async_submit_bio *async;
532 569
533 async = container_of(work, struct async_submit_bio, work); 570 async = container_of(work, struct async_submit_bio, work);
534 fs_info = BTRFS_I(async->inode)->root->fs_info;
535 async->submit_bio_start(async->inode, async->rw, async->bio, 571 async->submit_bio_start(async->inode, async->rw, async->bio,
536 async->mirror_num, async->bio_flags); 572 async->mirror_num, async->bio_flags,
573 async->bio_offset);
537} 574}
538 575
539static void run_one_async_done(struct btrfs_work *work) 576static void run_one_async_done(struct btrfs_work *work)
@@ -555,7 +592,8 @@ static void run_one_async_done(struct btrfs_work *work)
555 wake_up(&fs_info->async_submit_wait); 592 wake_up(&fs_info->async_submit_wait);
556 593
557 async->submit_bio_done(async->inode, async->rw, async->bio, 594 async->submit_bio_done(async->inode, async->rw, async->bio,
558 async->mirror_num, async->bio_flags); 595 async->mirror_num, async->bio_flags,
596 async->bio_offset);
559} 597}
560 598
561static void run_one_async_free(struct btrfs_work *work) 599static void run_one_async_free(struct btrfs_work *work)
@@ -569,6 +607,7 @@ static void run_one_async_free(struct btrfs_work *work)
569int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 607int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
570 int rw, struct bio *bio, int mirror_num, 608 int rw, struct bio *bio, int mirror_num,
571 unsigned long bio_flags, 609 unsigned long bio_flags,
610 u64 bio_offset,
572 extent_submit_bio_hook_t *submit_bio_start, 611 extent_submit_bio_hook_t *submit_bio_start,
573 extent_submit_bio_hook_t *submit_bio_done) 612 extent_submit_bio_hook_t *submit_bio_done)
574{ 613{
@@ -591,10 +630,11 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
591 630
592 async->work.flags = 0; 631 async->work.flags = 0;
593 async->bio_flags = bio_flags; 632 async->bio_flags = bio_flags;
633 async->bio_offset = bio_offset;
594 634
595 atomic_inc(&fs_info->nr_async_submits); 635 atomic_inc(&fs_info->nr_async_submits);
596 636
597 if (rw & (1 << BIO_RW_SYNCIO)) 637 if (rw & REQ_SYNC)
598 btrfs_set_work_high_prio(&async->work); 638 btrfs_set_work_high_prio(&async->work);
599 639
600 btrfs_queue_worker(&fs_info->workers, &async->work); 640 btrfs_queue_worker(&fs_info->workers, &async->work);
@@ -626,7 +666,8 @@ static int btree_csum_one_bio(struct bio *bio)
626 666
627static int __btree_submit_bio_start(struct inode *inode, int rw, 667static int __btree_submit_bio_start(struct inode *inode, int rw,
628 struct bio *bio, int mirror_num, 668 struct bio *bio, int mirror_num,
629 unsigned long bio_flags) 669 unsigned long bio_flags,
670 u64 bio_offset)
630{ 671{
631 /* 672 /*
632 * when we're called for a write, we're already in the async 673 * when we're called for a write, we're already in the async
@@ -637,7 +678,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
637} 678}
638 679
639static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 680static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
640 int mirror_num, unsigned long bio_flags) 681 int mirror_num, unsigned long bio_flags,
682 u64 bio_offset)
641{ 683{
642 /* 684 /*
643 * when we're called for a write, we're already in the async 685 * when we're called for a write, we're already in the async
@@ -647,7 +689,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
647} 689}
648 690
649static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 691static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
650 int mirror_num, unsigned long bio_flags) 692 int mirror_num, unsigned long bio_flags,
693 u64 bio_offset)
651{ 694{
652 int ret; 695 int ret;
653 696
@@ -655,7 +698,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
655 bio, 1); 698 bio, 1);
656 BUG_ON(ret); 699 BUG_ON(ret);
657 700
658 if (!(rw & (1 << BIO_RW))) { 701 if (!(rw & REQ_WRITE)) {
659 /* 702 /*
660 * called for a read, do the setup so that checksum validation 703 * called for a read, do the setup so that checksum validation
661 * can happen in the async kernel threads 704 * can happen in the async kernel threads
@@ -670,10 +713,32 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
670 */ 713 */
671 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 714 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
672 inode, rw, bio, mirror_num, 0, 715 inode, rw, bio, mirror_num, 0,
716 bio_offset,
673 __btree_submit_bio_start, 717 __btree_submit_bio_start,
674 __btree_submit_bio_done); 718 __btree_submit_bio_done);
675} 719}
676 720
721#ifdef CONFIG_MIGRATION
722static int btree_migratepage(struct address_space *mapping,
723 struct page *newpage, struct page *page)
724{
725 /*
726 * we can't safely write a btree page from here,
727 * we haven't done the locking hook
728 */
729 if (PageDirty(page))
730 return -EAGAIN;
731 /*
732 * Buffers may be managed in a filesystem specific way.
733 * We must have no buffers or drop them.
734 */
735 if (page_has_private(page) &&
736 !try_to_release_page(page, GFP_KERNEL))
737 return -EAGAIN;
738 return migrate_page(mapping, newpage, page);
739}
740#endif
741
677static int btree_writepage(struct page *page, struct writeback_control *wbc) 742static int btree_writepage(struct page *page, struct writeback_control *wbc)
678{ 743{
679 struct extent_io_tree *tree; 744 struct extent_io_tree *tree;
@@ -688,8 +753,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
688 } 753 }
689 754
690 redirty_page_for_writepage(wbc, page); 755 redirty_page_for_writepage(wbc, page);
691 eb = btrfs_find_tree_block(root, page_offset(page), 756 eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
692 PAGE_CACHE_SIZE);
693 WARN_ON(!eb); 757 WARN_ON(!eb);
694 758
695 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 759 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
@@ -780,6 +844,9 @@ static const struct address_space_operations btree_aops = {
780 .releasepage = btree_releasepage, 844 .releasepage = btree_releasepage,
781 .invalidatepage = btree_invalidatepage, 845 .invalidatepage = btree_invalidatepage,
782 .sync_page = block_sync_page, 846 .sync_page = block_sync_page,
847#ifdef CONFIG_MIGRATION
848 .migratepage = btree_migratepage,
849#endif
783}; 850};
784 851
785int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 852int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -836,12 +903,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
836 u32 blocksize, u64 parent_transid) 903 u32 blocksize, u64 parent_transid)
837{ 904{
838 struct extent_buffer *buf = NULL; 905 struct extent_buffer *buf = NULL;
839 struct inode *btree_inode = root->fs_info->btree_inode;
840 struct extent_io_tree *io_tree;
841 int ret; 906 int ret;
842 907
843 io_tree = &BTRFS_I(btree_inode)->io_tree;
844
845 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 908 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
846 if (!buf) 909 if (!buf)
847 return NULL; 910 return NULL;
@@ -893,7 +956,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
893 root->ref_cows = 0; 956 root->ref_cows = 0;
894 root->track_dirty = 0; 957 root->track_dirty = 0;
895 root->in_radix = 0; 958 root->in_radix = 0;
896 root->clean_orphans = 0; 959 root->orphan_item_inserted = 0;
960 root->orphan_cleanup_state = 0;
897 961
898 root->fs_info = fs_info; 962 root->fs_info = fs_info;
899 root->objectid = objectid; 963 root->objectid = objectid;
@@ -901,14 +965,17 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
901 root->highest_objectid = 0; 965 root->highest_objectid = 0;
902 root->name = NULL; 966 root->name = NULL;
903 root->in_sysfs = 0; 967 root->in_sysfs = 0;
904 root->inode_tree.rb_node = NULL; 968 root->inode_tree = RB_ROOT;
969 root->block_rsv = NULL;
970 root->orphan_block_rsv = NULL;
905 971
906 INIT_LIST_HEAD(&root->dirty_list); 972 INIT_LIST_HEAD(&root->dirty_list);
907 INIT_LIST_HEAD(&root->orphan_list); 973 INIT_LIST_HEAD(&root->orphan_list);
908 INIT_LIST_HEAD(&root->root_list); 974 INIT_LIST_HEAD(&root->root_list);
909 spin_lock_init(&root->node_lock); 975 spin_lock_init(&root->node_lock);
910 spin_lock_init(&root->list_lock); 976 spin_lock_init(&root->orphan_lock);
911 spin_lock_init(&root->inode_lock); 977 spin_lock_init(&root->inode_lock);
978 spin_lock_init(&root->accounting_lock);
912 mutex_init(&root->objectid_mutex); 979 mutex_init(&root->objectid_mutex);
913 mutex_init(&root->log_mutex); 980 mutex_init(&root->log_mutex);
914 init_waitqueue_head(&root->log_writer_wait); 981 init_waitqueue_head(&root->log_writer_wait);
@@ -962,44 +1029,11 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
962 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1029 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
963 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1030 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
964 blocksize, generation); 1031 blocksize, generation);
965 BUG_ON(!root->node); 1032 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
966 root->commit_root = btrfs_root_node(root); 1033 free_extent_buffer(root->node);
967 return 0; 1034 return -EIO;
968}
969
970int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
971 struct btrfs_fs_info *fs_info)
972{
973 struct extent_buffer *eb;
974 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
975 u64 start = 0;
976 u64 end = 0;
977 int ret;
978
979 if (!log_root_tree)
980 return 0;
981
982 while (1) {
983 ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
984 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
985 if (ret)
986 break;
987
988 clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
989 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
990 } 1035 }
991 eb = fs_info->log_root_tree->node; 1036 root->commit_root = btrfs_root_node(root);
992
993 WARN_ON(btrfs_header_level(eb) != 0);
994 WARN_ON(btrfs_header_nritems(eb) != 0);
995
996 ret = btrfs_free_reserved_extent(fs_info->tree_root,
997 eb->start, eb->len);
998 BUG_ON(ret);
999
1000 free_extent_buffer(eb);
1001 kfree(fs_info->log_root_tree);
1002 fs_info->log_root_tree = NULL;
1003 return 0; 1037 return 0;
1004} 1038}
1005 1039
@@ -1133,6 +1167,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1133 } 1167 }
1134 btrfs_free_path(path); 1168 btrfs_free_path(path);
1135 if (ret) { 1169 if (ret) {
1170 kfree(root);
1136 if (ret > 0) 1171 if (ret > 0)
1137 ret = -ENOENT; 1172 ret = -ENOENT;
1138 return ERR_PTR(ret); 1173 return ERR_PTR(ret);
@@ -1190,19 +1225,23 @@ again:
1190 if (root) 1225 if (root)
1191 return root; 1226 return root;
1192 1227
1193 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1194 if (ret == 0)
1195 ret = -ENOENT;
1196 if (ret < 0)
1197 return ERR_PTR(ret);
1198
1199 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); 1228 root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
1200 if (IS_ERR(root)) 1229 if (IS_ERR(root))
1201 return root; 1230 return root;
1202 1231
1203 WARN_ON(btrfs_root_refs(&root->root_item) == 0);
1204 set_anon_super(&root->anon_super, NULL); 1232 set_anon_super(&root->anon_super, NULL);
1205 1233
1234 if (btrfs_root_refs(&root->root_item) == 0) {
1235 ret = -ENOENT;
1236 goto fail;
1237 }
1238
1239 ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
1240 if (ret < 0)
1241 goto fail;
1242 if (ret == 0)
1243 root->orphan_item_inserted = 1;
1244
1206 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); 1245 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
1207 if (ret) 1246 if (ret)
1208 goto fail; 1247 goto fail;
@@ -1211,10 +1250,9 @@ again:
1211 ret = radix_tree_insert(&fs_info->fs_roots_radix, 1250 ret = radix_tree_insert(&fs_info->fs_roots_radix,
1212 (unsigned long)root->root_key.objectid, 1251 (unsigned long)root->root_key.objectid,
1213 root); 1252 root);
1214 if (ret == 0) { 1253 if (ret == 0)
1215 root->in_radix = 1; 1254 root->in_radix = 1;
1216 root->clean_orphans = 1; 1255
1217 }
1218 spin_unlock(&fs_info->fs_roots_radix_lock); 1256 spin_unlock(&fs_info->fs_roots_radix_lock);
1219 radix_tree_preload_end(); 1257 radix_tree_preload_end();
1220 if (ret) { 1258 if (ret) {
@@ -1372,19 +1410,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1372{ 1410{
1373 int err; 1411 int err;
1374 1412
1375 bdi->name = "btrfs";
1376 bdi->capabilities = BDI_CAP_MAP_COPY; 1413 bdi->capabilities = BDI_CAP_MAP_COPY;
1377 err = bdi_init(bdi); 1414 err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
1378 if (err) 1415 if (err)
1379 return err; 1416 return err;
1380 1417
1381 err = bdi_register(bdi, NULL, "btrfs-%d",
1382 atomic_inc_return(&btrfs_bdi_num));
1383 if (err) {
1384 bdi_destroy(bdi);
1385 return err;
1386 }
1387
1388 bdi->ra_pages = default_backing_dev_info.ra_pages; 1418 bdi->ra_pages = default_backing_dev_info.ra_pages;
1389 bdi->unplug_io_fn = btrfs_unplug_io_fn; 1419 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1390 bdi->unplug_io_data = info; 1420 bdi->unplug_io_data = info;
@@ -1400,7 +1430,6 @@ static int bio_ready_for_csum(struct bio *bio)
1400 u64 start = 0; 1430 u64 start = 0;
1401 struct page *page; 1431 struct page *page;
1402 struct extent_io_tree *io_tree = NULL; 1432 struct extent_io_tree *io_tree = NULL;
1403 struct btrfs_fs_info *info = NULL;
1404 struct bio_vec *bvec; 1433 struct bio_vec *bvec;
1405 int i; 1434 int i;
1406 int ret; 1435 int ret;
@@ -1419,7 +1448,6 @@ static int bio_ready_for_csum(struct bio *bio)
1419 buf_len = page->private >> 2; 1448 buf_len = page->private >> 2;
1420 start = page_offset(page) + bvec->bv_offset; 1449 start = page_offset(page) + bvec->bv_offset;
1421 io_tree = &BTRFS_I(page->mapping->host)->io_tree; 1450 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1422 info = BTRFS_I(page->mapping->host)->root->fs_info;
1423 } 1451 }
1424 /* are we fully contained in this bio? */ 1452 /* are we fully contained in this bio? */
1425 if (buf_len <= length) 1453 if (buf_len <= length)
@@ -1450,7 +1478,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
1450 * ram and up to date before trying to verify things. For 1478 * ram and up to date before trying to verify things. For
1451 * blocksize <= pagesize, it is basically a noop 1479 * blocksize <= pagesize, it is basically a noop
1452 */ 1480 */
1453 if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata && 1481 if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
1454 !bio_ready_for_csum(bio)) { 1482 !bio_ready_for_csum(bio)) {
1455 btrfs_queue_worker(&fs_info->endio_meta_workers, 1483 btrfs_queue_worker(&fs_info->endio_meta_workers,
1456 &end_io_wq->work); 1484 &end_io_wq->work);
@@ -1468,10 +1496,6 @@ static int cleaner_kthread(void *arg)
1468 struct btrfs_root *root = arg; 1496 struct btrfs_root *root = arg;
1469 1497
1470 do { 1498 do {
1471 smp_mb();
1472 if (root->fs_info->closing)
1473 break;
1474
1475 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1499 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1476 1500
1477 if (!(root->fs_info->sb->s_flags & MS_RDONLY) && 1501 if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1484,11 +1508,9 @@ static int cleaner_kthread(void *arg)
1484 if (freezing(current)) { 1508 if (freezing(current)) {
1485 refrigerator(); 1509 refrigerator();
1486 } else { 1510 } else {
1487 smp_mb();
1488 if (root->fs_info->closing)
1489 break;
1490 set_current_state(TASK_INTERRUPTIBLE); 1511 set_current_state(TASK_INTERRUPTIBLE);
1491 schedule(); 1512 if (!kthread_should_stop())
1513 schedule();
1492 __set_current_state(TASK_RUNNING); 1514 __set_current_state(TASK_RUNNING);
1493 } 1515 }
1494 } while (!kthread_should_stop()); 1516 } while (!kthread_should_stop());
@@ -1500,36 +1522,40 @@ static int transaction_kthread(void *arg)
1500 struct btrfs_root *root = arg; 1522 struct btrfs_root *root = arg;
1501 struct btrfs_trans_handle *trans; 1523 struct btrfs_trans_handle *trans;
1502 struct btrfs_transaction *cur; 1524 struct btrfs_transaction *cur;
1525 u64 transid;
1503 unsigned long now; 1526 unsigned long now;
1504 unsigned long delay; 1527 unsigned long delay;
1505 int ret; 1528 int ret;
1506 1529
1507 do { 1530 do {
1508 smp_mb();
1509 if (root->fs_info->closing)
1510 break;
1511
1512 delay = HZ * 30; 1531 delay = HZ * 30;
1513 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1532 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1514 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1533 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1515 1534
1516 mutex_lock(&root->fs_info->trans_mutex); 1535 spin_lock(&root->fs_info->new_trans_lock);
1517 cur = root->fs_info->running_transaction; 1536 cur = root->fs_info->running_transaction;
1518 if (!cur) { 1537 if (!cur) {
1519 mutex_unlock(&root->fs_info->trans_mutex); 1538 spin_unlock(&root->fs_info->new_trans_lock);
1520 goto sleep; 1539 goto sleep;
1521 } 1540 }
1522 1541
1523 now = get_seconds(); 1542 now = get_seconds();
1524 if (now < cur->start_time || now - cur->start_time < 30) { 1543 if (!cur->blocked &&
1525 mutex_unlock(&root->fs_info->trans_mutex); 1544 (now < cur->start_time || now - cur->start_time < 30)) {
1545 spin_unlock(&root->fs_info->new_trans_lock);
1526 delay = HZ * 5; 1546 delay = HZ * 5;
1527 goto sleep; 1547 goto sleep;
1528 } 1548 }
1529 mutex_unlock(&root->fs_info->trans_mutex); 1549 transid = cur->transid;
1530 trans = btrfs_start_transaction(root, 1); 1550 spin_unlock(&root->fs_info->new_trans_lock);
1531 ret = btrfs_commit_transaction(trans, root);
1532 1551
1552 trans = btrfs_join_transaction(root, 1);
1553 if (transid == trans->transid) {
1554 ret = btrfs_commit_transaction(trans, root);
1555 BUG_ON(ret);
1556 } else {
1557 btrfs_end_transaction(trans, root);
1558 }
1533sleep: 1559sleep:
1534 wake_up_process(root->fs_info->cleaner_kthread); 1560 wake_up_process(root->fs_info->cleaner_kthread);
1535 mutex_unlock(&root->fs_info->transaction_kthread_mutex); 1561 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1537,10 +1563,10 @@ sleep:
1537 if (freezing(current)) { 1563 if (freezing(current)) {
1538 refrigerator(); 1564 refrigerator();
1539 } else { 1565 } else {
1540 if (root->fs_info->closing)
1541 break;
1542 set_current_state(TASK_INTERRUPTIBLE); 1566 set_current_state(TASK_INTERRUPTIBLE);
1543 schedule_timeout(delay); 1567 if (!kthread_should_stop() &&
1568 !btrfs_transaction_blocked(root->fs_info))
1569 schedule_timeout(delay);
1544 __set_current_state(TASK_RUNNING); 1570 __set_current_state(TASK_RUNNING);
1545 } 1571 }
1546 } while (!kthread_should_stop()); 1572 } while (!kthread_should_stop());
@@ -1564,10 +1590,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1564 GFP_NOFS); 1590 GFP_NOFS);
1565 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), 1591 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1566 GFP_NOFS); 1592 GFP_NOFS);
1567 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), 1593 struct btrfs_root *tree_root = btrfs_sb(sb);
1568 GFP_NOFS); 1594 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1569 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1570 GFP_NOFS);
1571 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1595 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1572 GFP_NOFS); 1596 GFP_NOFS);
1573 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1597 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
@@ -1627,12 +1651,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1627 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1651 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1628 INIT_LIST_HEAD(&fs_info->space_info); 1652 INIT_LIST_HEAD(&fs_info->space_info);
1629 btrfs_mapping_init(&fs_info->mapping_tree); 1653 btrfs_mapping_init(&fs_info->mapping_tree);
1654 btrfs_init_block_rsv(&fs_info->global_block_rsv);
1655 btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
1656 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1657 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1658 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1659 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
1660 mutex_init(&fs_info->durable_block_rsv_mutex);
1630 atomic_set(&fs_info->nr_async_submits, 0); 1661 atomic_set(&fs_info->nr_async_submits, 0);
1631 atomic_set(&fs_info->async_delalloc_pages, 0); 1662 atomic_set(&fs_info->async_delalloc_pages, 0);
1632 atomic_set(&fs_info->async_submit_draining, 0); 1663 atomic_set(&fs_info->async_submit_draining, 0);
1633 atomic_set(&fs_info->nr_async_bios, 0); 1664 atomic_set(&fs_info->nr_async_bios, 0);
1634 fs_info->sb = sb; 1665 fs_info->sb = sb;
1635 fs_info->max_extent = (u64)-1;
1636 fs_info->max_inline = 8192 * 1024; 1666 fs_info->max_inline = 8192 * 1024;
1637 fs_info->metadata_ratio = 0; 1667 fs_info->metadata_ratio = 0;
1638 1668
@@ -1673,7 +1703,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1673 insert_inode_hash(fs_info->btree_inode); 1703 insert_inode_hash(fs_info->btree_inode);
1674 1704
1675 spin_lock_init(&fs_info->block_group_cache_lock); 1705 spin_lock_init(&fs_info->block_group_cache_lock);
1676 fs_info->block_group_cache_tree.rb_node = NULL; 1706 fs_info->block_group_cache_tree = RB_ROOT;
1677 1707
1678 extent_io_tree_init(&fs_info->freed_extents[0], 1708 extent_io_tree_init(&fs_info->freed_extents[0],
1679 fs_info->btree_inode->i_mapping, GFP_NOFS); 1709 fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -1699,15 +1729,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1699 1729
1700 init_waitqueue_head(&fs_info->transaction_throttle); 1730 init_waitqueue_head(&fs_info->transaction_throttle);
1701 init_waitqueue_head(&fs_info->transaction_wait); 1731 init_waitqueue_head(&fs_info->transaction_wait);
1732 init_waitqueue_head(&fs_info->transaction_blocked_wait);
1702 init_waitqueue_head(&fs_info->async_submit_wait); 1733 init_waitqueue_head(&fs_info->async_submit_wait);
1703 1734
1704 __setup_root(4096, 4096, 4096, 4096, tree_root, 1735 __setup_root(4096, 4096, 4096, 4096, tree_root,
1705 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1736 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1706 1737
1707
1708 bh = btrfs_read_dev_super(fs_devices->latest_bdev); 1738 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1709 if (!bh) 1739 if (!bh) {
1740 err = -EINVAL;
1710 goto fail_iput; 1741 goto fail_iput;
1742 }
1711 1743
1712 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 1744 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1713 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 1745 memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1720,6 +1752,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1720 if (!btrfs_super_root(disk_super)) 1752 if (!btrfs_super_root(disk_super))
1721 goto fail_iput; 1753 goto fail_iput;
1722 1754
1755 /* check FS state, whether FS is broken. */
1756 fs_info->fs_state |= btrfs_super_flags(disk_super);
1757
1758 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1759
1723 ret = btrfs_parse_options(tree_root, options); 1760 ret = btrfs_parse_options(tree_root, options);
1724 if (ret) { 1761 if (ret) {
1725 err = ret; 1762 err = ret;
@@ -1737,10 +1774,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1737 } 1774 }
1738 1775
1739 features = btrfs_super_incompat_flags(disk_super); 1776 features = btrfs_super_incompat_flags(disk_super);
1740 if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { 1777 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
1741 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; 1778 if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
1742 btrfs_set_super_incompat_flags(disk_super, features); 1779 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1743 } 1780 btrfs_set_super_incompat_flags(disk_super, features);
1744 1781
1745 features = btrfs_super_compat_ro_flags(disk_super) & 1782 features = btrfs_super_compat_ro_flags(disk_super) &
1746 ~BTRFS_FEATURE_COMPAT_RO_SUPP; 1783 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
@@ -1767,9 +1804,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1767 min_t(u64, fs_devices->num_devices, 1804 min_t(u64, fs_devices->num_devices,
1768 fs_info->thread_pool_size), 1805 fs_info->thread_pool_size),
1769 &fs_info->generic_worker); 1806 &fs_info->generic_worker);
1770 btrfs_init_workers(&fs_info->enospc_workers, "enospc",
1771 fs_info->thread_pool_size,
1772 &fs_info->generic_worker);
1773 1807
1774 /* a higher idle thresh on the submit workers makes it much more 1808 /* a higher idle thresh on the submit workers makes it much more
1775 * likely that bios will be send down in a sane order to the 1809 * likely that bios will be send down in a sane order to the
@@ -1797,6 +1831,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1797 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 1831 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1798 fs_info->thread_pool_size, 1832 fs_info->thread_pool_size,
1799 &fs_info->generic_worker); 1833 &fs_info->generic_worker);
1834 btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
1835 1, &fs_info->generic_worker);
1800 1836
1801 /* 1837 /*
1802 * endios are largely parallel and should have a very 1838 * endios are largely parallel and should have a very
@@ -1817,7 +1853,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1817 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 1853 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1818 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 1854 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1819 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1855 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1820 btrfs_start_workers(&fs_info->enospc_workers, 1); 1856 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1821 1857
1822 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1858 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1823 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1859 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1920,17 +1956,22 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1920 1956
1921 csum_root->track_dirty = 1; 1957 csum_root->track_dirty = 1;
1922 1958
1923 btrfs_read_block_groups(extent_root);
1924
1925 fs_info->generation = generation; 1959 fs_info->generation = generation;
1926 fs_info->last_trans_committed = generation; 1960 fs_info->last_trans_committed = generation;
1927 fs_info->data_alloc_profile = (u64)-1; 1961 fs_info->data_alloc_profile = (u64)-1;
1928 fs_info->metadata_alloc_profile = (u64)-1; 1962 fs_info->metadata_alloc_profile = (u64)-1;
1929 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1963 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1964
1965 ret = btrfs_read_block_groups(extent_root);
1966 if (ret) {
1967 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
1968 goto fail_block_groups;
1969 }
1970
1930 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, 1971 fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
1931 "btrfs-cleaner"); 1972 "btrfs-cleaner");
1932 if (IS_ERR(fs_info->cleaner_kthread)) 1973 if (IS_ERR(fs_info->cleaner_kthread))
1933 goto fail_csum_root; 1974 goto fail_block_groups;
1934 1975
1935 fs_info->transaction_kthread = kthread_run(transaction_kthread, 1976 fs_info->transaction_kthread = kthread_run(transaction_kthread,
1936 tree_root, 1977 tree_root,
@@ -1946,7 +1987,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1946 btrfs_set_opt(fs_info->mount_opt, SSD); 1987 btrfs_set_opt(fs_info->mount_opt, SSD);
1947 } 1988 }
1948 1989
1949 if (btrfs_super_log_root(disk_super) != 0) { 1990 /* do not make disk changes in broken FS */
1991 if (btrfs_super_log_root(disk_super) != 0 &&
1992 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
1950 u64 bytenr = btrfs_super_log_root(disk_super); 1993 u64 bytenr = btrfs_super_log_root(disk_super);
1951 1994
1952 if (fs_devices->rw_devices == 0) { 1995 if (fs_devices->rw_devices == 0) {
@@ -1959,8 +2002,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1959 btrfs_level_size(tree_root, 2002 btrfs_level_size(tree_root,
1960 btrfs_super_log_root_level(disk_super)); 2003 btrfs_super_log_root_level(disk_super));
1961 2004
1962 log_tree_root = kzalloc(sizeof(struct btrfs_root), 2005 log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1963 GFP_NOFS); 2006 if (!log_tree_root) {
2007 err = -ENOMEM;
2008 goto fail_trans_kthread;
2009 }
1964 2010
1965 __setup_root(nodesize, leafsize, sectorsize, stripesize, 2011 __setup_root(nodesize, leafsize, sectorsize, stripesize,
1966 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); 2012 log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1981,8 +2027,16 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1981 BUG_ON(ret); 2027 BUG_ON(ret);
1982 2028
1983 if (!(sb->s_flags & MS_RDONLY)) { 2029 if (!(sb->s_flags & MS_RDONLY)) {
1984 ret = btrfs_recover_relocation(tree_root); 2030 ret = btrfs_cleanup_fs_roots(fs_info);
1985 BUG_ON(ret); 2031 BUG_ON(ret);
2032
2033 ret = btrfs_recover_relocation(tree_root);
2034 if (ret < 0) {
2035 printk(KERN_WARNING
2036 "btrfs: failed to recover relocation\n");
2037 err = -EINVAL;
2038 goto fail_trans_kthread;
2039 }
1986 } 2040 }
1987 2041
1988 location.objectid = BTRFS_FS_TREE_OBJECTID; 2042 location.objectid = BTRFS_FS_TREE_OBJECTID;
@@ -1992,10 +2046,15 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1992 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); 2046 fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
1993 if (!fs_info->fs_root) 2047 if (!fs_info->fs_root)
1994 goto fail_trans_kthread; 2048 goto fail_trans_kthread;
2049 if (IS_ERR(fs_info->fs_root)) {
2050 err = PTR_ERR(fs_info->fs_root);
2051 goto fail_trans_kthread;
2052 }
1995 2053
1996 if (!(sb->s_flags & MS_RDONLY)) { 2054 if (!(sb->s_flags & MS_RDONLY)) {
1997 down_read(&fs_info->cleanup_work_sem); 2055 down_read(&fs_info->cleanup_work_sem);
1998 btrfs_orphan_cleanup(fs_info->fs_root); 2056 btrfs_orphan_cleanup(fs_info->fs_root);
2057 btrfs_orphan_cleanup(fs_info->tree_root);
1999 up_read(&fs_info->cleanup_work_sem); 2058 up_read(&fs_info->cleanup_work_sem);
2000 } 2059 }
2001 2060
@@ -2013,7 +2072,8 @@ fail_cleaner:
2013 filemap_write_and_wait(fs_info->btree_inode->i_mapping); 2072 filemap_write_and_wait(fs_info->btree_inode->i_mapping);
2014 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2073 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2015 2074
2016fail_csum_root: 2075fail_block_groups:
2076 btrfs_free_block_groups(fs_info);
2017 free_extent_buffer(csum_root->node); 2077 free_extent_buffer(csum_root->node);
2018 free_extent_buffer(csum_root->commit_root); 2078 free_extent_buffer(csum_root->commit_root);
2019fail_dev_root: 2079fail_dev_root:
@@ -2037,8 +2097,8 @@ fail_sb_buffer:
2037 btrfs_stop_workers(&fs_info->endio_meta_workers); 2097 btrfs_stop_workers(&fs_info->endio_meta_workers);
2038 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2098 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2039 btrfs_stop_workers(&fs_info->endio_write_workers); 2099 btrfs_stop_workers(&fs_info->endio_write_workers);
2100 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2040 btrfs_stop_workers(&fs_info->submit_workers); 2101 btrfs_stop_workers(&fs_info->submit_workers);
2041 btrfs_stop_workers(&fs_info->enospc_workers);
2042fail_iput: 2102fail_iput:
2043 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2103 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2044 iput(fs_info->btree_inode); 2104 iput(fs_info->btree_inode);
@@ -2066,7 +2126,7 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2066 if (uptodate) { 2126 if (uptodate) {
2067 set_buffer_uptodate(bh); 2127 set_buffer_uptodate(bh);
2068 } else { 2128 } else {
2069 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 2129 if (printk_ratelimit()) {
2070 printk(KERN_WARNING "lost page write due to " 2130 printk(KERN_WARNING "lost page write due to "
2071 "I/O error on %s\n", 2131 "I/O error on %s\n",
2072 bdevname(bh->b_bdev, b)); 2132 bdevname(bh->b_bdev, b));
@@ -2203,21 +2263,10 @@ static int write_dev_supers(struct btrfs_device *device,
2203 bh->b_end_io = btrfs_end_buffer_write_sync; 2263 bh->b_end_io = btrfs_end_buffer_write_sync;
2204 } 2264 }
2205 2265
2206 if (i == last_barrier && do_barriers && device->barriers) { 2266 if (i == last_barrier && do_barriers)
2207 ret = submit_bh(WRITE_BARRIER, bh); 2267 ret = submit_bh(WRITE_FLUSH_FUA, bh);
2208 if (ret == -EOPNOTSUPP) { 2268 else
2209 printk("btrfs: disabling barriers on dev %s\n",
2210 device->name);
2211 set_buffer_uptodate(bh);
2212 device->barriers = 0;
2213 /* one reference for submit_bh */
2214 get_bh(bh);
2215 lock_buffer(bh);
2216 ret = submit_bh(WRITE_SYNC, bh);
2217 }
2218 } else {
2219 ret = submit_bh(WRITE_SYNC, bh); 2269 ret = submit_bh(WRITE_SYNC, bh);
2220 }
2221 2270
2222 if (ret) 2271 if (ret)
2223 errors++; 2272 errors++;
@@ -2403,11 +2452,11 @@ int btrfs_commit_super(struct btrfs_root *root)
2403 down_write(&root->fs_info->cleanup_work_sem); 2452 down_write(&root->fs_info->cleanup_work_sem);
2404 up_write(&root->fs_info->cleanup_work_sem); 2453 up_write(&root->fs_info->cleanup_work_sem);
2405 2454
2406 trans = btrfs_start_transaction(root, 1); 2455 trans = btrfs_join_transaction(root, 1);
2407 ret = btrfs_commit_transaction(trans, root); 2456 ret = btrfs_commit_transaction(trans, root);
2408 BUG_ON(ret); 2457 BUG_ON(ret);
2409 /* run commit again to drop the original snapshot */ 2458 /* run commit again to drop the original snapshot */
2410 trans = btrfs_start_transaction(root, 1); 2459 trans = btrfs_join_transaction(root, 1);
2411 btrfs_commit_transaction(trans, root); 2460 btrfs_commit_transaction(trans, root);
2412 ret = btrfs_write_and_wait_transaction(NULL, root); 2461 ret = btrfs_write_and_wait_transaction(NULL, root);
2413 BUG_ON(ret); 2462 BUG_ON(ret);
@@ -2424,15 +2473,36 @@ int close_ctree(struct btrfs_root *root)
2424 fs_info->closing = 1; 2473 fs_info->closing = 1;
2425 smp_mb(); 2474 smp_mb();
2426 2475
2427 kthread_stop(root->fs_info->transaction_kthread); 2476 btrfs_put_block_group_cache(fs_info);
2428 kthread_stop(root->fs_info->cleaner_kthread);
2429 2477
2478 /*
2479 * Here come 2 situations when btrfs is broken to flip readonly:
2480 *
2481 * 1. when btrfs flips readonly somewhere else before
2482 * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
2483 * and btrfs will skip to write sb directly to keep
2484 * ERROR state on disk.
2485 *
2486 * 2. when btrfs flips readonly just in btrfs_commit_super,
2487 * and in such case, btrfs cannnot write sb via btrfs_commit_super,
2488 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
2489 * btrfs will cleanup all FS resources first and write sb then.
2490 */
2430 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2491 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2431 ret = btrfs_commit_super(root); 2492 ret = btrfs_commit_super(root);
2432 if (ret) 2493 if (ret)
2433 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2494 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2434 } 2495 }
2435 2496
2497 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
2498 ret = btrfs_error_commit_super(root);
2499 if (ret)
2500 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2501 }
2502
2503 kthread_stop(root->fs_info->transaction_kthread);
2504 kthread_stop(root->fs_info->cleaner_kthread);
2505
2436 fs_info->closing = 2; 2506 fs_info->closing = 2;
2437 smp_mb(); 2507 smp_mb();
2438 2508
@@ -2470,8 +2540,8 @@ int close_ctree(struct btrfs_root *root)
2470 btrfs_stop_workers(&fs_info->endio_meta_workers); 2540 btrfs_stop_workers(&fs_info->endio_meta_workers);
2471 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2541 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2472 btrfs_stop_workers(&fs_info->endio_write_workers); 2542 btrfs_stop_workers(&fs_info->endio_write_workers);
2543 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2473 btrfs_stop_workers(&fs_info->submit_workers); 2544 btrfs_stop_workers(&fs_info->submit_workers);
2474 btrfs_stop_workers(&fs_info->enospc_workers);
2475 2545
2476 btrfs_close_devices(fs_info->fs_devices); 2546 btrfs_close_devices(fs_info->fs_devices);
2477 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2547 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2492,7 +2562,8 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
2492 int ret; 2562 int ret;
2493 struct inode *btree_inode = buf->first_page->mapping->host; 2563 struct inode *btree_inode = buf->first_page->mapping->host;
2494 2564
2495 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); 2565 ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
2566 NULL);
2496 if (!ret) 2567 if (!ret)
2497 return ret; 2568 return ret;
2498 2569
@@ -2600,6 +2671,352 @@ out:
2600 return 0; 2671 return 0;
2601} 2672}
2602 2673
2674static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
2675 int read_only)
2676{
2677 if (read_only)
2678 return;
2679
2680 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2681 printk(KERN_WARNING "warning: mount fs with errors, "
2682 "running btrfsck is recommended\n");
2683}
2684
2685int btrfs_error_commit_super(struct btrfs_root *root)
2686{
2687 int ret;
2688
2689 mutex_lock(&root->fs_info->cleaner_mutex);
2690 btrfs_run_delayed_iputs(root);
2691 mutex_unlock(&root->fs_info->cleaner_mutex);
2692
2693 down_write(&root->fs_info->cleanup_work_sem);
2694 up_write(&root->fs_info->cleanup_work_sem);
2695
2696 /* cleanup FS via transaction */
2697 btrfs_cleanup_transaction(root);
2698
2699 ret = write_ctree_super(NULL, root, 0);
2700
2701 return ret;
2702}
2703
2704static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
2705{
2706 struct btrfs_inode *btrfs_inode;
2707 struct list_head splice;
2708
2709 INIT_LIST_HEAD(&splice);
2710
2711 mutex_lock(&root->fs_info->ordered_operations_mutex);
2712 spin_lock(&root->fs_info->ordered_extent_lock);
2713
2714 list_splice_init(&root->fs_info->ordered_operations, &splice);
2715 while (!list_empty(&splice)) {
2716 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2717 ordered_operations);
2718
2719 list_del_init(&btrfs_inode->ordered_operations);
2720
2721 btrfs_invalidate_inodes(btrfs_inode->root);
2722 }
2723
2724 spin_unlock(&root->fs_info->ordered_extent_lock);
2725 mutex_unlock(&root->fs_info->ordered_operations_mutex);
2726
2727 return 0;
2728}
2729
2730static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
2731{
2732 struct list_head splice;
2733 struct btrfs_ordered_extent *ordered;
2734 struct inode *inode;
2735
2736 INIT_LIST_HEAD(&splice);
2737
2738 spin_lock(&root->fs_info->ordered_extent_lock);
2739
2740 list_splice_init(&root->fs_info->ordered_extents, &splice);
2741 while (!list_empty(&splice)) {
2742 ordered = list_entry(splice.next, struct btrfs_ordered_extent,
2743 root_extent_list);
2744
2745 list_del_init(&ordered->root_extent_list);
2746 atomic_inc(&ordered->refs);
2747
2748 /* the inode may be getting freed (in sys_unlink path). */
2749 inode = igrab(ordered->inode);
2750
2751 spin_unlock(&root->fs_info->ordered_extent_lock);
2752 if (inode)
2753 iput(inode);
2754
2755 atomic_set(&ordered->refs, 1);
2756 btrfs_put_ordered_extent(ordered);
2757
2758 spin_lock(&root->fs_info->ordered_extent_lock);
2759 }
2760
2761 spin_unlock(&root->fs_info->ordered_extent_lock);
2762
2763 return 0;
2764}
2765
2766static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
2767 struct btrfs_root *root)
2768{
2769 struct rb_node *node;
2770 struct btrfs_delayed_ref_root *delayed_refs;
2771 struct btrfs_delayed_ref_node *ref;
2772 int ret = 0;
2773
2774 delayed_refs = &trans->delayed_refs;
2775
2776 spin_lock(&delayed_refs->lock);
2777 if (delayed_refs->num_entries == 0) {
2778 printk(KERN_INFO "delayed_refs has NO entry\n");
2779 return ret;
2780 }
2781
2782 node = rb_first(&delayed_refs->root);
2783 while (node) {
2784 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2785 node = rb_next(node);
2786
2787 ref->in_tree = 0;
2788 rb_erase(&ref->rb_node, &delayed_refs->root);
2789 delayed_refs->num_entries--;
2790
2791 atomic_set(&ref->refs, 1);
2792 if (btrfs_delayed_ref_is_head(ref)) {
2793 struct btrfs_delayed_ref_head *head;
2794
2795 head = btrfs_delayed_node_to_head(ref);
2796 mutex_lock(&head->mutex);
2797 kfree(head->extent_op);
2798 delayed_refs->num_heads--;
2799 if (list_empty(&head->cluster))
2800 delayed_refs->num_heads_ready--;
2801 list_del_init(&head->cluster);
2802 mutex_unlock(&head->mutex);
2803 }
2804
2805 spin_unlock(&delayed_refs->lock);
2806 btrfs_put_delayed_ref(ref);
2807
2808 cond_resched();
2809 spin_lock(&delayed_refs->lock);
2810 }
2811
2812 spin_unlock(&delayed_refs->lock);
2813
2814 return ret;
2815}
2816
2817static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
2818{
2819 struct btrfs_pending_snapshot *snapshot;
2820 struct list_head splice;
2821
2822 INIT_LIST_HEAD(&splice);
2823
2824 list_splice_init(&t->pending_snapshots, &splice);
2825
2826 while (!list_empty(&splice)) {
2827 snapshot = list_entry(splice.next,
2828 struct btrfs_pending_snapshot,
2829 list);
2830
2831 list_del_init(&snapshot->list);
2832
2833 kfree(snapshot);
2834 }
2835
2836 return 0;
2837}
2838
2839static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
2840{
2841 struct btrfs_inode *btrfs_inode;
2842 struct list_head splice;
2843
2844 INIT_LIST_HEAD(&splice);
2845
2846 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
2847
2848 spin_lock(&root->fs_info->delalloc_lock);
2849
2850 while (!list_empty(&splice)) {
2851 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2852 delalloc_inodes);
2853
2854 list_del_init(&btrfs_inode->delalloc_inodes);
2855
2856 btrfs_invalidate_inodes(btrfs_inode->root);
2857 }
2858
2859 spin_unlock(&root->fs_info->delalloc_lock);
2860
2861 return 0;
2862}
2863
2864static int btrfs_destroy_marked_extents(struct btrfs_root *root,
2865 struct extent_io_tree *dirty_pages,
2866 int mark)
2867{
2868 int ret;
2869 struct page *page;
2870 struct inode *btree_inode = root->fs_info->btree_inode;
2871 struct extent_buffer *eb;
2872 u64 start = 0;
2873 u64 end;
2874 u64 offset;
2875 unsigned long index;
2876
2877 while (1) {
2878 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
2879 mark);
2880 if (ret)
2881 break;
2882
2883 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
2884 while (start <= end) {
2885 index = start >> PAGE_CACHE_SHIFT;
2886 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
2887 page = find_get_page(btree_inode->i_mapping, index);
2888 if (!page)
2889 continue;
2890 offset = page_offset(page);
2891
2892 spin_lock(&dirty_pages->buffer_lock);
2893 eb = radix_tree_lookup(
2894 &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
2895 offset >> PAGE_CACHE_SHIFT);
2896 spin_unlock(&dirty_pages->buffer_lock);
2897 if (eb) {
2898 ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
2899 &eb->bflags);
2900 atomic_set(&eb->refs, 1);
2901 }
2902 if (PageWriteback(page))
2903 end_page_writeback(page);
2904
2905 lock_page(page);
2906 if (PageDirty(page)) {
2907 clear_page_dirty_for_io(page);
2908 spin_lock_irq(&page->mapping->tree_lock);
2909 radix_tree_tag_clear(&page->mapping->page_tree,
2910 page_index(page),
2911 PAGECACHE_TAG_DIRTY);
2912 spin_unlock_irq(&page->mapping->tree_lock);
2913 }
2914
2915 page->mapping->a_ops->invalidatepage(page, 0);
2916 unlock_page(page);
2917 }
2918 }
2919
2920 return ret;
2921}
2922
2923static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
2924 struct extent_io_tree *pinned_extents)
2925{
2926 struct extent_io_tree *unpin;
2927 u64 start;
2928 u64 end;
2929 int ret;
2930
2931 unpin = pinned_extents;
2932 while (1) {
2933 ret = find_first_extent_bit(unpin, 0, &start, &end,
2934 EXTENT_DIRTY);
2935 if (ret)
2936 break;
2937
2938 /* opt_discard */
2939 ret = btrfs_error_discard_extent(root, start, end + 1 - start);
2940
2941 clear_extent_dirty(unpin, start, end, GFP_NOFS);
2942 btrfs_error_unpin_extent_range(root, start, end);
2943 cond_resched();
2944 }
2945
2946 return 0;
2947}
2948
2949static int btrfs_cleanup_transaction(struct btrfs_root *root)
2950{
2951 struct btrfs_transaction *t;
2952 LIST_HEAD(list);
2953
2954 WARN_ON(1);
2955
2956 mutex_lock(&root->fs_info->trans_mutex);
2957 mutex_lock(&root->fs_info->transaction_kthread_mutex);
2958
2959 list_splice_init(&root->fs_info->trans_list, &list);
2960 while (!list_empty(&list)) {
2961 t = list_entry(list.next, struct btrfs_transaction, list);
2962 if (!t)
2963 break;
2964
2965 btrfs_destroy_ordered_operations(root);
2966
2967 btrfs_destroy_ordered_extents(root);
2968
2969 btrfs_destroy_delayed_refs(t, root);
2970
2971 btrfs_block_rsv_release(root,
2972 &root->fs_info->trans_block_rsv,
2973 t->dirty_pages.dirty_bytes);
2974
2975 /* FIXME: cleanup wait for commit */
2976 t->in_commit = 1;
2977 t->blocked = 1;
2978 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
2979 wake_up(&root->fs_info->transaction_blocked_wait);
2980
2981 t->blocked = 0;
2982 if (waitqueue_active(&root->fs_info->transaction_wait))
2983 wake_up(&root->fs_info->transaction_wait);
2984 mutex_unlock(&root->fs_info->trans_mutex);
2985
2986 mutex_lock(&root->fs_info->trans_mutex);
2987 t->commit_done = 1;
2988 if (waitqueue_active(&t->commit_wait))
2989 wake_up(&t->commit_wait);
2990 mutex_unlock(&root->fs_info->trans_mutex);
2991
2992 mutex_lock(&root->fs_info->trans_mutex);
2993
2994 btrfs_destroy_pending_snapshots(t);
2995
2996 btrfs_destroy_delalloc_inodes(root);
2997
2998 spin_lock(&root->fs_info->new_trans_lock);
2999 root->fs_info->running_transaction = NULL;
3000 spin_unlock(&root->fs_info->new_trans_lock);
3001
3002 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3003 EXTENT_DIRTY);
3004
3005 btrfs_destroy_pinned_extent(root,
3006 root->fs_info->pinned_extents);
3007
3008 t->use_count = 0;
3009 list_del_init(&t->list);
3010 memset(t, 0, sizeof(*t));
3011 kmem_cache_free(btrfs_transaction_cachep, t);
3012 }
3013
3014 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3015 mutex_unlock(&root->fs_info->trans_mutex);
3016
3017 return 0;
3018}
3019
2603static struct extent_io_ops btree_extent_io_ops = { 3020static struct extent_io_ops btree_extent_io_ops = {
2604 .write_cache_pages_lock_hook = btree_lock_page_hook, 3021 .write_cache_pages_lock_hook = btree_lock_page_hook,
2605 .readpage_end_io_hook = btree_readpage_end_io_hook, 3022 .readpage_end_io_hook = btree_readpage_end_io_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..07b20dc2fd95 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,6 +52,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
52 struct btrfs_root *root, int max_mirrors); 52 struct btrfs_root *root, int max_mirrors);
53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); 53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54int btrfs_commit_super(struct btrfs_root *root); 54int btrfs_commit_super(struct btrfs_root *root);
55int btrfs_error_commit_super(struct btrfs_root *root);
55struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 56struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 u64 bytenr, u32 blocksize); 57 u64 bytenr, u32 blocksize);
57struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, 58struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
@@ -87,7 +88,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
87 int metadata); 88 int metadata);
88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 89int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
89 int rw, struct bio *bio, int mirror_num, 90 int rw, struct bio *bio, int mirror_num,
90 unsigned long bio_flags, 91 unsigned long bio_flags, u64 bio_offset,
91 extent_submit_bio_hook_t *submit_bio_start, 92 extent_submit_bio_hook_t *submit_bio_start,
92 extent_submit_bio_hook_t *submit_bio_done); 93 extent_submit_bio_hook_t *submit_bio_done);
93 94
@@ -95,8 +96,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); 96unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
96int btrfs_write_tree_block(struct extent_buffer *buf); 97int btrfs_write_tree_block(struct extent_buffer *buf);
97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); 98int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
98int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
99 struct btrfs_fs_info *fs_info);
100int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, 99int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
101 struct btrfs_fs_info *fs_info); 100 struct btrfs_fs_info *fs_info);
102int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 101int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ba5c3fd5ab8c..9786963b07e5 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -65,7 +65,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
65{ 65{
66 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; 66 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
67 struct btrfs_root *root; 67 struct btrfs_root *root;
68 struct dentry *dentry;
69 struct inode *inode; 68 struct inode *inode;
70 struct btrfs_key key; 69 struct btrfs_key key;
71 int index; 70 int index;
@@ -95,7 +94,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
95 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 94 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
96 key.offset = 0; 95 key.offset = 0;
97 96
98 inode = btrfs_iget(sb, &key, root); 97 inode = btrfs_iget(sb, &key, root, NULL);
99 if (IS_ERR(inode)) { 98 if (IS_ERR(inode)) {
100 err = PTR_ERR(inode); 99 err = PTR_ERR(inode);
101 goto fail; 100 goto fail;
@@ -108,10 +107,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
108 return ERR_PTR(-ESTALE); 107 return ERR_PTR(-ESTALE);
109 } 108 }
110 109
111 dentry = d_obtain_alias(inode); 110 return d_obtain_alias(inode);
112 if (!IS_ERR(dentry))
113 dentry->d_op = &btrfs_dentry_operations;
114 return dentry;
115fail: 111fail:
116 srcu_read_unlock(&fs_info->subvol_srcu, index); 112 srcu_read_unlock(&fs_info->subvol_srcu, index);
117 return ERR_PTR(err); 113 return ERR_PTR(err);
@@ -166,7 +162,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
166static struct dentry *btrfs_get_parent(struct dentry *child) 162static struct dentry *btrfs_get_parent(struct dentry *child)
167{ 163{
168 struct inode *dir = child->d_inode; 164 struct inode *dir = child->d_inode;
169 static struct dentry *dentry;
170 struct btrfs_root *root = BTRFS_I(dir)->root; 165 struct btrfs_root *root = BTRFS_I(dir)->root;
171 struct btrfs_path *path; 166 struct btrfs_path *path;
172 struct extent_buffer *leaf; 167 struct extent_buffer *leaf;
@@ -223,18 +218,91 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
223 218
224 key.type = BTRFS_INODE_ITEM_KEY; 219 key.type = BTRFS_INODE_ITEM_KEY;
225 key.offset = 0; 220 key.offset = 0;
226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root)); 221 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
227 if (!IS_ERR(dentry))
228 dentry->d_op = &btrfs_dentry_operations;
229 return dentry;
230fail: 222fail:
231 btrfs_free_path(path); 223 btrfs_free_path(path);
232 return ERR_PTR(ret); 224 return ERR_PTR(ret);
233} 225}
234 226
227static int btrfs_get_name(struct dentry *parent, char *name,
228 struct dentry *child)
229{
230 struct inode *inode = child->d_inode;
231 struct inode *dir = parent->d_inode;
232 struct btrfs_path *path;
233 struct btrfs_root *root = BTRFS_I(dir)->root;
234 struct btrfs_inode_ref *iref;
235 struct btrfs_root_ref *rref;
236 struct extent_buffer *leaf;
237 unsigned long name_ptr;
238 struct btrfs_key key;
239 int name_len;
240 int ret;
241
242 if (!dir || !inode)
243 return -EINVAL;
244
245 if (!S_ISDIR(dir->i_mode))
246 return -EINVAL;
247
248 path = btrfs_alloc_path();
249 if (!path)
250 return -ENOMEM;
251 path->leave_spinning = 1;
252
253 if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
254 key.objectid = BTRFS_I(inode)->root->root_key.objectid;
255 key.type = BTRFS_ROOT_BACKREF_KEY;
256 key.offset = (u64)-1;
257 root = root->fs_info->tree_root;
258 } else {
259 key.objectid = inode->i_ino;
260 key.offset = dir->i_ino;
261 key.type = BTRFS_INODE_REF_KEY;
262 }
263
264 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
265 if (ret < 0) {
266 btrfs_free_path(path);
267 return ret;
268 } else if (ret > 0) {
269 if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
270 path->slots[0]--;
271 } else {
272 btrfs_free_path(path);
273 return -ENOENT;
274 }
275 }
276 leaf = path->nodes[0];
277
278 if (inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) {
279 rref = btrfs_item_ptr(leaf, path->slots[0],
280 struct btrfs_root_ref);
281 name_ptr = (unsigned long)(rref + 1);
282 name_len = btrfs_root_ref_name_len(leaf, rref);
283 } else {
284 iref = btrfs_item_ptr(leaf, path->slots[0],
285 struct btrfs_inode_ref);
286 name_ptr = (unsigned long)(iref + 1);
287 name_len = btrfs_inode_ref_name_len(leaf, iref);
288 }
289
290 read_extent_buffer(leaf, name, name_ptr, name_len);
291 btrfs_free_path(path);
292
293 /*
294 * have to add the null termination to make sure that reconnect_path
295 * gets the right len for strlen
296 */
297 name[name_len] = '\0';
298
299 return 0;
300}
301
235const struct export_operations btrfs_export_ops = { 302const struct export_operations btrfs_export_ops = {
236 .encode_fh = btrfs_encode_fh, 303 .encode_fh = btrfs_encode_fh,
237 .fh_to_dentry = btrfs_fh_to_dentry, 304 .fh_to_dentry = btrfs_fh_to_dentry,
238 .fh_to_parent = btrfs_fh_to_parent, 305 .fh_to_parent = btrfs_fh_to_parent,
239 .get_parent = btrfs_get_parent, 306 .get_parent = btrfs_get_parent,
307 .get_name = btrfs_get_name,
240}; 308};
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 432a2da4641e..b55269340cec 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -22,6 +22,7 @@
22#include <linux/sort.h> 22#include <linux/sort.h>
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h>
25#include "compat.h" 26#include "compat.h"
26#include "hash.h" 27#include "hash.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -34,10 +35,9 @@
34 35
35static int update_block_group(struct btrfs_trans_handle *trans, 36static int update_block_group(struct btrfs_trans_handle *trans,
36 struct btrfs_root *root, 37 struct btrfs_root *root,
37 u64 bytenr, u64 num_bytes, int alloc, 38 u64 bytenr, u64 num_bytes, int alloc);
38 int mark_free); 39static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
39static int update_reserved_extents(struct btrfs_block_group_cache *cache, 40 u64 num_bytes, int reserve, int sinfo);
40 u64 num_bytes, int reserve);
41static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 41static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 42 struct btrfs_root *root,
43 u64 bytenr, u64 num_bytes, u64 parent, 43 u64 bytenr, u64 num_bytes, u64 parent,
@@ -60,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
60static int do_chunk_alloc(struct btrfs_trans_handle *trans, 60static int do_chunk_alloc(struct btrfs_trans_handle *trans,
61 struct btrfs_root *extent_root, u64 alloc_bytes, 61 struct btrfs_root *extent_root, u64 alloc_bytes,
62 u64 flags, int force); 62 u64 flags, int force);
63static int pin_down_bytes(struct btrfs_trans_handle *trans,
64 struct btrfs_root *root,
65 struct btrfs_path *path,
66 u64 bytenr, u64 num_bytes,
67 int is_data, int reserved,
68 struct extent_buffer **must_clean);
69static int find_next_key(struct btrfs_path *path, int level, 63static int find_next_key(struct btrfs_path *path, int level,
70 struct btrfs_key *key); 64 struct btrfs_key *key);
71static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 65static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -90,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
90 84
91void btrfs_put_block_group(struct btrfs_block_group_cache *cache) 85void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
92{ 86{
93 if (atomic_dec_and_test(&cache->count)) 87 if (atomic_dec_and_test(&cache->count)) {
88 WARN_ON(cache->pinned > 0);
89 WARN_ON(cache->reserved > 0);
90 WARN_ON(cache->reserved_pinned > 0);
94 kfree(cache); 91 kfree(cache);
92 }
95} 93}
96 94
97/* 95/*
@@ -244,6 +242,12 @@ get_caching_control(struct btrfs_block_group_cache *cache)
244 return NULL; 242 return NULL;
245 } 243 }
246 244
245 /* We're loading it the fast way, so we don't have a caching_ctl. */
246 if (!cache->caching_ctl) {
247 spin_unlock(&cache->lock);
248 return NULL;
249 }
250
247 ctl = cache->caching_ctl; 251 ctl = cache->caching_ctl;
248 atomic_inc(&ctl->count); 252 atomic_inc(&ctl->count);
249 spin_unlock(&cache->lock); 253 spin_unlock(&cache->lock);
@@ -318,7 +322,7 @@ static int caching_kthread(void *data)
318 322
319 exclude_super_stripes(extent_root, block_group); 323 exclude_super_stripes(extent_root, block_group);
320 spin_lock(&block_group->space_info->lock); 324 spin_lock(&block_group->space_info->lock);
321 block_group->space_info->bytes_super += block_group->bytes_super; 325 block_group->space_info->bytes_readonly += block_group->bytes_super;
322 spin_unlock(&block_group->space_info->lock); 326 spin_unlock(&block_group->space_info->lock);
323 327
324 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 328 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -423,7 +427,10 @@ err:
423 return 0; 427 return 0;
424} 428}
425 429
426static int cache_block_group(struct btrfs_block_group_cache *cache) 430static int cache_block_group(struct btrfs_block_group_cache *cache,
431 struct btrfs_trans_handle *trans,
432 struct btrfs_root *root,
433 int load_cache_only)
427{ 434{
428 struct btrfs_fs_info *fs_info = cache->fs_info; 435 struct btrfs_fs_info *fs_info = cache->fs_info;
429 struct btrfs_caching_control *caching_ctl; 436 struct btrfs_caching_control *caching_ctl;
@@ -434,6 +441,39 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
434 if (cache->cached != BTRFS_CACHE_NO) 441 if (cache->cached != BTRFS_CACHE_NO)
435 return 0; 442 return 0;
436 443
444 /*
445 * We can't do the read from on-disk cache during a commit since we need
446 * to have the normal tree locking. Also if we are currently trying to
447 * allocate blocks for the tree root we can't do the fast caching since
448 * we likely hold important locks.
449 */
450 if (!trans->transaction->in_commit &&
451 (root && root != root->fs_info->tree_root)) {
452 spin_lock(&cache->lock);
453 if (cache->cached != BTRFS_CACHE_NO) {
454 spin_unlock(&cache->lock);
455 return 0;
456 }
457 cache->cached = BTRFS_CACHE_STARTED;
458 spin_unlock(&cache->lock);
459
460 ret = load_free_space_cache(fs_info, cache);
461
462 spin_lock(&cache->lock);
463 if (ret == 1) {
464 cache->cached = BTRFS_CACHE_FINISHED;
465 cache->last_byte_to_unpin = (u64)-1;
466 } else {
467 cache->cached = BTRFS_CACHE_NO;
468 }
469 spin_unlock(&cache->lock);
470 if (ret == 1)
471 return 0;
472 }
473
474 if (load_cache_only)
475 return 0;
476
437 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); 477 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL);
438 BUG_ON(!caching_ctl); 478 BUG_ON(!caching_ctl);
439 479
@@ -506,9 +546,12 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
506 struct list_head *head = &info->space_info; 546 struct list_head *head = &info->space_info;
507 struct btrfs_space_info *found; 547 struct btrfs_space_info *found;
508 548
549 flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
550 BTRFS_BLOCK_GROUP_METADATA;
551
509 rcu_read_lock(); 552 rcu_read_lock();
510 list_for_each_entry_rcu(found, head, list) { 553 list_for_each_entry_rcu(found, head, list) {
511 if (found->flags == flags) { 554 if (found->flags & flags) {
512 rcu_read_unlock(); 555 rcu_read_unlock();
513 return found; 556 return found;
514 } 557 }
@@ -541,6 +584,15 @@ static u64 div_factor(u64 num, int factor)
541 return num; 584 return num;
542} 585}
543 586
587static u64 div_factor_fine(u64 num, int factor)
588{
589 if (factor == 100)
590 return num;
591 num *= factor;
592 do_div(num, 100);
593 return num;
594}
595
544u64 btrfs_find_block_group(struct btrfs_root *root, 596u64 btrfs_find_block_group(struct btrfs_root *root,
545 u64 search_start, u64 search_hint, int owner) 597 u64 search_start, u64 search_hint, int owner)
546{ 598{
@@ -609,6 +661,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
609} 661}
610 662
611/* 663/*
664 * helper function to lookup reference count and flags of extent.
665 *
666 * the head node for delayed ref is used to store the sum of all the
667 * reference count modifications queued up in the rbtree. the head
668 * node may also store the extent flags to set. This way you can check
669 * to see what the reference count and extent flags would be if all of
670 * the delayed refs are not processed.
671 */
672int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
673 struct btrfs_root *root, u64 bytenr,
674 u64 num_bytes, u64 *refs, u64 *flags)
675{
676 struct btrfs_delayed_ref_head *head;
677 struct btrfs_delayed_ref_root *delayed_refs;
678 struct btrfs_path *path;
679 struct btrfs_extent_item *ei;
680 struct extent_buffer *leaf;
681 struct btrfs_key key;
682 u32 item_size;
683 u64 num_refs;
684 u64 extent_flags;
685 int ret;
686
687 path = btrfs_alloc_path();
688 if (!path)
689 return -ENOMEM;
690
691 key.objectid = bytenr;
692 key.type = BTRFS_EXTENT_ITEM_KEY;
693 key.offset = num_bytes;
694 if (!trans) {
695 path->skip_locking = 1;
696 path->search_commit_root = 1;
697 }
698again:
699 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
700 &key, path, 0, 0);
701 if (ret < 0)
702 goto out_free;
703
704 if (ret == 0) {
705 leaf = path->nodes[0];
706 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
707 if (item_size >= sizeof(*ei)) {
708 ei = btrfs_item_ptr(leaf, path->slots[0],
709 struct btrfs_extent_item);
710 num_refs = btrfs_extent_refs(leaf, ei);
711 extent_flags = btrfs_extent_flags(leaf, ei);
712 } else {
713#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
714 struct btrfs_extent_item_v0 *ei0;
715 BUG_ON(item_size != sizeof(*ei0));
716 ei0 = btrfs_item_ptr(leaf, path->slots[0],
717 struct btrfs_extent_item_v0);
718 num_refs = btrfs_extent_refs_v0(leaf, ei0);
719 /* FIXME: this isn't correct for data */
720 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
721#else
722 BUG();
723#endif
724 }
725 BUG_ON(num_refs == 0);
726 } else {
727 num_refs = 0;
728 extent_flags = 0;
729 ret = 0;
730 }
731
732 if (!trans)
733 goto out;
734
735 delayed_refs = &trans->transaction->delayed_refs;
736 spin_lock(&delayed_refs->lock);
737 head = btrfs_find_delayed_ref_head(trans, bytenr);
738 if (head) {
739 if (!mutex_trylock(&head->mutex)) {
740 atomic_inc(&head->node.refs);
741 spin_unlock(&delayed_refs->lock);
742
743 btrfs_release_path(root->fs_info->extent_root, path);
744
745 mutex_lock(&head->mutex);
746 mutex_unlock(&head->mutex);
747 btrfs_put_delayed_ref(&head->node);
748 goto again;
749 }
750 if (head->extent_op && head->extent_op->update_flags)
751 extent_flags |= head->extent_op->flags_to_set;
752 else
753 BUG_ON(num_refs == 0);
754
755 num_refs += head->node.ref_mod;
756 mutex_unlock(&head->mutex);
757 }
758 spin_unlock(&delayed_refs->lock);
759out:
760 WARN_ON(num_refs == 0);
761 if (refs)
762 *refs = num_refs;
763 if (flags)
764 *flags = extent_flags;
765out_free:
766 btrfs_free_path(path);
767 return ret;
768}
769
770/*
612 * Back reference rules. Back refs have three main goals: 771 * Back reference rules. Back refs have three main goals:
613 * 772 *
614 * 1) differentiate between all holders of references to an extent so that 773 * 1) differentiate between all holders of references to an extent so that
@@ -1587,8 +1746,7 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1587static void btrfs_issue_discard(struct block_device *bdev, 1746static void btrfs_issue_discard(struct block_device *bdev,
1588 u64 start, u64 len) 1747 u64 start, u64 len)
1589{ 1748{
1590 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1749 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0);
1591 DISCARD_FL_BARRIER);
1592} 1750}
1593 1751
1594static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1752static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
@@ -1870,7 +2028,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1870 return ret; 2028 return ret;
1871} 2029}
1872 2030
1873
1874/* helper function to actually process a single delayed ref entry */ 2031/* helper function to actually process a single delayed ref entry */
1875static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 2032static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1876 struct btrfs_root *root, 2033 struct btrfs_root *root,
@@ -1890,32 +2047,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1890 BUG_ON(extent_op); 2047 BUG_ON(extent_op);
1891 head = btrfs_delayed_node_to_head(node); 2048 head = btrfs_delayed_node_to_head(node);
1892 if (insert_reserved) { 2049 if (insert_reserved) {
1893 int mark_free = 0; 2050 btrfs_pin_extent(root, node->bytenr,
1894 struct extent_buffer *must_clean = NULL; 2051 node->num_bytes, 1);
1895
1896 ret = pin_down_bytes(trans, root, NULL,
1897 node->bytenr, node->num_bytes,
1898 head->is_data, 1, &must_clean);
1899 if (ret > 0)
1900 mark_free = 1;
1901
1902 if (must_clean) {
1903 clean_tree_block(NULL, root, must_clean);
1904 btrfs_tree_unlock(must_clean);
1905 free_extent_buffer(must_clean);
1906 }
1907 if (head->is_data) { 2052 if (head->is_data) {
1908 ret = btrfs_del_csums(trans, root, 2053 ret = btrfs_del_csums(trans, root,
1909 node->bytenr, 2054 node->bytenr,
1910 node->num_bytes); 2055 node->num_bytes);
1911 BUG_ON(ret); 2056 BUG_ON(ret);
1912 } 2057 }
1913 if (mark_free) {
1914 ret = btrfs_free_reserved_extent(root,
1915 node->bytenr,
1916 node->num_bytes);
1917 BUG_ON(ret);
1918 }
1919 } 2058 }
1920 mutex_unlock(&head->mutex); 2059 mutex_unlock(&head->mutex);
1921 return 0; 2060 return 0;
@@ -2346,6 +2485,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2346 ret = 0; 2485 ret = 0;
2347out: 2486out:
2348 btrfs_free_path(path); 2487 btrfs_free_path(path);
2488 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2489 WARN_ON(ret > 0);
2349 return ret; 2490 return ret;
2350} 2491}
2351 2492
@@ -2597,6 +2738,111 @@ next_block_group(struct btrfs_root *root,
2597 return cache; 2738 return cache;
2598} 2739}
2599 2740
2741static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2742 struct btrfs_trans_handle *trans,
2743 struct btrfs_path *path)
2744{
2745 struct btrfs_root *root = block_group->fs_info->tree_root;
2746 struct inode *inode = NULL;
2747 u64 alloc_hint = 0;
2748 int dcs = BTRFS_DC_ERROR;
2749 int num_pages = 0;
2750 int retries = 0;
2751 int ret = 0;
2752
2753 /*
2754 * If this block group is smaller than 100 megs don't bother caching the
2755 * block group.
2756 */
2757 if (block_group->key.offset < (100 * 1024 * 1024)) {
2758 spin_lock(&block_group->lock);
2759 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2760 spin_unlock(&block_group->lock);
2761 return 0;
2762 }
2763
2764again:
2765 inode = lookup_free_space_inode(root, block_group, path);
2766 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2767 ret = PTR_ERR(inode);
2768 btrfs_release_path(root, path);
2769 goto out;
2770 }
2771
2772 if (IS_ERR(inode)) {
2773 BUG_ON(retries);
2774 retries++;
2775
2776 if (block_group->ro)
2777 goto out_free;
2778
2779 ret = create_free_space_inode(root, trans, block_group, path);
2780 if (ret)
2781 goto out_free;
2782 goto again;
2783 }
2784
2785 /*
2786 * We want to set the generation to 0, that way if anything goes wrong
2787 * from here on out we know not to trust this cache when we load up next
2788 * time.
2789 */
2790 BTRFS_I(inode)->generation = 0;
2791 ret = btrfs_update_inode(trans, root, inode);
2792 WARN_ON(ret);
2793
2794 if (i_size_read(inode) > 0) {
2795 ret = btrfs_truncate_free_space_cache(root, trans, path,
2796 inode);
2797 if (ret)
2798 goto out_put;
2799 }
2800
2801 spin_lock(&block_group->lock);
2802 if (block_group->cached != BTRFS_CACHE_FINISHED) {
2803 /* We're not cached, don't bother trying to write stuff out */
2804 dcs = BTRFS_DC_WRITTEN;
2805 spin_unlock(&block_group->lock);
2806 goto out_put;
2807 }
2808 spin_unlock(&block_group->lock);
2809
2810 num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2811 if (!num_pages)
2812 num_pages = 1;
2813
2814 /*
2815 * Just to make absolutely sure we have enough space, we're going to
2816 * preallocate 12 pages worth of space for each block group. In
2817 * practice we ought to use at most 8, but we need extra space so we can
2818 * add our header and have a terminator between the extents and the
2819 * bitmaps.
2820 */
2821 num_pages *= 16;
2822 num_pages *= PAGE_CACHE_SIZE;
2823
2824 ret = btrfs_check_data_free_space(inode, num_pages);
2825 if (ret)
2826 goto out_put;
2827
2828 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2829 num_pages, num_pages,
2830 &alloc_hint);
2831 if (!ret)
2832 dcs = BTRFS_DC_SETUP;
2833 btrfs_free_reserved_data_space(inode, num_pages);
2834out_put:
2835 iput(inode);
2836out_free:
2837 btrfs_release_path(root, path);
2838out:
2839 spin_lock(&block_group->lock);
2840 block_group->disk_cache_state = dcs;
2841 spin_unlock(&block_group->lock);
2842
2843 return ret;
2844}
2845
2600int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2846int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2601 struct btrfs_root *root) 2847 struct btrfs_root *root)
2602{ 2848{
@@ -2609,6 +2855,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2609 if (!path) 2855 if (!path)
2610 return -ENOMEM; 2856 return -ENOMEM;
2611 2857
2858again:
2859 while (1) {
2860 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2861 while (cache) {
2862 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2863 break;
2864 cache = next_block_group(root, cache);
2865 }
2866 if (!cache) {
2867 if (last == 0)
2868 break;
2869 last = 0;
2870 continue;
2871 }
2872 err = cache_save_setup(cache, trans, path);
2873 last = cache->key.objectid + cache->key.offset;
2874 btrfs_put_block_group(cache);
2875 }
2876
2612 while (1) { 2877 while (1) {
2613 if (last == 0) { 2878 if (last == 0) {
2614 err = btrfs_run_delayed_refs(trans, root, 2879 err = btrfs_run_delayed_refs(trans, root,
@@ -2618,6 +2883,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2618 2883
2619 cache = btrfs_lookup_first_block_group(root->fs_info, last); 2884 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2620 while (cache) { 2885 while (cache) {
2886 if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2887 btrfs_put_block_group(cache);
2888 goto again;
2889 }
2890
2621 if (cache->dirty) 2891 if (cache->dirty)
2622 break; 2892 break;
2623 cache = next_block_group(root, cache); 2893 cache = next_block_group(root, cache);
@@ -2629,6 +2899,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2629 continue; 2899 continue;
2630 } 2900 }
2631 2901
2902 if (cache->disk_cache_state == BTRFS_DC_SETUP)
2903 cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
2632 cache->dirty = 0; 2904 cache->dirty = 0;
2633 last = cache->key.objectid + cache->key.offset; 2905 last = cache->key.objectid + cache->key.offset;
2634 2906
@@ -2637,6 +2909,52 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2637 btrfs_put_block_group(cache); 2909 btrfs_put_block_group(cache);
2638 } 2910 }
2639 2911
2912 while (1) {
2913 /*
2914 * I don't think this is needed since we're just marking our
2915 * preallocated extent as written, but just in case it can't
2916 * hurt.
2917 */
2918 if (last == 0) {
2919 err = btrfs_run_delayed_refs(trans, root,
2920 (unsigned long)-1);
2921 BUG_ON(err);
2922 }
2923
2924 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2925 while (cache) {
2926 /*
2927 * Really this shouldn't happen, but it could if we
2928 * couldn't write the entire preallocated extent and
2929 * splitting the extent resulted in a new block.
2930 */
2931 if (cache->dirty) {
2932 btrfs_put_block_group(cache);
2933 goto again;
2934 }
2935 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2936 break;
2937 cache = next_block_group(root, cache);
2938 }
2939 if (!cache) {
2940 if (last == 0)
2941 break;
2942 last = 0;
2943 continue;
2944 }
2945
2946 btrfs_write_out_cache(root, trans, cache, path);
2947
2948 /*
2949 * If we didn't have an error then the cache state is still
2950 * NEED_WRITE, so we can set it to WRITTEN.
2951 */
2952 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2953 cache->disk_cache_state = BTRFS_DC_WRITTEN;
2954 last = cache->key.objectid + cache->key.offset;
2955 btrfs_put_block_group(cache);
2956 }
2957
2640 btrfs_free_path(path); 2958 btrfs_free_path(path);
2641 return 0; 2959 return 0;
2642} 2960}
@@ -2659,12 +2977,22 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2659 struct btrfs_space_info **space_info) 2977 struct btrfs_space_info **space_info)
2660{ 2978{
2661 struct btrfs_space_info *found; 2979 struct btrfs_space_info *found;
2980 int i;
2981 int factor;
2982
2983 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2984 BTRFS_BLOCK_GROUP_RAID10))
2985 factor = 2;
2986 else
2987 factor = 1;
2662 2988
2663 found = __find_space_info(info, flags); 2989 found = __find_space_info(info, flags);
2664 if (found) { 2990 if (found) {
2665 spin_lock(&found->lock); 2991 spin_lock(&found->lock);
2666 found->total_bytes += total_bytes; 2992 found->total_bytes += total_bytes;
2993 found->disk_total += total_bytes * factor;
2667 found->bytes_used += bytes_used; 2994 found->bytes_used += bytes_used;
2995 found->disk_used += bytes_used * factor;
2668 found->full = 0; 2996 found->full = 0;
2669 spin_unlock(&found->lock); 2997 spin_unlock(&found->lock);
2670 *space_info = found; 2998 *space_info = found;
@@ -2674,16 +3002,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2674 if (!found) 3002 if (!found)
2675 return -ENOMEM; 3003 return -ENOMEM;
2676 3004
2677 INIT_LIST_HEAD(&found->block_groups); 3005 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3006 INIT_LIST_HEAD(&found->block_groups[i]);
2678 init_rwsem(&found->groups_sem); 3007 init_rwsem(&found->groups_sem);
2679 spin_lock_init(&found->lock); 3008 spin_lock_init(&found->lock);
2680 found->flags = flags; 3009 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
3010 BTRFS_BLOCK_GROUP_SYSTEM |
3011 BTRFS_BLOCK_GROUP_METADATA);
2681 found->total_bytes = total_bytes; 3012 found->total_bytes = total_bytes;
3013 found->disk_total = total_bytes * factor;
2682 found->bytes_used = bytes_used; 3014 found->bytes_used = bytes_used;
3015 found->disk_used = bytes_used * factor;
2683 found->bytes_pinned = 0; 3016 found->bytes_pinned = 0;
2684 found->bytes_reserved = 0; 3017 found->bytes_reserved = 0;
2685 found->bytes_readonly = 0; 3018 found->bytes_readonly = 0;
2686 found->bytes_delalloc = 0; 3019 found->bytes_may_use = 0;
2687 found->full = 0; 3020 found->full = 0;
2688 found->force_alloc = 0; 3021 found->force_alloc = 0;
2689 *space_info = found; 3022 *space_info = found;
@@ -2708,22 +3041,15 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2708 } 3041 }
2709} 3042}
2710 3043
2711static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2712{
2713 spin_lock(&cache->space_info->lock);
2714 spin_lock(&cache->lock);
2715 if (!cache->ro) {
2716 cache->space_info->bytes_readonly += cache->key.offset -
2717 btrfs_block_group_used(&cache->item);
2718 cache->ro = 1;
2719 }
2720 spin_unlock(&cache->lock);
2721 spin_unlock(&cache->space_info->lock);
2722}
2723
2724u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 3044u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2725{ 3045{
2726 u64 num_devices = root->fs_info->fs_devices->rw_devices; 3046 /*
3047 * we add in the count of missing devices because we want
3048 * to make sure that any RAID levels on a degraded FS
3049 * continue to be honored.
3050 */
3051 u64 num_devices = root->fs_info->fs_devices->rw_devices +
3052 root->fs_info->fs_devices->missing_devices;
2727 3053
2728 if (num_devices == 1) 3054 if (num_devices == 1)
2729 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 3055 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
@@ -2749,718 +3075,995 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2749 return flags; 3075 return flags;
2750} 3076}
2751 3077
2752static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) 3078static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2753{ 3079{
2754 struct btrfs_fs_info *info = root->fs_info; 3080 if (flags & BTRFS_BLOCK_GROUP_DATA)
2755 u64 alloc_profile; 3081 flags |= root->fs_info->avail_data_alloc_bits &
2756 3082 root->fs_info->data_alloc_profile;
2757 if (data) { 3083 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2758 alloc_profile = info->avail_data_alloc_bits & 3084 flags |= root->fs_info->avail_system_alloc_bits &
2759 info->data_alloc_profile; 3085 root->fs_info->system_alloc_profile;
2760 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; 3086 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
2761 } else if (root == root->fs_info->chunk_root) { 3087 flags |= root->fs_info->avail_metadata_alloc_bits &
2762 alloc_profile = info->avail_system_alloc_bits & 3088 root->fs_info->metadata_alloc_profile;
2763 info->system_alloc_profile; 3089 return btrfs_reduce_alloc_profile(root, flags);
2764 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; 3090}
2765 } else { 3091
2766 alloc_profile = info->avail_metadata_alloc_bits & 3092u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2767 info->metadata_alloc_profile; 3093{
2768 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; 3094 u64 flags;
2769 } 3095
3096 if (data)
3097 flags = BTRFS_BLOCK_GROUP_DATA;
3098 else if (root == root->fs_info->chunk_root)
3099 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3100 else
3101 flags = BTRFS_BLOCK_GROUP_METADATA;
2770 3102
2771 return btrfs_reduce_alloc_profile(root, data); 3103 return get_alloc_profile(root, flags);
2772} 3104}
2773 3105
2774void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) 3106void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2775{ 3107{
2776 u64 alloc_target;
2777
2778 alloc_target = btrfs_get_alloc_profile(root, 1);
2779 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, 3108 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2780 alloc_target); 3109 BTRFS_BLOCK_GROUP_DATA);
2781} 3110}
2782 3111
2783static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items) 3112/*
3113 * This will check the space that the inode allocates from to make sure we have
3114 * enough space for bytes.
3115 */
3116int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
2784{ 3117{
2785 u64 num_bytes; 3118 struct btrfs_space_info *data_sinfo;
2786 int level; 3119 struct btrfs_root *root = BTRFS_I(inode)->root;
3120 u64 used;
3121 int ret = 0, committed = 0, alloc_chunk = 1;
2787 3122
2788 level = BTRFS_MAX_LEVEL - 2; 3123 /* make sure bytes are sectorsize aligned */
2789 /* 3124 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2790 * NOTE: these calculations are absolutely the worst possible case.
2791 * This assumes that _every_ item we insert will require a new leaf, and
2792 * that the tree has grown to its maximum level size.
2793 */
2794 3125
2795 /* 3126 if (root == root->fs_info->tree_root) {
2796 * for every item we insert we could insert both an extent item and a 3127 alloc_chunk = 0;
2797 * extent ref item. Then for ever item we insert, we will need to cow 3128 committed = 1;
2798 * both the original leaf, plus the leaf to the left and right of it. 3129 }
2799 *
2800 * Unless we are talking about the extent root, then we just want the
2801 * number of items * 2, since we just need the extent item plus its ref.
2802 */
2803 if (root == root->fs_info->extent_root)
2804 num_bytes = num_items * 2;
2805 else
2806 num_bytes = (num_items + (2 * num_items)) * 3;
2807 3130
2808 /* 3131 data_sinfo = BTRFS_I(inode)->space_info;
2809 * num_bytes is total number of leaves we could need times the leaf 3132 if (!data_sinfo)
2810 * size, and then for every leaf we could end up cow'ing 2 nodes per 3133 goto alloc;
2811 * level, down to the leaf level.
2812 */
2813 num_bytes = (num_bytes * root->leafsize) +
2814 (num_bytes * (level * 2)) * root->nodesize;
2815 3134
2816 return num_bytes; 3135again:
2817} 3136 /* make sure we have enough space to handle the data first */
3137 spin_lock(&data_sinfo->lock);
3138 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3139 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3140 data_sinfo->bytes_may_use;
2818 3141
2819/* 3142 if (used + bytes > data_sinfo->total_bytes) {
2820 * Unreserve metadata space for delalloc. If we have less reserved credits than 3143 struct btrfs_trans_handle *trans;
2821 * we have extents, this function does nothing.
2822 */
2823int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2824 struct inode *inode, int num_items)
2825{
2826 struct btrfs_fs_info *info = root->fs_info;
2827 struct btrfs_space_info *meta_sinfo;
2828 u64 num_bytes;
2829 u64 alloc_target;
2830 bool bug = false;
2831 3144
2832 /* get the space info for where the metadata will live */ 3145 /*
2833 alloc_target = btrfs_get_alloc_profile(root, 0); 3146 * if we don't have enough free bytes in this space then we need
2834 meta_sinfo = __find_space_info(info, alloc_target); 3147 * to alloc a new chunk.
3148 */
3149 if (!data_sinfo->full && alloc_chunk) {
3150 u64 alloc_target;
2835 3151
2836 num_bytes = calculate_bytes_needed(root->fs_info->extent_root, 3152 data_sinfo->force_alloc = 1;
2837 num_items); 3153 spin_unlock(&data_sinfo->lock);
3154alloc:
3155 alloc_target = btrfs_get_alloc_profile(root, 1);
3156 trans = btrfs_join_transaction(root, 1);
3157 if (IS_ERR(trans))
3158 return PTR_ERR(trans);
2838 3159
2839 spin_lock(&meta_sinfo->lock); 3160 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2840 spin_lock(&BTRFS_I(inode)->accounting_lock); 3161 bytes + 2 * 1024 * 1024,
2841 if (BTRFS_I(inode)->reserved_extents <= 3162 alloc_target, 0);
2842 BTRFS_I(inode)->outstanding_extents) { 3163 btrfs_end_transaction(trans, root);
2843 spin_unlock(&BTRFS_I(inode)->accounting_lock); 3164 if (ret < 0) {
2844 spin_unlock(&meta_sinfo->lock); 3165 if (ret != -ENOSPC)
2845 return 0; 3166 return ret;
2846 } 3167 else
2847 spin_unlock(&BTRFS_I(inode)->accounting_lock); 3168 goto commit_trans;
3169 }
2848 3170
2849 BTRFS_I(inode)->reserved_extents--; 3171 if (!data_sinfo) {
2850 BUG_ON(BTRFS_I(inode)->reserved_extents < 0); 3172 btrfs_set_inode_space_info(root, inode);
3173 data_sinfo = BTRFS_I(inode)->space_info;
3174 }
3175 goto again;
3176 }
3177 spin_unlock(&data_sinfo->lock);
2851 3178
2852 if (meta_sinfo->bytes_delalloc < num_bytes) { 3179 /* commit the current transaction and try again */
2853 bug = true; 3180commit_trans:
2854 meta_sinfo->bytes_delalloc = 0; 3181 if (!committed && !root->fs_info->open_ioctl_trans) {
2855 } else { 3182 committed = 1;
2856 meta_sinfo->bytes_delalloc -= num_bytes; 3183 trans = btrfs_join_transaction(root, 1);
2857 } 3184 if (IS_ERR(trans))
2858 spin_unlock(&meta_sinfo->lock); 3185 return PTR_ERR(trans);
3186 ret = btrfs_commit_transaction(trans, root);
3187 if (ret)
3188 return ret;
3189 goto again;
3190 }
2859 3191
2860 BUG_ON(bug); 3192#if 0 /* I hope we never need this code again, just in case */
3193 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3194 "%llu bytes_reserved, " "%llu bytes_pinned, "
3195 "%llu bytes_readonly, %llu may use %llu total\n",
3196 (unsigned long long)bytes,
3197 (unsigned long long)data_sinfo->bytes_used,
3198 (unsigned long long)data_sinfo->bytes_reserved,
3199 (unsigned long long)data_sinfo->bytes_pinned,
3200 (unsigned long long)data_sinfo->bytes_readonly,
3201 (unsigned long long)data_sinfo->bytes_may_use,
3202 (unsigned long long)data_sinfo->total_bytes);
3203#endif
3204 return -ENOSPC;
3205 }
3206 data_sinfo->bytes_may_use += bytes;
3207 BTRFS_I(inode)->reserved_bytes += bytes;
3208 spin_unlock(&data_sinfo->lock);
2861 3209
2862 return 0; 3210 return 0;
2863} 3211}
2864 3212
2865static void check_force_delalloc(struct btrfs_space_info *meta_sinfo) 3213/*
3214 * called when we are clearing an delalloc extent from the
3215 * inode's io_tree or there was an error for whatever reason
3216 * after calling btrfs_check_data_free_space
3217 */
3218void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
2866{ 3219{
2867 u64 thresh; 3220 struct btrfs_root *root = BTRFS_I(inode)->root;
3221 struct btrfs_space_info *data_sinfo;
2868 3222
2869 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 3223 /* make sure bytes are sectorsize aligned */
2870 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 3224 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2871 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
2872 meta_sinfo->bytes_may_use;
2873 3225
2874 thresh = meta_sinfo->total_bytes - thresh; 3226 data_sinfo = BTRFS_I(inode)->space_info;
2875 thresh *= 80; 3227 spin_lock(&data_sinfo->lock);
2876 do_div(thresh, 100); 3228 data_sinfo->bytes_may_use -= bytes;
2877 if (thresh <= meta_sinfo->bytes_delalloc) 3229 BTRFS_I(inode)->reserved_bytes -= bytes;
2878 meta_sinfo->force_delalloc = 1; 3230 spin_unlock(&data_sinfo->lock);
2879 else
2880 meta_sinfo->force_delalloc = 0;
2881} 3231}
2882 3232
2883struct async_flush { 3233static void force_metadata_allocation(struct btrfs_fs_info *info)
2884 struct btrfs_root *root; 3234{
2885 struct btrfs_space_info *info; 3235 struct list_head *head = &info->space_info;
2886 struct btrfs_work work; 3236 struct btrfs_space_info *found;
2887}; 3237
3238 rcu_read_lock();
3239 list_for_each_entry_rcu(found, head, list) {
3240 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3241 found->force_alloc = 1;
3242 }
3243 rcu_read_unlock();
3244}
2888 3245
2889static noinline void flush_delalloc_async(struct btrfs_work *work) 3246static int should_alloc_chunk(struct btrfs_root *root,
3247 struct btrfs_space_info *sinfo, u64 alloc_bytes)
2890{ 3248{
2891 struct async_flush *async; 3249 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
2892 struct btrfs_root *root; 3250 u64 thresh;
2893 struct btrfs_space_info *info;
2894 3251
2895 async = container_of(work, struct async_flush, work); 3252 if (sinfo->bytes_used + sinfo->bytes_reserved +
2896 root = async->root; 3253 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
2897 info = async->info; 3254 return 0;
2898 3255
2899 btrfs_start_delalloc_inodes(root, 0); 3256 if (sinfo->bytes_used + sinfo->bytes_reserved +
2900 wake_up(&info->flush_wait); 3257 alloc_bytes < div_factor(num_bytes, 8))
2901 btrfs_wait_ordered_extents(root, 0, 0); 3258 return 0;
2902 3259
2903 spin_lock(&info->lock); 3260 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
2904 info->flushing = 0; 3261 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
2905 spin_unlock(&info->lock);
2906 wake_up(&info->flush_wait);
2907 3262
2908 kfree(async); 3263 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3264 return 0;
3265
3266 return 1;
2909} 3267}
2910 3268
2911static void wait_on_flush(struct btrfs_space_info *info) 3269static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3270 struct btrfs_root *extent_root, u64 alloc_bytes,
3271 u64 flags, int force)
2912{ 3272{
2913 DEFINE_WAIT(wait); 3273 struct btrfs_space_info *space_info;
2914 u64 used; 3274 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3275 int ret = 0;
2915 3276
2916 while (1) { 3277 mutex_lock(&fs_info->chunk_mutex);
2917 prepare_to_wait(&info->flush_wait, &wait,
2918 TASK_UNINTERRUPTIBLE);
2919 spin_lock(&info->lock);
2920 if (!info->flushing) {
2921 spin_unlock(&info->lock);
2922 break;
2923 }
2924 3278
2925 used = info->bytes_used + info->bytes_reserved + 3279 flags = btrfs_reduce_alloc_profile(extent_root, flags);
2926 info->bytes_pinned + info->bytes_readonly + 3280
2927 info->bytes_super + info->bytes_root + 3281 space_info = __find_space_info(extent_root->fs_info, flags);
2928 info->bytes_may_use + info->bytes_delalloc; 3282 if (!space_info) {
2929 if (used < info->total_bytes) { 3283 ret = update_space_info(extent_root->fs_info, flags,
2930 spin_unlock(&info->lock); 3284 0, 0, &space_info);
2931 break; 3285 BUG_ON(ret);
2932 } 3286 }
2933 spin_unlock(&info->lock); 3287 BUG_ON(!space_info);
2934 schedule(); 3288
3289 spin_lock(&space_info->lock);
3290 if (space_info->force_alloc)
3291 force = 1;
3292 if (space_info->full) {
3293 spin_unlock(&space_info->lock);
3294 goto out;
3295 }
3296
3297 if (!force && !should_alloc_chunk(extent_root, space_info,
3298 alloc_bytes)) {
3299 spin_unlock(&space_info->lock);
3300 goto out;
3301 }
3302 spin_unlock(&space_info->lock);
3303
3304 /*
3305 * If we have mixed data/metadata chunks we want to make sure we keep
3306 * allocating mixed chunks instead of individual chunks.
3307 */
3308 if (btrfs_mixed_space_info(space_info))
3309 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3310
3311 /*
3312 * if we're doing a data chunk, go ahead and make sure that
3313 * we keep a reasonable number of metadata chunks allocated in the
3314 * FS as well.
3315 */
3316 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3317 fs_info->data_chunk_allocations++;
3318 if (!(fs_info->data_chunk_allocations %
3319 fs_info->metadata_ratio))
3320 force_metadata_allocation(fs_info);
2935 } 3321 }
2936 finish_wait(&info->flush_wait, &wait); 3322
3323 ret = btrfs_alloc_chunk(trans, extent_root, flags);
3324 spin_lock(&space_info->lock);
3325 if (ret)
3326 space_info->full = 1;
3327 else
3328 ret = 1;
3329 space_info->force_alloc = 0;
3330 spin_unlock(&space_info->lock);
3331out:
3332 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3333 return ret;
2937} 3334}
2938 3335
2939static void flush_delalloc(struct btrfs_root *root, 3336/*
2940 struct btrfs_space_info *info) 3337 * shrink metadata reservation for delalloc
3338 */
3339static int shrink_delalloc(struct btrfs_trans_handle *trans,
3340 struct btrfs_root *root, u64 to_reclaim, int sync)
2941{ 3341{
2942 struct async_flush *async; 3342 struct btrfs_block_rsv *block_rsv;
2943 bool wait = false; 3343 struct btrfs_space_info *space_info;
3344 u64 reserved;
3345 u64 max_reclaim;
3346 u64 reclaimed = 0;
3347 int pause = 1;
3348 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
2944 3349
2945 spin_lock(&info->lock); 3350 block_rsv = &root->fs_info->delalloc_block_rsv;
3351 space_info = block_rsv->space_info;
2946 3352
2947 if (!info->flushing) { 3353 smp_mb();
2948 info->flushing = 1; 3354 reserved = space_info->bytes_reserved;
2949 init_waitqueue_head(&info->flush_wait);
2950 } else {
2951 wait = true;
2952 }
2953 3355
2954 spin_unlock(&info->lock); 3356 if (reserved == 0)
3357 return 0;
2955 3358
2956 if (wait) { 3359 max_reclaim = min(reserved, to_reclaim);
2957 wait_on_flush(info);
2958 return;
2959 }
2960 3360
2961 async = kzalloc(sizeof(*async), GFP_NOFS); 3361 while (1) {
2962 if (!async) 3362 /* have the flusher threads jump in and do some IO */
2963 goto flush; 3363 smp_mb();
3364 nr_pages = min_t(unsigned long, nr_pages,
3365 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3366 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3367
3368 spin_lock(&space_info->lock);
3369 if (reserved > space_info->bytes_reserved)
3370 reclaimed += reserved - space_info->bytes_reserved;
3371 reserved = space_info->bytes_reserved;
3372 spin_unlock(&space_info->lock);
2964 3373
2965 async->root = root; 3374 if (reserved == 0 || reclaimed >= max_reclaim)
2966 async->info = info; 3375 break;
2967 async->work.func = flush_delalloc_async;
2968 3376
2969 btrfs_queue_worker(&root->fs_info->enospc_workers, 3377 if (trans && trans->transaction->blocked)
2970 &async->work); 3378 return -EAGAIN;
2971 wait_on_flush(info);
2972 return;
2973 3379
2974flush: 3380 __set_current_state(TASK_INTERRUPTIBLE);
2975 btrfs_start_delalloc_inodes(root, 0); 3381 schedule_timeout(pause);
2976 btrfs_wait_ordered_extents(root, 0, 0); 3382 pause <<= 1;
3383 if (pause > HZ / 10)
3384 pause = HZ / 10;
2977 3385
2978 spin_lock(&info->lock); 3386 }
2979 info->flushing = 0; 3387 return reclaimed >= to_reclaim;
2980 spin_unlock(&info->lock);
2981 wake_up(&info->flush_wait);
2982} 3388}
2983 3389
2984static int maybe_allocate_chunk(struct btrfs_root *root, 3390/*
2985 struct btrfs_space_info *info) 3391 * Retries tells us how many times we've called reserve_metadata_bytes. The
3392 * idea is if this is the first call (retries == 0) then we will add to our
3393 * reserved count if we can't make the allocation in order to hold our place
3394 * while we go and try and free up space. That way for retries > 1 we don't try
3395 * and add space, we just check to see if the amount of unused space is >= the
3396 * total space, meaning that our reservation is valid.
3397 *
3398 * However if we don't intend to retry this reservation, pass -1 as retries so
3399 * that it short circuits this logic.
3400 */
3401static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
3402 struct btrfs_root *root,
3403 struct btrfs_block_rsv *block_rsv,
3404 u64 orig_bytes, int flush)
2986{ 3405{
2987 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 3406 struct btrfs_space_info *space_info = block_rsv->space_info;
2988 struct btrfs_trans_handle *trans; 3407 u64 unused;
2989 bool wait = false; 3408 u64 num_bytes = orig_bytes;
3409 int retries = 0;
2990 int ret = 0; 3410 int ret = 0;
2991 u64 min_metadata; 3411 bool reserved = false;
2992 u64 free_space; 3412 bool committed = false;
3413
3414again:
3415 ret = -ENOSPC;
3416 if (reserved)
3417 num_bytes = 0;
3418
3419 spin_lock(&space_info->lock);
3420 unused = space_info->bytes_used + space_info->bytes_reserved +
3421 space_info->bytes_pinned + space_info->bytes_readonly +
3422 space_info->bytes_may_use;
2993 3423
2994 free_space = btrfs_super_total_bytes(disk_super);
2995 /* 3424 /*
2996 * we allow the metadata to grow to a max of either 10gb or 5% of the 3425 * The idea here is that we've not already over-reserved the block group
2997 * space in the volume. 3426 * then we can go ahead and save our reservation first and then start
3427 * flushing if we need to. Otherwise if we've already overcommitted
3428 * lets start flushing stuff first and then come back and try to make
3429 * our reservation.
2998 */ 3430 */
2999 min_metadata = min((u64)10 * 1024 * 1024 * 1024, 3431 if (unused <= space_info->total_bytes) {
3000 div64_u64(free_space * 5, 100)); 3432 unused = space_info->total_bytes - unused;
3001 if (info->total_bytes >= min_metadata) { 3433 if (unused >= num_bytes) {
3002 spin_unlock(&info->lock); 3434 if (!reserved)
3003 return 0; 3435 space_info->bytes_reserved += orig_bytes;
3436 ret = 0;
3437 } else {
3438 /*
3439 * Ok set num_bytes to orig_bytes since we aren't
3440 * overocmmitted, this way we only try and reclaim what
3441 * we need.
3442 */
3443 num_bytes = orig_bytes;
3444 }
3445 } else {
3446 /*
3447 * Ok we're over committed, set num_bytes to the overcommitted
3448 * amount plus the amount of bytes that we need for this
3449 * reservation.
3450 */
3451 num_bytes = unused - space_info->total_bytes +
3452 (orig_bytes * (retries + 1));
3004 } 3453 }
3005 3454
3006 if (info->full) { 3455 /*
3007 spin_unlock(&info->lock); 3456 * Couldn't make our reservation, save our place so while we're trying
3008 return 0; 3457 * to reclaim space we can actually use it instead of somebody else
3458 * stealing it from us.
3459 */
3460 if (ret && !reserved) {
3461 space_info->bytes_reserved += orig_bytes;
3462 reserved = true;
3009 } 3463 }
3010 3464
3011 if (!info->allocating_chunk) { 3465 spin_unlock(&space_info->lock);
3012 info->force_alloc = 1;
3013 info->allocating_chunk = 1;
3014 init_waitqueue_head(&info->allocate_wait);
3015 } else {
3016 wait = true;
3017 }
3018 3466
3019 spin_unlock(&info->lock); 3467 if (!ret)
3468 return 0;
3020 3469
3021 if (wait) { 3470 if (!flush)
3022 wait_event(info->allocate_wait, 3471 goto out;
3023 !info->allocating_chunk);
3024 return 1;
3025 }
3026 3472
3027 trans = btrfs_start_transaction(root, 1); 3473 /*
3028 if (!trans) { 3474 * We do synchronous shrinking since we don't actually unreserve
3029 ret = -ENOMEM; 3475 * metadata until after the IO is completed.
3476 */
3477 ret = shrink_delalloc(trans, root, num_bytes, 1);
3478 if (ret > 0)
3479 return 0;
3480 else if (ret < 0)
3030 goto out; 3481 goto out;
3482
3483 /*
3484 * So if we were overcommitted it's possible that somebody else flushed
3485 * out enough space and we simply didn't have enough space to reclaim,
3486 * so go back around and try again.
3487 */
3488 if (retries < 2) {
3489 retries++;
3490 goto again;
3031 } 3491 }
3032 3492
3033 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3493 spin_lock(&space_info->lock);
3034 4096 + 2 * 1024 * 1024, 3494 /*
3035 info->flags, 0); 3495 * Not enough space to be reclaimed, don't bother committing the
3036 btrfs_end_transaction(trans, root); 3496 * transaction.
3497 */
3498 if (space_info->bytes_pinned < orig_bytes)
3499 ret = -ENOSPC;
3500 spin_unlock(&space_info->lock);
3037 if (ret) 3501 if (ret)
3038 goto out; 3502 goto out;
3503
3504 ret = -EAGAIN;
3505 if (trans || committed)
3506 goto out;
3507
3508 ret = -ENOSPC;
3509 trans = btrfs_join_transaction(root, 1);
3510 if (IS_ERR(trans))
3511 goto out;
3512 ret = btrfs_commit_transaction(trans, root);
3513 if (!ret) {
3514 trans = NULL;
3515 committed = true;
3516 goto again;
3517 }
3518
3039out: 3519out:
3040 spin_lock(&info->lock); 3520 if (reserved) {
3041 info->allocating_chunk = 0; 3521 spin_lock(&space_info->lock);
3042 spin_unlock(&info->lock); 3522 space_info->bytes_reserved -= orig_bytes;
3043 wake_up(&info->allocate_wait); 3523 spin_unlock(&space_info->lock);
3524 }
3044 3525
3045 if (ret) 3526 return ret;
3046 return 0;
3047 return 1;
3048} 3527}
3049 3528
3050/* 3529static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3051 * Reserve metadata space for delalloc. 3530 struct btrfs_root *root)
3052 */
3053int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3054 struct inode *inode, int num_items)
3055{ 3531{
3056 struct btrfs_fs_info *info = root->fs_info; 3532 struct btrfs_block_rsv *block_rsv;
3057 struct btrfs_space_info *meta_sinfo; 3533 if (root->ref_cows)
3058 u64 num_bytes; 3534 block_rsv = trans->block_rsv;
3059 u64 used; 3535 else
3060 u64 alloc_target; 3536 block_rsv = root->block_rsv;
3061 int flushed = 0;
3062 int force_delalloc;
3063
3064 /* get the space info for where the metadata will live */
3065 alloc_target = btrfs_get_alloc_profile(root, 0);
3066 meta_sinfo = __find_space_info(info, alloc_target);
3067 3537
3068 num_bytes = calculate_bytes_needed(root->fs_info->extent_root, 3538 if (!block_rsv)
3069 num_items); 3539 block_rsv = &root->fs_info->empty_block_rsv;
3070again:
3071 spin_lock(&meta_sinfo->lock);
3072 3540
3073 force_delalloc = meta_sinfo->force_delalloc; 3541 return block_rsv;
3542}
3074 3543
3075 if (unlikely(!meta_sinfo->bytes_root)) 3544static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); 3545 u64 num_bytes)
3546{
3547 int ret = -ENOSPC;
3548 spin_lock(&block_rsv->lock);
3549 if (block_rsv->reserved >= num_bytes) {
3550 block_rsv->reserved -= num_bytes;
3551 if (block_rsv->reserved < block_rsv->size)
3552 block_rsv->full = 0;
3553 ret = 0;
3554 }
3555 spin_unlock(&block_rsv->lock);
3556 return ret;
3557}
3077 3558
3078 if (!flushed) 3559static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3079 meta_sinfo->bytes_delalloc += num_bytes; 3560 u64 num_bytes, int update_size)
3561{
3562 spin_lock(&block_rsv->lock);
3563 block_rsv->reserved += num_bytes;
3564 if (update_size)
3565 block_rsv->size += num_bytes;
3566 else if (block_rsv->reserved >= block_rsv->size)
3567 block_rsv->full = 1;
3568 spin_unlock(&block_rsv->lock);
3569}
3080 3570
3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 3571void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 3572 struct btrfs_block_rsv *dest, u64 num_bytes)
3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 3573{
3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; 3574 struct btrfs_space_info *space_info = block_rsv->space_info;
3085 3575
3086 if (used > meta_sinfo->total_bytes) { 3576 spin_lock(&block_rsv->lock);
3087 flushed++; 3577 if (num_bytes == (u64)-1)
3578 num_bytes = block_rsv->size;
3579 block_rsv->size -= num_bytes;
3580 if (block_rsv->reserved >= block_rsv->size) {
3581 num_bytes = block_rsv->reserved - block_rsv->size;
3582 block_rsv->reserved = block_rsv->size;
3583 block_rsv->full = 1;
3584 } else {
3585 num_bytes = 0;
3586 }
3587 spin_unlock(&block_rsv->lock);
3088 3588
3089 if (flushed == 1) { 3589 if (num_bytes > 0) {
3090 if (maybe_allocate_chunk(root, meta_sinfo)) 3590 if (dest) {
3091 goto again; 3591 block_rsv_add_bytes(dest, num_bytes, 0);
3092 flushed++;
3093 } else { 3592 } else {
3094 spin_unlock(&meta_sinfo->lock); 3593 spin_lock(&space_info->lock);
3095 } 3594 space_info->bytes_reserved -= num_bytes;
3096 3595 spin_unlock(&space_info->lock);
3097 if (flushed == 2) {
3098 filemap_flush(inode->i_mapping);
3099 goto again;
3100 } else if (flushed == 3) {
3101 flush_delalloc(root, meta_sinfo);
3102 goto again;
3103 } 3596 }
3104 spin_lock(&meta_sinfo->lock);
3105 meta_sinfo->bytes_delalloc -= num_bytes;
3106 spin_unlock(&meta_sinfo->lock);
3107 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3108 BTRFS_I(inode)->outstanding_extents,
3109 BTRFS_I(inode)->reserved_extents);
3110 dump_space_info(meta_sinfo, 0, 0);
3111 return -ENOSPC;
3112 } 3597 }
3598}
3113 3599
3114 BTRFS_I(inode)->reserved_extents++; 3600static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3115 check_force_delalloc(meta_sinfo); 3601 struct btrfs_block_rsv *dst, u64 num_bytes)
3116 spin_unlock(&meta_sinfo->lock); 3602{
3603 int ret;
3117 3604
3118 if (!flushed && force_delalloc) 3605 ret = block_rsv_use_bytes(src, num_bytes);
3119 filemap_flush(inode->i_mapping); 3606 if (ret)
3607 return ret;
3120 3608
3609 block_rsv_add_bytes(dst, num_bytes, 1);
3121 return 0; 3610 return 0;
3122} 3611}
3123 3612
3124/* 3613void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3125 * unreserve num_items number of items worth of metadata space. This needs to
3126 * be paired with btrfs_reserve_metadata_space.
3127 *
3128 * NOTE: if you have the option, run this _AFTER_ you do a
3129 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3130 * oprations which will result in more used metadata, so we want to make sure we
3131 * can do that without issue.
3132 */
3133int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3134{ 3614{
3135 struct btrfs_fs_info *info = root->fs_info; 3615 memset(rsv, 0, sizeof(*rsv));
3136 struct btrfs_space_info *meta_sinfo; 3616 spin_lock_init(&rsv->lock);
3137 u64 num_bytes; 3617 atomic_set(&rsv->usage, 1);
3138 u64 alloc_target; 3618 rsv->priority = 6;
3139 bool bug = false; 3619 INIT_LIST_HEAD(&rsv->list);
3140 3620}
3141 /* get the space info for where the metadata will live */
3142 alloc_target = btrfs_get_alloc_profile(root, 0);
3143 meta_sinfo = __find_space_info(info, alloc_target);
3144 3621
3145 num_bytes = calculate_bytes_needed(root, num_items); 3622struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3623{
3624 struct btrfs_block_rsv *block_rsv;
3625 struct btrfs_fs_info *fs_info = root->fs_info;
3146 3626
3147 spin_lock(&meta_sinfo->lock); 3627 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3148 if (meta_sinfo->bytes_may_use < num_bytes) { 3628 if (!block_rsv)
3149 bug = true; 3629 return NULL;
3150 meta_sinfo->bytes_may_use = 0;
3151 } else {
3152 meta_sinfo->bytes_may_use -= num_bytes;
3153 }
3154 spin_unlock(&meta_sinfo->lock);
3155 3630
3156 BUG_ON(bug); 3631 btrfs_init_block_rsv(block_rsv);
3632 block_rsv->space_info = __find_space_info(fs_info,
3633 BTRFS_BLOCK_GROUP_METADATA);
3634 return block_rsv;
3635}
3157 3636
3158 return 0; 3637void btrfs_free_block_rsv(struct btrfs_root *root,
3638 struct btrfs_block_rsv *rsv)
3639{
3640 if (rsv && atomic_dec_and_test(&rsv->usage)) {
3641 btrfs_block_rsv_release(root, rsv, (u64)-1);
3642 if (!rsv->durable)
3643 kfree(rsv);
3644 }
3159} 3645}
3160 3646
3161/* 3647/*
3162 * Reserve some metadata space for use. We'll calculate the worste case number 3648 * make the block_rsv struct be able to capture freed space.
3163 * of bytes that would be needed to modify num_items number of items. If we 3649 * the captured space will re-add to the the block_rsv struct
3164 * have space, fantastic, if not, you get -ENOSPC. Please call 3650 * after transaction commit
3165 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3166 * items you reserved, since whatever metadata you needed should have already
3167 * been allocated.
3168 *
3169 * This will commit the transaction to make more space if we don't have enough
3170 * metadata space. THe only time we don't do this is if we're reserving space
3171 * inside of a transaction, then we will just return -ENOSPC and it is the
3172 * callers responsibility to handle it properly.
3173 */ 3651 */
3174int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items) 3652void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3653 struct btrfs_block_rsv *block_rsv)
3175{ 3654{
3176 struct btrfs_fs_info *info = root->fs_info; 3655 block_rsv->durable = 1;
3177 struct btrfs_space_info *meta_sinfo; 3656 mutex_lock(&fs_info->durable_block_rsv_mutex);
3178 u64 num_bytes; 3657 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3179 u64 used; 3658 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3180 u64 alloc_target; 3659}
3181 int retries = 0;
3182 3660
3183 /* get the space info for where the metadata will live */ 3661int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3184 alloc_target = btrfs_get_alloc_profile(root, 0); 3662 struct btrfs_root *root,
3185 meta_sinfo = __find_space_info(info, alloc_target); 3663 struct btrfs_block_rsv *block_rsv,
3664 u64 num_bytes)
3665{
3666 int ret;
3186 3667
3187 num_bytes = calculate_bytes_needed(root, num_items); 3668 if (num_bytes == 0)
3188again: 3669 return 0;
3189 spin_lock(&meta_sinfo->lock);
3190 3670
3191 if (unlikely(!meta_sinfo->bytes_root)) 3671 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
3192 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6); 3672 if (!ret) {
3673 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3674 return 0;
3675 }
3193 3676
3194 if (!retries) 3677 return ret;
3195 meta_sinfo->bytes_may_use += num_bytes; 3678}
3196 3679
3197 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 3680int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3198 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 3681 struct btrfs_root *root,
3199 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 3682 struct btrfs_block_rsv *block_rsv,
3200 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc; 3683 u64 min_reserved, int min_factor)
3684{
3685 u64 num_bytes = 0;
3686 int commit_trans = 0;
3687 int ret = -ENOSPC;
3201 3688
3202 if (used > meta_sinfo->total_bytes) { 3689 if (!block_rsv)
3203 retries++; 3690 return 0;
3204 if (retries == 1) {
3205 if (maybe_allocate_chunk(root, meta_sinfo))
3206 goto again;
3207 retries++;
3208 } else {
3209 spin_unlock(&meta_sinfo->lock);
3210 }
3211 3691
3212 if (retries == 2) { 3692 spin_lock(&block_rsv->lock);
3213 flush_delalloc(root, meta_sinfo); 3693 if (min_factor > 0)
3214 goto again; 3694 num_bytes = div_factor(block_rsv->size, min_factor);
3695 if (min_reserved > num_bytes)
3696 num_bytes = min_reserved;
3697
3698 if (block_rsv->reserved >= num_bytes) {
3699 ret = 0;
3700 } else {
3701 num_bytes -= block_rsv->reserved;
3702 if (block_rsv->durable &&
3703 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3704 commit_trans = 1;
3705 }
3706 spin_unlock(&block_rsv->lock);
3707 if (!ret)
3708 return 0;
3709
3710 if (block_rsv->refill_used) {
3711 ret = reserve_metadata_bytes(trans, root, block_rsv,
3712 num_bytes, 0);
3713 if (!ret) {
3714 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3715 return 0;
3215 } 3716 }
3216 spin_lock(&meta_sinfo->lock); 3717 }
3217 meta_sinfo->bytes_may_use -= num_bytes;
3218 spin_unlock(&meta_sinfo->lock);
3219 3718
3220 dump_space_info(meta_sinfo, 0, 0); 3719 if (commit_trans) {
3221 return -ENOSPC; 3720 if (trans)
3721 return -EAGAIN;
3722
3723 trans = btrfs_join_transaction(root, 1);
3724 BUG_ON(IS_ERR(trans));
3725 ret = btrfs_commit_transaction(trans, root);
3726 return 0;
3222 } 3727 }
3223 3728
3224 check_force_delalloc(meta_sinfo); 3729 return -ENOSPC;
3225 spin_unlock(&meta_sinfo->lock); 3730}
3226 3731
3227 return 0; 3732int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3733 struct btrfs_block_rsv *dst_rsv,
3734 u64 num_bytes)
3735{
3736 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3737}
3738
3739void btrfs_block_rsv_release(struct btrfs_root *root,
3740 struct btrfs_block_rsv *block_rsv,
3741 u64 num_bytes)
3742{
3743 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3744 if (global_rsv->full || global_rsv == block_rsv ||
3745 block_rsv->space_info != global_rsv->space_info)
3746 global_rsv = NULL;
3747 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3228} 3748}
3229 3749
3230/* 3750/*
3231 * This will check the space that the inode allocates from to make sure we have 3751 * helper to calculate size of global block reservation.
3232 * enough space for bytes. 3752 * the desired value is sum of space used by extent tree,
3753 * checksum tree and root tree
3233 */ 3754 */
3234int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 3755static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3235 u64 bytes)
3236{ 3756{
3237 struct btrfs_space_info *data_sinfo; 3757 struct btrfs_space_info *sinfo;
3238 int ret = 0, committed = 0; 3758 u64 num_bytes;
3759 u64 meta_used;
3760 u64 data_used;
3761 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3762#if 0
3763 /*
3764 * per tree used space accounting can be inaccuracy, so we
3765 * can't rely on it.
3766 */
3767 spin_lock(&fs_info->extent_root->accounting_lock);
3768 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3769 spin_unlock(&fs_info->extent_root->accounting_lock);
3239 3770
3240 /* make sure bytes are sectorsize aligned */ 3771 spin_lock(&fs_info->csum_root->accounting_lock);
3241 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3772 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3773 spin_unlock(&fs_info->csum_root->accounting_lock);
3242 3774
3243 data_sinfo = BTRFS_I(inode)->space_info; 3775 spin_lock(&fs_info->tree_root->accounting_lock);
3244 if (!data_sinfo) 3776 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3245 goto alloc; 3777 spin_unlock(&fs_info->tree_root->accounting_lock);
3778#endif
3779 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3780 spin_lock(&sinfo->lock);
3781 data_used = sinfo->bytes_used;
3782 spin_unlock(&sinfo->lock);
3246 3783
3247again: 3784 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3248 /* make sure we have enough space to handle the data first */ 3785 spin_lock(&sinfo->lock);
3249 spin_lock(&data_sinfo->lock); 3786 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
3250 if (data_sinfo->total_bytes - data_sinfo->bytes_used - 3787 data_used = 0;
3251 data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - 3788 meta_used = sinfo->bytes_used;
3252 data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - 3789 spin_unlock(&sinfo->lock);
3253 data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) {
3254 struct btrfs_trans_handle *trans;
3255 3790
3256 /* 3791 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3257 * if we don't have enough free bytes in this space then we need 3792 csum_size * 2;
3258 * to alloc a new chunk. 3793 num_bytes += div64_u64(data_used + meta_used, 50);
3259 */
3260 if (!data_sinfo->full) {
3261 u64 alloc_target;
3262 3794
3263 data_sinfo->force_alloc = 1; 3795 if (num_bytes * 3 > meta_used)
3264 spin_unlock(&data_sinfo->lock); 3796 num_bytes = div64_u64(meta_used, 3);
3265alloc:
3266 alloc_target = btrfs_get_alloc_profile(root, 1);
3267 trans = btrfs_start_transaction(root, 1);
3268 if (!trans)
3269 return -ENOMEM;
3270 3797
3271 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3798 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3272 bytes + 2 * 1024 * 1024, 3799}
3273 alloc_target, 0);
3274 btrfs_end_transaction(trans, root);
3275 if (ret)
3276 return ret;
3277 3800
3278 if (!data_sinfo) { 3801static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3279 btrfs_set_inode_space_info(root, inode); 3802{
3280 data_sinfo = BTRFS_I(inode)->space_info; 3803 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3281 } 3804 struct btrfs_space_info *sinfo = block_rsv->space_info;
3282 goto again; 3805 u64 num_bytes;
3283 }
3284 spin_unlock(&data_sinfo->lock);
3285 3806
3286 /* commit the current transaction and try again */ 3807 num_bytes = calc_global_metadata_size(fs_info);
3287 if (!committed && !root->fs_info->open_ioctl_trans) {
3288 committed = 1;
3289 trans = btrfs_join_transaction(root, 1);
3290 if (!trans)
3291 return -ENOMEM;
3292 ret = btrfs_commit_transaction(trans, root);
3293 if (ret)
3294 return ret;
3295 goto again;
3296 }
3297 3808
3298 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 3809 spin_lock(&block_rsv->lock);
3299 ", %llu bytes_used, %llu bytes_reserved, " 3810 spin_lock(&sinfo->lock);
3300 "%llu bytes_pinned, %llu bytes_readonly, %llu may use " 3811
3301 "%llu total\n", (unsigned long long)bytes, 3812 block_rsv->size = num_bytes;
3302 (unsigned long long)data_sinfo->bytes_delalloc, 3813
3303 (unsigned long long)data_sinfo->bytes_used, 3814 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3304 (unsigned long long)data_sinfo->bytes_reserved, 3815 sinfo->bytes_reserved + sinfo->bytes_readonly +
3305 (unsigned long long)data_sinfo->bytes_pinned, 3816 sinfo->bytes_may_use;
3306 (unsigned long long)data_sinfo->bytes_readonly, 3817
3307 (unsigned long long)data_sinfo->bytes_may_use, 3818 if (sinfo->total_bytes > num_bytes) {
3308 (unsigned long long)data_sinfo->total_bytes); 3819 num_bytes = sinfo->total_bytes - num_bytes;
3309 return -ENOSPC; 3820 block_rsv->reserved += num_bytes;
3821 sinfo->bytes_reserved += num_bytes;
3310 } 3822 }
3311 data_sinfo->bytes_may_use += bytes;
3312 BTRFS_I(inode)->reserved_bytes += bytes;
3313 spin_unlock(&data_sinfo->lock);
3314 3823
3315 return 0; 3824 if (block_rsv->reserved >= block_rsv->size) {
3825 num_bytes = block_rsv->reserved - block_rsv->size;
3826 sinfo->bytes_reserved -= num_bytes;
3827 block_rsv->reserved = block_rsv->size;
3828 block_rsv->full = 1;
3829 }
3830#if 0
3831 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3832 block_rsv->size, block_rsv->reserved);
3833#endif
3834 spin_unlock(&sinfo->lock);
3835 spin_unlock(&block_rsv->lock);
3316} 3836}
3317 3837
3318/* 3838static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3319 * if there was an error for whatever reason after calling
3320 * btrfs_check_data_free_space, call this so we can cleanup the counters.
3321 */
3322void btrfs_free_reserved_data_space(struct btrfs_root *root,
3323 struct inode *inode, u64 bytes)
3324{ 3839{
3325 struct btrfs_space_info *data_sinfo; 3840 struct btrfs_space_info *space_info;
3326 3841
3327 /* make sure bytes are sectorsize aligned */ 3842 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3328 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3843 fs_info->chunk_block_rsv.space_info = space_info;
3844 fs_info->chunk_block_rsv.priority = 10;
3329 3845
3330 data_sinfo = BTRFS_I(inode)->space_info; 3846 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3331 spin_lock(&data_sinfo->lock); 3847 fs_info->global_block_rsv.space_info = space_info;
3332 data_sinfo->bytes_may_use -= bytes; 3848 fs_info->global_block_rsv.priority = 10;
3333 BTRFS_I(inode)->reserved_bytes -= bytes; 3849 fs_info->global_block_rsv.refill_used = 1;
3334 spin_unlock(&data_sinfo->lock); 3850 fs_info->delalloc_block_rsv.space_info = space_info;
3851 fs_info->trans_block_rsv.space_info = space_info;
3852 fs_info->empty_block_rsv.space_info = space_info;
3853 fs_info->empty_block_rsv.priority = 10;
3854
3855 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3856 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3857 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3858 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3859 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3860
3861 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3862
3863 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3864
3865 update_global_block_rsv(fs_info);
3335} 3866}
3336 3867
3337/* called when we are adding a delalloc extent to the inode's io_tree */ 3868static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3338void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3339 u64 bytes)
3340{ 3869{
3341 struct btrfs_space_info *data_sinfo; 3870 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3871 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3872 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3873 WARN_ON(fs_info->trans_block_rsv.size > 0);
3874 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3875 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3876 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3877}
3342 3878
3343 /* get the space info for where this inode will be storing its data */ 3879static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3344 data_sinfo = BTRFS_I(inode)->space_info; 3880{
3881 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3882 3 * num_items;
3883}
3345 3884
3346 /* make sure we have enough space to handle the data first */ 3885int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3347 spin_lock(&data_sinfo->lock); 3886 struct btrfs_root *root,
3348 data_sinfo->bytes_delalloc += bytes; 3887 int num_items)
3888{
3889 u64 num_bytes;
3890 int ret;
3349 3891
3350 /* 3892 if (num_items == 0 || root->fs_info->chunk_root == root)
3351 * we are adding a delalloc extent without calling 3893 return 0;
3352 * btrfs_check_data_free_space first. This happens on a weird 3894
3353 * writepage condition, but shouldn't hurt our accounting 3895 num_bytes = calc_trans_metadata_size(root, num_items);
3354 */ 3896 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3355 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { 3897 num_bytes);
3356 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; 3898 if (!ret) {
3357 BTRFS_I(inode)->reserved_bytes = 0; 3899 trans->bytes_reserved += num_bytes;
3358 } else { 3900 trans->block_rsv = &root->fs_info->trans_block_rsv;
3359 data_sinfo->bytes_may_use -= bytes;
3360 BTRFS_I(inode)->reserved_bytes -= bytes;
3361 } 3901 }
3902 return ret;
3903}
3362 3904
3363 spin_unlock(&data_sinfo->lock); 3905void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3906 struct btrfs_root *root)
3907{
3908 if (!trans->bytes_reserved)
3909 return;
3910
3911 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3912 btrfs_block_rsv_release(root, trans->block_rsv,
3913 trans->bytes_reserved);
3914 trans->bytes_reserved = 0;
3364} 3915}
3365 3916
3366/* called when we are clearing an delalloc extent from the inode's io_tree */ 3917int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3367void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, 3918 struct inode *inode)
3368 u64 bytes)
3369{ 3919{
3370 struct btrfs_space_info *info; 3920 struct btrfs_root *root = BTRFS_I(inode)->root;
3921 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3922 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3371 3923
3372 info = BTRFS_I(inode)->space_info; 3924 /*
3925 * one for deleting orphan item, one for updating inode and
3926 * two for calling btrfs_truncate_inode_items.
3927 *
3928 * btrfs_truncate_inode_items is a delete operation, it frees
3929 * more space than it uses in most cases. So two units of
3930 * metadata space should be enough for calling it many times.
3931 * If all of the metadata space is used, we can commit
3932 * transaction and use space it freed.
3933 */
3934 u64 num_bytes = calc_trans_metadata_size(root, 4);
3935 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3936}
3373 3937
3374 spin_lock(&info->lock); 3938void btrfs_orphan_release_metadata(struct inode *inode)
3375 info->bytes_delalloc -= bytes; 3939{
3376 spin_unlock(&info->lock); 3940 struct btrfs_root *root = BTRFS_I(inode)->root;
3941 u64 num_bytes = calc_trans_metadata_size(root, 4);
3942 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3377} 3943}
3378 3944
3379static void force_metadata_allocation(struct btrfs_fs_info *info) 3945int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3946 struct btrfs_pending_snapshot *pending)
3380{ 3947{
3381 struct list_head *head = &info->space_info; 3948 struct btrfs_root *root = pending->root;
3382 struct btrfs_space_info *found; 3949 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3950 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3951 /*
3952 * two for root back/forward refs, two for directory entries
3953 * and one for root of the snapshot.
3954 */
3955 u64 num_bytes = calc_trans_metadata_size(root, 5);
3956 dst_rsv->space_info = src_rsv->space_info;
3957 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3958}
3383 3959
3384 rcu_read_lock(); 3960static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3385 list_for_each_entry_rcu(found, head, list) { 3961{
3386 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3962 return num_bytes >>= 3;
3387 found->force_alloc = 1;
3388 }
3389 rcu_read_unlock();
3390} 3963}
3391 3964
3392static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3965int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3393 struct btrfs_root *extent_root, u64 alloc_bytes,
3394 u64 flags, int force)
3395{ 3966{
3396 struct btrfs_space_info *space_info; 3967 struct btrfs_root *root = BTRFS_I(inode)->root;
3397 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3968 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3398 u64 thresh; 3969 u64 to_reserve;
3399 int ret = 0; 3970 int nr_extents;
3971 int ret;
3400 3972
3401 mutex_lock(&fs_info->chunk_mutex); 3973 if (btrfs_transaction_in_commit(root->fs_info))
3974 schedule_timeout(1);
3402 3975
3403 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3976 num_bytes = ALIGN(num_bytes, root->sectorsize);
3404 3977
3405 space_info = __find_space_info(extent_root->fs_info, flags); 3978 spin_lock(&BTRFS_I(inode)->accounting_lock);
3406 if (!space_info) { 3979 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3407 ret = update_space_info(extent_root->fs_info, flags, 3980 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
3408 0, 0, &space_info); 3981 nr_extents -= BTRFS_I(inode)->reserved_extents;
3409 BUG_ON(ret); 3982 to_reserve = calc_trans_metadata_size(root, nr_extents);
3983 } else {
3984 nr_extents = 0;
3985 to_reserve = 0;
3410 } 3986 }
3411 BUG_ON(!space_info); 3987 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3412 3988
3413 spin_lock(&space_info->lock); 3989 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3414 if (space_info->force_alloc) 3990 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3415 force = 1; 3991 if (ret)
3416 if (space_info->full) { 3992 return ret;
3417 spin_unlock(&space_info->lock);
3418 goto out;
3419 }
3420 3993
3421 thresh = space_info->total_bytes - space_info->bytes_readonly; 3994 spin_lock(&BTRFS_I(inode)->accounting_lock);
3422 thresh = div_factor(thresh, 8); 3995 BTRFS_I(inode)->reserved_extents += nr_extents;
3423 if (!force && 3996 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3424 (space_info->bytes_used + space_info->bytes_pinned + 3997 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3425 space_info->bytes_reserved + alloc_bytes) < thresh) {
3426 spin_unlock(&space_info->lock);
3427 goto out;
3428 }
3429 spin_unlock(&space_info->lock);
3430 3998
3431 /* 3999 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3432 * if we're doing a data chunk, go ahead and make sure that 4000
3433 * we keep a reasonable number of metadata chunks allocated in the 4001 if (block_rsv->size > 512 * 1024 * 1024)
3434 * FS as well. 4002 shrink_delalloc(NULL, root, to_reserve, 0);
3435 */ 4003
3436 if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) { 4004 return 0;
3437 fs_info->data_chunk_allocations++; 4005}
3438 if (!(fs_info->data_chunk_allocations % 4006
3439 fs_info->metadata_ratio)) 4007void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3440 force_metadata_allocation(fs_info); 4008{
4009 struct btrfs_root *root = BTRFS_I(inode)->root;
4010 u64 to_free;
4011 int nr_extents;
4012
4013 num_bytes = ALIGN(num_bytes, root->sectorsize);
4014 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
4015
4016 spin_lock(&BTRFS_I(inode)->accounting_lock);
4017 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
4018 if (nr_extents < BTRFS_I(inode)->reserved_extents) {
4019 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
4020 BTRFS_I(inode)->reserved_extents -= nr_extents;
4021 } else {
4022 nr_extents = 0;
3441 } 4023 }
4024 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3442 4025
3443 ret = btrfs_alloc_chunk(trans, extent_root, flags); 4026 to_free = calc_csum_metadata_size(inode, num_bytes);
3444 spin_lock(&space_info->lock); 4027 if (nr_extents > 0)
4028 to_free += calc_trans_metadata_size(root, nr_extents);
4029
4030 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4031 to_free);
4032}
4033
4034int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4035{
4036 int ret;
4037
4038 ret = btrfs_check_data_free_space(inode, num_bytes);
3445 if (ret) 4039 if (ret)
3446 space_info->full = 1; 4040 return ret;
3447 space_info->force_alloc = 0; 4041
3448 spin_unlock(&space_info->lock); 4042 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3449out: 4043 if (ret) {
3450 mutex_unlock(&extent_root->fs_info->chunk_mutex); 4044 btrfs_free_reserved_data_space(inode, num_bytes);
3451 return ret; 4045 return ret;
4046 }
4047
4048 return 0;
4049}
4050
4051void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4052{
4053 btrfs_delalloc_release_metadata(inode, num_bytes);
4054 btrfs_free_reserved_data_space(inode, num_bytes);
3452} 4055}
3453 4056
3454static int update_block_group(struct btrfs_trans_handle *trans, 4057static int update_block_group(struct btrfs_trans_handle *trans,
3455 struct btrfs_root *root, 4058 struct btrfs_root *root,
3456 u64 bytenr, u64 num_bytes, int alloc, 4059 u64 bytenr, u64 num_bytes, int alloc)
3457 int mark_free)
3458{ 4060{
3459 struct btrfs_block_group_cache *cache; 4061 struct btrfs_block_group_cache *cache = NULL;
3460 struct btrfs_fs_info *info = root->fs_info; 4062 struct btrfs_fs_info *info = root->fs_info;
3461 u64 total = num_bytes; 4063 u64 total = num_bytes;
3462 u64 old_val; 4064 u64 old_val;
3463 u64 byte_in_group; 4065 u64 byte_in_group;
4066 int factor;
3464 4067
3465 /* block accounting for super block */ 4068 /* block accounting for super block */
3466 spin_lock(&info->delalloc_lock); 4069 spin_lock(&info->delalloc_lock);
@@ -3476,11 +4079,31 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3476 cache = btrfs_lookup_block_group(info, bytenr); 4079 cache = btrfs_lookup_block_group(info, bytenr);
3477 if (!cache) 4080 if (!cache)
3478 return -1; 4081 return -1;
4082 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4083 BTRFS_BLOCK_GROUP_RAID1 |
4084 BTRFS_BLOCK_GROUP_RAID10))
4085 factor = 2;
4086 else
4087 factor = 1;
4088 /*
4089 * If this block group has free space cache written out, we
4090 * need to make sure to load it if we are removing space. This
4091 * is because we need the unpinning stage to actually add the
4092 * space back to the block group, otherwise we will leak space.
4093 */
4094 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4095 cache_block_group(cache, trans, NULL, 1);
4096
3479 byte_in_group = bytenr - cache->key.objectid; 4097 byte_in_group = bytenr - cache->key.objectid;
3480 WARN_ON(byte_in_group > cache->key.offset); 4098 WARN_ON(byte_in_group > cache->key.offset);
3481 4099
3482 spin_lock(&cache->space_info->lock); 4100 spin_lock(&cache->space_info->lock);
3483 spin_lock(&cache->lock); 4101 spin_lock(&cache->lock);
4102
4103 if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
4104 cache->disk_cache_state < BTRFS_DC_CLEAR)
4105 cache->disk_cache_state = BTRFS_DC_CLEAR;
4106
3484 cache->dirty = 1; 4107 cache->dirty = 1;
3485 old_val = btrfs_block_group_used(&cache->item); 4108 old_val = btrfs_block_group_used(&cache->item);
3486 num_bytes = min(total, cache->key.offset - byte_in_group); 4109 num_bytes = min(total, cache->key.offset - byte_in_group);
@@ -3488,31 +4111,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3488 old_val += num_bytes; 4111 old_val += num_bytes;
3489 btrfs_set_block_group_used(&cache->item, old_val); 4112 btrfs_set_block_group_used(&cache->item, old_val);
3490 cache->reserved -= num_bytes; 4113 cache->reserved -= num_bytes;
3491 cache->space_info->bytes_used += num_bytes;
3492 cache->space_info->bytes_reserved -= num_bytes; 4114 cache->space_info->bytes_reserved -= num_bytes;
3493 if (cache->ro) 4115 cache->space_info->bytes_used += num_bytes;
3494 cache->space_info->bytes_readonly -= num_bytes; 4116 cache->space_info->disk_used += num_bytes * factor;
3495 spin_unlock(&cache->lock); 4117 spin_unlock(&cache->lock);
3496 spin_unlock(&cache->space_info->lock); 4118 spin_unlock(&cache->space_info->lock);
3497 } else { 4119 } else {
3498 old_val -= num_bytes; 4120 old_val -= num_bytes;
3499 cache->space_info->bytes_used -= num_bytes;
3500 if (cache->ro)
3501 cache->space_info->bytes_readonly += num_bytes;
3502 btrfs_set_block_group_used(&cache->item, old_val); 4121 btrfs_set_block_group_used(&cache->item, old_val);
4122 cache->pinned += num_bytes;
4123 cache->space_info->bytes_pinned += num_bytes;
4124 cache->space_info->bytes_used -= num_bytes;
4125 cache->space_info->disk_used -= num_bytes * factor;
3503 spin_unlock(&cache->lock); 4126 spin_unlock(&cache->lock);
3504 spin_unlock(&cache->space_info->lock); 4127 spin_unlock(&cache->space_info->lock);
3505 if (mark_free) {
3506 int ret;
3507
3508 ret = btrfs_discard_extent(root, bytenr,
3509 num_bytes);
3510 WARN_ON(ret);
3511 4128
3512 ret = btrfs_add_free_space(cache, bytenr, 4129 set_extent_dirty(info->pinned_extents,
3513 num_bytes); 4130 bytenr, bytenr + num_bytes - 1,
3514 WARN_ON(ret); 4131 GFP_NOFS | __GFP_NOFAIL);
3515 }
3516 } 4132 }
3517 btrfs_put_block_group(cache); 4133 btrfs_put_block_group(cache);
3518 total -= num_bytes; 4134 total -= num_bytes;
@@ -3536,18 +4152,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3536 return bytenr; 4152 return bytenr;
3537} 4153}
3538 4154
3539/* 4155static int pin_down_extent(struct btrfs_root *root,
3540 * this function must be called within transaction 4156 struct btrfs_block_group_cache *cache,
3541 */ 4157 u64 bytenr, u64 num_bytes, int reserved)
3542int btrfs_pin_extent(struct btrfs_root *root,
3543 u64 bytenr, u64 num_bytes, int reserved)
3544{ 4158{
3545 struct btrfs_fs_info *fs_info = root->fs_info;
3546 struct btrfs_block_group_cache *cache;
3547
3548 cache = btrfs_lookup_block_group(fs_info, bytenr);
3549 BUG_ON(!cache);
3550
3551 spin_lock(&cache->space_info->lock); 4159 spin_lock(&cache->space_info->lock);
3552 spin_lock(&cache->lock); 4160 spin_lock(&cache->lock);
3553 cache->pinned += num_bytes; 4161 cache->pinned += num_bytes;
@@ -3559,28 +4167,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
3559 spin_unlock(&cache->lock); 4167 spin_unlock(&cache->lock);
3560 spin_unlock(&cache->space_info->lock); 4168 spin_unlock(&cache->space_info->lock);
3561 4169
3562 btrfs_put_block_group(cache); 4170 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4171 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4172 return 0;
4173}
3563 4174
3564 set_extent_dirty(fs_info->pinned_extents, 4175/*
3565 bytenr, bytenr + num_bytes - 1, GFP_NOFS); 4176 * this function must be called within transaction
4177 */
4178int btrfs_pin_extent(struct btrfs_root *root,
4179 u64 bytenr, u64 num_bytes, int reserved)
4180{
4181 struct btrfs_block_group_cache *cache;
4182
4183 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4184 BUG_ON(!cache);
4185
4186 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4187
4188 btrfs_put_block_group(cache);
3566 return 0; 4189 return 0;
3567} 4190}
3568 4191
3569static int update_reserved_extents(struct btrfs_block_group_cache *cache, 4192/*
3570 u64 num_bytes, int reserve) 4193 * update size of reserved extents. this function may return -EAGAIN
4194 * if 'reserve' is true or 'sinfo' is false.
4195 */
4196static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
4197 u64 num_bytes, int reserve, int sinfo)
3571{ 4198{
3572 spin_lock(&cache->space_info->lock); 4199 int ret = 0;
3573 spin_lock(&cache->lock); 4200 if (sinfo) {
3574 if (reserve) { 4201 struct btrfs_space_info *space_info = cache->space_info;
3575 cache->reserved += num_bytes; 4202 spin_lock(&space_info->lock);
3576 cache->space_info->bytes_reserved += num_bytes; 4203 spin_lock(&cache->lock);
4204 if (reserve) {
4205 if (cache->ro) {
4206 ret = -EAGAIN;
4207 } else {
4208 cache->reserved += num_bytes;
4209 space_info->bytes_reserved += num_bytes;
4210 }
4211 } else {
4212 if (cache->ro)
4213 space_info->bytes_readonly += num_bytes;
4214 cache->reserved -= num_bytes;
4215 space_info->bytes_reserved -= num_bytes;
4216 }
4217 spin_unlock(&cache->lock);
4218 spin_unlock(&space_info->lock);
3577 } else { 4219 } else {
3578 cache->reserved -= num_bytes; 4220 spin_lock(&cache->lock);
3579 cache->space_info->bytes_reserved -= num_bytes; 4221 if (cache->ro) {
4222 ret = -EAGAIN;
4223 } else {
4224 if (reserve)
4225 cache->reserved += num_bytes;
4226 else
4227 cache->reserved -= num_bytes;
4228 }
4229 spin_unlock(&cache->lock);
3580 } 4230 }
3581 spin_unlock(&cache->lock); 4231 return ret;
3582 spin_unlock(&cache->space_info->lock);
3583 return 0;
3584} 4232}
3585 4233
3586int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 4234int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3611,6 +4259,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3611 fs_info->pinned_extents = &fs_info->freed_extents[0]; 4259 fs_info->pinned_extents = &fs_info->freed_extents[0];
3612 4260
3613 up_write(&fs_info->extent_commit_sem); 4261 up_write(&fs_info->extent_commit_sem);
4262
4263 update_global_block_rsv(fs_info);
3614 return 0; 4264 return 0;
3615} 4265}
3616 4266
@@ -3637,14 +4287,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3637 btrfs_add_free_space(cache, start, len); 4287 btrfs_add_free_space(cache, start, len);
3638 } 4288 }
3639 4289
4290 start += len;
4291
3640 spin_lock(&cache->space_info->lock); 4292 spin_lock(&cache->space_info->lock);
3641 spin_lock(&cache->lock); 4293 spin_lock(&cache->lock);
3642 cache->pinned -= len; 4294 cache->pinned -= len;
3643 cache->space_info->bytes_pinned -= len; 4295 cache->space_info->bytes_pinned -= len;
4296 if (cache->ro) {
4297 cache->space_info->bytes_readonly += len;
4298 } else if (cache->reserved_pinned > 0) {
4299 len = min(len, cache->reserved_pinned);
4300 cache->reserved_pinned -= len;
4301 cache->space_info->bytes_reserved += len;
4302 }
3644 spin_unlock(&cache->lock); 4303 spin_unlock(&cache->lock);
3645 spin_unlock(&cache->space_info->lock); 4304 spin_unlock(&cache->space_info->lock);
3646
3647 start += len;
3648 } 4305 }
3649 4306
3650 if (cache) 4307 if (cache)
@@ -3657,8 +4314,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3657{ 4314{
3658 struct btrfs_fs_info *fs_info = root->fs_info; 4315 struct btrfs_fs_info *fs_info = root->fs_info;
3659 struct extent_io_tree *unpin; 4316 struct extent_io_tree *unpin;
4317 struct btrfs_block_rsv *block_rsv;
4318 struct btrfs_block_rsv *next_rsv;
3660 u64 start; 4319 u64 start;
3661 u64 end; 4320 u64 end;
4321 int idx;
3662 int ret; 4322 int ret;
3663 4323
3664 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4324 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3679,59 +4339,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3679 cond_resched(); 4339 cond_resched();
3680 } 4340 }
3681 4341
3682 return ret; 4342 mutex_lock(&fs_info->durable_block_rsv_mutex);
3683} 4343 list_for_each_entry_safe(block_rsv, next_rsv,
3684 4344 &fs_info->durable_block_rsv_list, list) {
3685static int pin_down_bytes(struct btrfs_trans_handle *trans,
3686 struct btrfs_root *root,
3687 struct btrfs_path *path,
3688 u64 bytenr, u64 num_bytes,
3689 int is_data, int reserved,
3690 struct extent_buffer **must_clean)
3691{
3692 int err = 0;
3693 struct extent_buffer *buf;
3694
3695 if (is_data)
3696 goto pinit;
3697 4345
3698 /* 4346 idx = trans->transid & 0x1;
3699 * discard is sloooow, and so triggering discards on 4347 if (block_rsv->freed[idx] > 0) {
3700 * individual btree blocks isn't a good plan. Just 4348 block_rsv_add_bytes(block_rsv,
3701 * pin everything in discard mode. 4349 block_rsv->freed[idx], 0);
3702 */ 4350 block_rsv->freed[idx] = 0;
3703 if (btrfs_test_opt(root, DISCARD)) 4351 }
3704 goto pinit; 4352 if (atomic_read(&block_rsv->usage) == 0) {
3705 4353 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3706 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3707 if (!buf)
3708 goto pinit;
3709 4354
3710 /* we can reuse a block if it hasn't been written 4355 if (block_rsv->freed[0] == 0 &&
3711 * and it is from this transaction. We can't 4356 block_rsv->freed[1] == 0) {
3712 * reuse anything from the tree log root because 4357 list_del_init(&block_rsv->list);
3713 * it has tiny sub-transactions. 4358 kfree(block_rsv);
3714 */ 4359 }
3715 if (btrfs_buffer_uptodate(buf, 0) && 4360 } else {
3716 btrfs_try_tree_lock(buf)) { 4361 btrfs_block_rsv_release(root, block_rsv, 0);
3717 u64 header_owner = btrfs_header_owner(buf);
3718 u64 header_transid = btrfs_header_generation(buf);
3719 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3720 header_transid == trans->transid &&
3721 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3722 *must_clean = buf;
3723 return 1;
3724 } 4362 }
3725 btrfs_tree_unlock(buf);
3726 } 4363 }
3727 free_extent_buffer(buf); 4364 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3728pinit:
3729 if (path)
3730 btrfs_set_path_blocking(path);
3731 /* unlocks the pinned mutex */
3732 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3733 4365
3734 BUG_ON(err < 0);
3735 return 0; 4366 return 0;
3736} 4367}
3737 4368
@@ -3892,9 +4523,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3892 BUG_ON(ret); 4523 BUG_ON(ret);
3893 } 4524 }
3894 } else { 4525 } else {
3895 int mark_free = 0;
3896 struct extent_buffer *must_clean = NULL;
3897
3898 if (found_extent) { 4526 if (found_extent) {
3899 BUG_ON(is_data && refs_to_drop != 4527 BUG_ON(is_data && refs_to_drop !=
3900 extent_data_ref_count(root, path, iref)); 4528 extent_data_ref_count(root, path, iref));
@@ -3907,31 +4535,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3907 } 4535 }
3908 } 4536 }
3909 4537
3910 ret = pin_down_bytes(trans, root, path, bytenr,
3911 num_bytes, is_data, 0, &must_clean);
3912 if (ret > 0)
3913 mark_free = 1;
3914 BUG_ON(ret < 0);
3915 /*
3916 * it is going to be very rare for someone to be waiting
3917 * on the block we're freeing. del_items might need to
3918 * schedule, so rather than get fancy, just force it
3919 * to blocking here
3920 */
3921 if (must_clean)
3922 btrfs_set_lock_blocking(must_clean);
3923
3924 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4538 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3925 num_to_del); 4539 num_to_del);
3926 BUG_ON(ret); 4540 BUG_ON(ret);
3927 btrfs_release_path(extent_root, path); 4541 btrfs_release_path(extent_root, path);
3928 4542
3929 if (must_clean) {
3930 clean_tree_block(NULL, root, must_clean);
3931 btrfs_tree_unlock(must_clean);
3932 free_extent_buffer(must_clean);
3933 }
3934
3935 if (is_data) { 4543 if (is_data) {
3936 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4544 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3937 BUG_ON(ret); 4545 BUG_ON(ret);
@@ -3941,8 +4549,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3941 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4549 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3942 } 4550 }
3943 4551
3944 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 4552 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
3945 mark_free);
3946 BUG_ON(ret); 4553 BUG_ON(ret);
3947 } 4554 }
3948 btrfs_free_path(path); 4555 btrfs_free_path(path);
@@ -3950,7 +4557,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3950} 4557}
3951 4558
3952/* 4559/*
3953 * when we free an extent, it is possible (and likely) that we free the last 4560 * when we free an block, it is possible (and likely) that we free the last
3954 * delayed ref for that extent as well. This searches the delayed ref tree for 4561 * delayed ref for that extent as well. This searches the delayed ref tree for
3955 * a given extent, and if there are no other delayed refs to be processed, it 4562 * a given extent, and if there are no other delayed refs to be processed, it
3956 * removes it from the tree. 4563 * removes it from the tree.
@@ -3962,7 +4569,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3962 struct btrfs_delayed_ref_root *delayed_refs; 4569 struct btrfs_delayed_ref_root *delayed_refs;
3963 struct btrfs_delayed_ref_node *ref; 4570 struct btrfs_delayed_ref_node *ref;
3964 struct rb_node *node; 4571 struct rb_node *node;
3965 int ret; 4572 int ret = 0;
3966 4573
3967 delayed_refs = &trans->transaction->delayed_refs; 4574 delayed_refs = &trans->transaction->delayed_refs;
3968 spin_lock(&delayed_refs->lock); 4575 spin_lock(&delayed_refs->lock);
@@ -4014,17 +4621,100 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4014 list_del_init(&head->cluster); 4621 list_del_init(&head->cluster);
4015 spin_unlock(&delayed_refs->lock); 4622 spin_unlock(&delayed_refs->lock);
4016 4623
4017 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4624 BUG_ON(head->extent_op);
4018 &head->node, head->extent_op, 4625 if (head->must_insert_reserved)
4019 head->must_insert_reserved); 4626 ret = 1;
4020 BUG_ON(ret); 4627
4628 mutex_unlock(&head->mutex);
4021 btrfs_put_delayed_ref(&head->node); 4629 btrfs_put_delayed_ref(&head->node);
4022 return 0; 4630 return ret;
4023out: 4631out:
4024 spin_unlock(&delayed_refs->lock); 4632 spin_unlock(&delayed_refs->lock);
4025 return 0; 4633 return 0;
4026} 4634}
4027 4635
4636void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4637 struct btrfs_root *root,
4638 struct extent_buffer *buf,
4639 u64 parent, int last_ref)
4640{
4641 struct btrfs_block_rsv *block_rsv;
4642 struct btrfs_block_group_cache *cache = NULL;
4643 int ret;
4644
4645 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4646 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4647 parent, root->root_key.objectid,
4648 btrfs_header_level(buf),
4649 BTRFS_DROP_DELAYED_REF, NULL);
4650 BUG_ON(ret);
4651 }
4652
4653 if (!last_ref)
4654 return;
4655
4656 block_rsv = get_block_rsv(trans, root);
4657 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4658 if (block_rsv->space_info != cache->space_info)
4659 goto out;
4660
4661 if (btrfs_header_generation(buf) == trans->transid) {
4662 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4663 ret = check_ref_cleanup(trans, root, buf->start);
4664 if (!ret)
4665 goto pin;
4666 }
4667
4668 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4669 pin_down_extent(root, cache, buf->start, buf->len, 1);
4670 goto pin;
4671 }
4672
4673 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4674
4675 btrfs_add_free_space(cache, buf->start, buf->len);
4676 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4677 if (ret == -EAGAIN) {
4678 /* block group became read-only */
4679 update_reserved_bytes(cache, buf->len, 0, 1);
4680 goto out;
4681 }
4682
4683 ret = 1;
4684 spin_lock(&block_rsv->lock);
4685 if (block_rsv->reserved < block_rsv->size) {
4686 block_rsv->reserved += buf->len;
4687 ret = 0;
4688 }
4689 spin_unlock(&block_rsv->lock);
4690
4691 if (ret) {
4692 spin_lock(&cache->space_info->lock);
4693 cache->space_info->bytes_reserved -= buf->len;
4694 spin_unlock(&cache->space_info->lock);
4695 }
4696 goto out;
4697 }
4698pin:
4699 if (block_rsv->durable && !cache->ro) {
4700 ret = 0;
4701 spin_lock(&cache->lock);
4702 if (!cache->ro) {
4703 cache->reserved_pinned += buf->len;
4704 ret = 1;
4705 }
4706 spin_unlock(&cache->lock);
4707
4708 if (ret) {
4709 spin_lock(&block_rsv->lock);
4710 block_rsv->freed[trans->transid & 0x1] += buf->len;
4711 spin_unlock(&block_rsv->lock);
4712 }
4713 }
4714out:
4715 btrfs_put_block_group(cache);
4716}
4717
4028int btrfs_free_extent(struct btrfs_trans_handle *trans, 4718int btrfs_free_extent(struct btrfs_trans_handle *trans,
4029 struct btrfs_root *root, 4719 struct btrfs_root *root,
4030 u64 bytenr, u64 num_bytes, u64 parent, 4720 u64 bytenr, u64 num_bytes, u64 parent,
@@ -4046,8 +4736,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4046 parent, root_objectid, (int)owner, 4736 parent, root_objectid, (int)owner,
4047 BTRFS_DROP_DELAYED_REF, NULL); 4737 BTRFS_DROP_DELAYED_REF, NULL);
4048 BUG_ON(ret); 4738 BUG_ON(ret);
4049 ret = check_ref_cleanup(trans, root, bytenr);
4050 BUG_ON(ret);
4051 } else { 4739 } else {
4052 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4740 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4053 parent, root_objectid, owner, 4741 parent, root_objectid, owner,
@@ -4057,21 +4745,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4057 return ret; 4745 return ret;
4058} 4746}
4059 4747
4060int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4061 struct btrfs_root *root,
4062 u64 bytenr, u32 blocksize,
4063 u64 parent, u64 root_objectid, int level)
4064{
4065 u64 used;
4066 spin_lock(&root->node_lock);
4067 used = btrfs_root_used(&root->root_item) - blocksize;
4068 btrfs_set_root_used(&root->root_item, used);
4069 spin_unlock(&root->node_lock);
4070
4071 return btrfs_free_extent(trans, root, bytenr, blocksize,
4072 parent, root_objectid, level, 0);
4073}
4074
4075static u64 stripe_align(struct btrfs_root *root, u64 val) 4748static u64 stripe_align(struct btrfs_root *root, u64 val)
4076{ 4749{
4077 u64 mask = ((u64)root->stripesize - 1); 4750 u64 mask = ((u64)root->stripesize - 1);
@@ -4124,6 +4797,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4124 return 0; 4797 return 0;
4125} 4798}
4126 4799
4800static int get_block_group_index(struct btrfs_block_group_cache *cache)
4801{
4802 int index;
4803 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4804 index = 0;
4805 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4806 index = 1;
4807 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4808 index = 2;
4809 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4810 index = 3;
4811 else
4812 index = 4;
4813 return index;
4814}
4815
4127enum btrfs_loop_type { 4816enum btrfs_loop_type {
4128 LOOP_FIND_IDEAL = 0, 4817 LOOP_FIND_IDEAL = 0,
4129 LOOP_CACHING_NOWAIT = 1, 4818 LOOP_CACHING_NOWAIT = 1,
@@ -4145,7 +4834,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4145 u64 num_bytes, u64 empty_size, 4834 u64 num_bytes, u64 empty_size,
4146 u64 search_start, u64 search_end, 4835 u64 search_start, u64 search_end,
4147 u64 hint_byte, struct btrfs_key *ins, 4836 u64 hint_byte, struct btrfs_key *ins,
4148 u64 exclude_start, u64 exclude_nr,
4149 int data) 4837 int data)
4150{ 4838{
4151 int ret = 0; 4839 int ret = 0;
@@ -4158,9 +4846,11 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4158 struct btrfs_space_info *space_info; 4846 struct btrfs_space_info *space_info;
4159 int last_ptr_loop = 0; 4847 int last_ptr_loop = 0;
4160 int loop = 0; 4848 int loop = 0;
4849 int index = 0;
4161 bool found_uncached_bg = false; 4850 bool found_uncached_bg = false;
4162 bool failed_cluster_refill = false; 4851 bool failed_cluster_refill = false;
4163 bool failed_alloc = false; 4852 bool failed_alloc = false;
4853 bool use_cluster = true;
4164 u64 ideal_cache_percent = 0; 4854 u64 ideal_cache_percent = 0;
4165 u64 ideal_cache_offset = 0; 4855 u64 ideal_cache_offset = 0;
4166 4856
@@ -4170,17 +4860,29 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4170 ins->offset = 0; 4860 ins->offset = 0;
4171 4861
4172 space_info = __find_space_info(root->fs_info, data); 4862 space_info = __find_space_info(root->fs_info, data);
4863 if (!space_info) {
4864 printk(KERN_ERR "No space info for %d\n", data);
4865 return -ENOSPC;
4866 }
4867
4868 /*
4869 * If the space info is for both data and metadata it means we have a
4870 * small filesystem and we can't use the clustering stuff.
4871 */
4872 if (btrfs_mixed_space_info(space_info))
4873 use_cluster = false;
4173 4874
4174 if (orig_root->ref_cows || empty_size) 4875 if (orig_root->ref_cows || empty_size)
4175 allowed_chunk_alloc = 1; 4876 allowed_chunk_alloc = 1;
4176 4877
4177 if (data & BTRFS_BLOCK_GROUP_METADATA) { 4878 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
4178 last_ptr = &root->fs_info->meta_alloc_cluster; 4879 last_ptr = &root->fs_info->meta_alloc_cluster;
4179 if (!btrfs_test_opt(root, SSD)) 4880 if (!btrfs_test_opt(root, SSD))
4180 empty_cluster = 64 * 1024; 4881 empty_cluster = 64 * 1024;
4181 } 4882 }
4182 4883
4183 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { 4884 if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
4885 btrfs_test_opt(root, SSD)) {
4184 last_ptr = &root->fs_info->data_alloc_cluster; 4886 last_ptr = &root->fs_info->data_alloc_cluster;
4185 } 4887 }
4186 4888
@@ -4223,6 +4925,7 @@ ideal_cache:
4223 btrfs_put_block_group(block_group); 4925 btrfs_put_block_group(block_group);
4224 up_read(&space_info->groups_sem); 4926 up_read(&space_info->groups_sem);
4225 } else { 4927 } else {
4928 index = get_block_group_index(block_group);
4226 goto have_block_group; 4929 goto have_block_group;
4227 } 4930 }
4228 } else if (block_group) { 4931 } else if (block_group) {
@@ -4231,17 +4934,42 @@ ideal_cache:
4231 } 4934 }
4232search: 4935search:
4233 down_read(&space_info->groups_sem); 4936 down_read(&space_info->groups_sem);
4234 list_for_each_entry(block_group, &space_info->block_groups, list) { 4937 list_for_each_entry(block_group, &space_info->block_groups[index],
4938 list) {
4235 u64 offset; 4939 u64 offset;
4236 int cached; 4940 int cached;
4237 4941
4238 btrfs_get_block_group(block_group); 4942 btrfs_get_block_group(block_group);
4239 search_start = block_group->key.objectid; 4943 search_start = block_group->key.objectid;
4240 4944
4945 /*
4946 * this can happen if we end up cycling through all the
4947 * raid types, but we want to make sure we only allocate
4948 * for the proper type.
4949 */
4950 if (!block_group_bits(block_group, data)) {
4951 u64 extra = BTRFS_BLOCK_GROUP_DUP |
4952 BTRFS_BLOCK_GROUP_RAID1 |
4953 BTRFS_BLOCK_GROUP_RAID10;
4954
4955 /*
4956 * if they asked for extra copies and this block group
4957 * doesn't provide them, bail. This does allow us to
4958 * fill raid0 from raid1.
4959 */
4960 if ((data & extra) && !(block_group->flags & extra))
4961 goto loop;
4962 }
4963
4241have_block_group: 4964have_block_group:
4242 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4965 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4243 u64 free_percent; 4966 u64 free_percent;
4244 4967
4968 ret = cache_block_group(block_group, trans,
4969 orig_root, 1);
4970 if (block_group->cached == BTRFS_CACHE_FINISHED)
4971 goto have_block_group;
4972
4245 free_percent = btrfs_block_group_used(&block_group->item); 4973 free_percent = btrfs_block_group_used(&block_group->item);
4246 free_percent *= 100; 4974 free_percent *= 100;
4247 free_percent = div64_u64(free_percent, 4975 free_percent = div64_u64(free_percent,
@@ -4262,7 +4990,8 @@ have_block_group:
4262 if (loop > LOOP_CACHING_NOWAIT || 4990 if (loop > LOOP_CACHING_NOWAIT ||
4263 (loop > LOOP_FIND_IDEAL && 4991 (loop > LOOP_FIND_IDEAL &&
4264 atomic_read(&space_info->caching_threads) < 2)) { 4992 atomic_read(&space_info->caching_threads) < 2)) {
4265 ret = cache_block_group(block_group); 4993 ret = cache_block_group(block_group, trans,
4994 orig_root, 0);
4266 BUG_ON(ret); 4995 BUG_ON(ret);
4267 } 4996 }
4268 found_uncached_bg = true; 4997 found_uncached_bg = true;
@@ -4422,23 +5151,22 @@ checks:
4422 goto loop; 5151 goto loop;
4423 } 5152 }
4424 5153
4425 if (exclude_nr > 0 && 5154 ins->objectid = search_start;
4426 (search_start + num_bytes > exclude_start && 5155 ins->offset = num_bytes;
4427 search_start < exclude_start + exclude_nr)) {
4428 search_start = exclude_start + exclude_nr;
4429 5156
5157 if (offset < search_start)
5158 btrfs_add_free_space(block_group, offset,
5159 search_start - offset);
5160 BUG_ON(offset > search_start);
5161
5162 ret = update_reserved_bytes(block_group, num_bytes, 1,
5163 (data & BTRFS_BLOCK_GROUP_DATA));
5164 if (ret == -EAGAIN) {
4430 btrfs_add_free_space(block_group, offset, num_bytes); 5165 btrfs_add_free_space(block_group, offset, num_bytes);
4431 /*
4432 * if search_start is still in this block group
4433 * then we just re-search this block group
4434 */
4435 if (search_start >= block_group->key.objectid &&
4436 search_start < (block_group->key.objectid +
4437 block_group->key.offset))
4438 goto have_block_group;
4439 goto loop; 5166 goto loop;
4440 } 5167 }
4441 5168
5169 /* we are all good, lets return */
4442 ins->objectid = search_start; 5170 ins->objectid = search_start;
4443 ins->offset = num_bytes; 5171 ins->offset = num_bytes;
4444 5172
@@ -4446,18 +5174,18 @@ checks:
4446 btrfs_add_free_space(block_group, offset, 5174 btrfs_add_free_space(block_group, offset,
4447 search_start - offset); 5175 search_start - offset);
4448 BUG_ON(offset > search_start); 5176 BUG_ON(offset > search_start);
4449
4450 update_reserved_extents(block_group, num_bytes, 1);
4451
4452 /* we are all good, lets return */
4453 break; 5177 break;
4454loop: 5178loop:
4455 failed_cluster_refill = false; 5179 failed_cluster_refill = false;
4456 failed_alloc = false; 5180 failed_alloc = false;
5181 BUG_ON(index != get_block_group_index(block_group));
4457 btrfs_put_block_group(block_group); 5182 btrfs_put_block_group(block_group);
4458 } 5183 }
4459 up_read(&space_info->groups_sem); 5184 up_read(&space_info->groups_sem);
4460 5185
5186 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5187 goto search;
5188
4461 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 5189 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4462 * for them to make caching progress. Also 5190 * for them to make caching progress. Also
4463 * determine the best possible bg to cache 5191 * determine the best possible bg to cache
@@ -4471,6 +5199,7 @@ loop:
4471 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 5199 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4472 (found_uncached_bg || empty_size || empty_cluster || 5200 (found_uncached_bg || empty_size || empty_cluster ||
4473 allowed_chunk_alloc)) { 5201 allowed_chunk_alloc)) {
5202 index = 0;
4474 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 5203 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4475 found_uncached_bg = false; 5204 found_uncached_bg = false;
4476 loop++; 5205 loop++;
@@ -4553,31 +5282,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4553 int dump_block_groups) 5282 int dump_block_groups)
4554{ 5283{
4555 struct btrfs_block_group_cache *cache; 5284 struct btrfs_block_group_cache *cache;
5285 int index = 0;
4556 5286
4557 spin_lock(&info->lock); 5287 spin_lock(&info->lock);
4558 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5288 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4559 (unsigned long long)(info->total_bytes - info->bytes_used - 5289 (unsigned long long)(info->total_bytes - info->bytes_used -
4560 info->bytes_pinned - info->bytes_reserved - 5290 info->bytes_pinned - info->bytes_reserved -
4561 info->bytes_super), 5291 info->bytes_readonly),
4562 (info->full) ? "" : "not "); 5292 (info->full) ? "" : "not ");
4563 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 5293 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
4564 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 5294 "reserved=%llu, may_use=%llu, readonly=%llu\n",
4565 "\n",
4566 (unsigned long long)info->total_bytes, 5295 (unsigned long long)info->total_bytes,
5296 (unsigned long long)info->bytes_used,
4567 (unsigned long long)info->bytes_pinned, 5297 (unsigned long long)info->bytes_pinned,
4568 (unsigned long long)info->bytes_delalloc, 5298 (unsigned long long)info->bytes_reserved,
4569 (unsigned long long)info->bytes_may_use, 5299 (unsigned long long)info->bytes_may_use,
4570 (unsigned long long)info->bytes_used, 5300 (unsigned long long)info->bytes_readonly);
4571 (unsigned long long)info->bytes_root,
4572 (unsigned long long)info->bytes_super,
4573 (unsigned long long)info->bytes_reserved);
4574 spin_unlock(&info->lock); 5301 spin_unlock(&info->lock);
4575 5302
4576 if (!dump_block_groups) 5303 if (!dump_block_groups)
4577 return; 5304 return;
4578 5305
4579 down_read(&info->groups_sem); 5306 down_read(&info->groups_sem);
4580 list_for_each_entry(cache, &info->block_groups, list) { 5307again:
5308 list_for_each_entry(cache, &info->block_groups[index], list) {
4581 spin_lock(&cache->lock); 5309 spin_lock(&cache->lock);
4582 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 5310 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4583 "%llu pinned %llu reserved\n", 5311 "%llu pinned %llu reserved\n",
@@ -4589,6 +5317,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4589 btrfs_dump_free_space(cache, bytes); 5317 btrfs_dump_free_space(cache, bytes);
4590 spin_unlock(&cache->lock); 5318 spin_unlock(&cache->lock);
4591 } 5319 }
5320 if (++index < BTRFS_NR_RAID_TYPES)
5321 goto again;
4592 up_read(&info->groups_sem); 5322 up_read(&info->groups_sem);
4593} 5323}
4594 5324
@@ -4614,9 +5344,8 @@ again:
4614 5344
4615 WARN_ON(num_bytes < root->sectorsize); 5345 WARN_ON(num_bytes < root->sectorsize);
4616 ret = find_free_extent(trans, root, num_bytes, empty_size, 5346 ret = find_free_extent(trans, root, num_bytes, empty_size,
4617 search_start, search_end, hint_byte, ins, 5347 search_start, search_end, hint_byte,
4618 trans->alloc_exclude_start, 5348 ins, data);
4619 trans->alloc_exclude_nr, data);
4620 5349
4621 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5350 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4622 num_bytes = num_bytes >> 1; 5351 num_bytes = num_bytes >> 1;
@@ -4654,7 +5383,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4654 ret = btrfs_discard_extent(root, start, len); 5383 ret = btrfs_discard_extent(root, start, len);
4655 5384
4656 btrfs_add_free_space(cache, start, len); 5385 btrfs_add_free_space(cache, start, len);
4657 update_reserved_extents(cache, len, 0); 5386 update_reserved_bytes(cache, len, 0, 1);
4658 btrfs_put_block_group(cache); 5387 btrfs_put_block_group(cache);
4659 5388
4660 return ret; 5389 return ret;
@@ -4717,8 +5446,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4717 btrfs_mark_buffer_dirty(path->nodes[0]); 5446 btrfs_mark_buffer_dirty(path->nodes[0]);
4718 btrfs_free_path(path); 5447 btrfs_free_path(path);
4719 5448
4720 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5449 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4721 1, 0);
4722 if (ret) { 5450 if (ret) {
4723 printk(KERN_ERR "btrfs update block group failed for %llu " 5451 printk(KERN_ERR "btrfs update block group failed for %llu "
4724 "%llu\n", (unsigned long long)ins->objectid, 5452 "%llu\n", (unsigned long long)ins->objectid,
@@ -4778,8 +5506,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4778 btrfs_mark_buffer_dirty(leaf); 5506 btrfs_mark_buffer_dirty(leaf);
4779 btrfs_free_path(path); 5507 btrfs_free_path(path);
4780 5508
4781 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5509 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4782 1, 0);
4783 if (ret) { 5510 if (ret) {
4784 printk(KERN_ERR "btrfs update block group failed for %llu " 5511 printk(KERN_ERR "btrfs update block group failed for %llu "
4785 "%llu\n", (unsigned long long)ins->objectid, 5512 "%llu\n", (unsigned long long)ins->objectid,
@@ -4821,7 +5548,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4821 u64 num_bytes = ins->offset; 5548 u64 num_bytes = ins->offset;
4822 5549
4823 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 5550 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
4824 cache_block_group(block_group); 5551 cache_block_group(block_group, trans, NULL, 0);
4825 caching_ctl = get_caching_control(block_group); 5552 caching_ctl = get_caching_control(block_group);
4826 5553
4827 if (!caching_ctl) { 5554 if (!caching_ctl) {
@@ -4855,73 +5582,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4855 put_caching_control(caching_ctl); 5582 put_caching_control(caching_ctl);
4856 } 5583 }
4857 5584
4858 update_reserved_extents(block_group, ins->offset, 1); 5585 ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5586 BUG_ON(ret);
4859 btrfs_put_block_group(block_group); 5587 btrfs_put_block_group(block_group);
4860 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5588 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4861 0, owner, offset, ins, 1); 5589 0, owner, offset, ins, 1);
4862 return ret; 5590 return ret;
4863} 5591}
4864 5592
4865/*
4866 * finds a free extent and does all the dirty work required for allocation
4867 * returns the key for the extent through ins, and a tree buffer for
4868 * the first block of the extent through buf.
4869 *
4870 * returns 0 if everything worked, non-zero otherwise.
4871 */
4872static int alloc_tree_block(struct btrfs_trans_handle *trans,
4873 struct btrfs_root *root,
4874 u64 num_bytes, u64 parent, u64 root_objectid,
4875 struct btrfs_disk_key *key, int level,
4876 u64 empty_size, u64 hint_byte, u64 search_end,
4877 struct btrfs_key *ins)
4878{
4879 int ret;
4880 u64 flags = 0;
4881
4882 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4883 empty_size, hint_byte, search_end,
4884 ins, 0);
4885 if (ret)
4886 return ret;
4887
4888 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4889 if (parent == 0)
4890 parent = ins->objectid;
4891 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4892 } else
4893 BUG_ON(parent > 0);
4894
4895 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4896 struct btrfs_delayed_extent_op *extent_op;
4897 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4898 BUG_ON(!extent_op);
4899 if (key)
4900 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4901 else
4902 memset(&extent_op->key, 0, sizeof(extent_op->key));
4903 extent_op->flags_to_set = flags;
4904 extent_op->update_key = 1;
4905 extent_op->update_flags = 1;
4906 extent_op->is_data = 0;
4907
4908 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4909 ins->offset, parent, root_objectid,
4910 level, BTRFS_ADD_DELAYED_EXTENT,
4911 extent_op);
4912 BUG_ON(ret);
4913 }
4914
4915 if (root_objectid == root->root_key.objectid) {
4916 u64 used;
4917 spin_lock(&root->node_lock);
4918 used = btrfs_root_used(&root->root_item) + num_bytes;
4919 btrfs_set_root_used(&root->root_item, used);
4920 spin_unlock(&root->node_lock);
4921 }
4922 return ret;
4923}
4924
4925struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5593struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4926 struct btrfs_root *root, 5594 struct btrfs_root *root,
4927 u64 bytenr, u32 blocksize, 5595 u64 bytenr, u32 blocksize,
@@ -4960,8 +5628,41 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4960 return buf; 5628 return buf;
4961} 5629}
4962 5630
5631static struct btrfs_block_rsv *
5632use_block_rsv(struct btrfs_trans_handle *trans,
5633 struct btrfs_root *root, u32 blocksize)
5634{
5635 struct btrfs_block_rsv *block_rsv;
5636 int ret;
5637
5638 block_rsv = get_block_rsv(trans, root);
5639
5640 if (block_rsv->size == 0) {
5641 ret = reserve_metadata_bytes(trans, root, block_rsv,
5642 blocksize, 0);
5643 if (ret)
5644 return ERR_PTR(ret);
5645 return block_rsv;
5646 }
5647
5648 ret = block_rsv_use_bytes(block_rsv, blocksize);
5649 if (!ret)
5650 return block_rsv;
5651
5652 return ERR_PTR(-ENOSPC);
5653}
5654
5655static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5656{
5657 block_rsv_add_bytes(block_rsv, blocksize, 0);
5658 block_rsv_release_bytes(block_rsv, NULL, 0);
5659}
5660
4963/* 5661/*
4964 * helper function to allocate a block for a given tree 5662 * finds a free extent and does all the dirty work required for allocation
5663 * returns the key for the extent through ins, and a tree buffer for
5664 * the first block of the extent through buf.
5665 *
4965 * returns the tree buffer or NULL. 5666 * returns the tree buffer or NULL.
4966 */ 5667 */
4967struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5668struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4971,18 +5672,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4971 u64 hint, u64 empty_size) 5672 u64 hint, u64 empty_size)
4972{ 5673{
4973 struct btrfs_key ins; 5674 struct btrfs_key ins;
4974 int ret; 5675 struct btrfs_block_rsv *block_rsv;
4975 struct extent_buffer *buf; 5676 struct extent_buffer *buf;
5677 u64 flags = 0;
5678 int ret;
4976 5679
4977 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 5680
4978 key, level, empty_size, hint, (u64)-1, &ins); 5681 block_rsv = use_block_rsv(trans, root, blocksize);
5682 if (IS_ERR(block_rsv))
5683 return ERR_CAST(block_rsv);
5684
5685 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5686 empty_size, hint, (u64)-1, &ins, 0);
4979 if (ret) { 5687 if (ret) {
4980 BUG_ON(ret > 0); 5688 unuse_block_rsv(block_rsv, blocksize);
4981 return ERR_PTR(ret); 5689 return ERR_PTR(ret);
4982 } 5690 }
4983 5691
4984 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5692 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4985 blocksize, level); 5693 blocksize, level);
5694 BUG_ON(IS_ERR(buf));
5695
5696 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5697 if (parent == 0)
5698 parent = ins.objectid;
5699 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5700 } else
5701 BUG_ON(parent > 0);
5702
5703 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5704 struct btrfs_delayed_extent_op *extent_op;
5705 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5706 BUG_ON(!extent_op);
5707 if (key)
5708 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5709 else
5710 memset(&extent_op->key, 0, sizeof(extent_op->key));
5711 extent_op->flags_to_set = flags;
5712 extent_op->update_key = 1;
5713 extent_op->update_flags = 1;
5714 extent_op->is_data = 0;
5715
5716 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5717 ins.offset, parent, root_objectid,
5718 level, BTRFS_ADD_DELAYED_EXTENT,
5719 extent_op);
5720 BUG_ON(ret);
5721 }
4986 return buf; 5722 return buf;
4987} 5723}
4988 5724
@@ -5011,7 +5747,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5011 u64 generation; 5747 u64 generation;
5012 u64 refs; 5748 u64 refs;
5013 u64 flags; 5749 u64 flags;
5014 u64 last = 0;
5015 u32 nritems; 5750 u32 nritems;
5016 u32 blocksize; 5751 u32 blocksize;
5017 struct btrfs_key key; 5752 struct btrfs_key key;
@@ -5079,7 +5814,6 @@ reada:
5079 generation); 5814 generation);
5080 if (ret) 5815 if (ret)
5081 break; 5816 break;
5082 last = bytenr + blocksize;
5083 nread++; 5817 nread++;
5084 } 5818 }
5085 wc->reada_slot = slot; 5819 wc->reada_slot = slot;
@@ -5205,6 +5939,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5205 next = btrfs_find_tree_block(root, bytenr, blocksize); 5939 next = btrfs_find_tree_block(root, bytenr, blocksize);
5206 if (!next) { 5940 if (!next) {
5207 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 5941 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
5942 if (!next)
5943 return -ENOMEM;
5208 reada = 1; 5944 reada = 1;
5209 } 5945 }
5210 btrfs_tree_lock(next); 5946 btrfs_tree_lock(next);
@@ -5305,7 +6041,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5305 struct btrfs_path *path, 6041 struct btrfs_path *path,
5306 struct walk_control *wc) 6042 struct walk_control *wc)
5307{ 6043{
5308 int ret = 0; 6044 int ret;
5309 int level = wc->level; 6045 int level = wc->level;
5310 struct extent_buffer *eb = path->nodes[level]; 6046 struct extent_buffer *eb = path->nodes[level];
5311 u64 parent = 0; 6047 u64 parent = 0;
@@ -5383,13 +6119,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5383 btrfs_header_owner(path->nodes[level + 1])); 6119 btrfs_header_owner(path->nodes[level + 1]));
5384 } 6120 }
5385 6121
5386 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 6122 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5387 root->root_key.objectid, level, 0);
5388 BUG_ON(ret);
5389out: 6123out:
5390 wc->refs[level] = 0; 6124 wc->refs[level] = 0;
5391 wc->flags[level] = 0; 6125 wc->flags[level] = 0;
5392 return ret; 6126 return 0;
5393} 6127}
5394 6128
5395static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 6129static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5402,10 +6136,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5402 int ret; 6136 int ret;
5403 6137
5404 while (level >= 0) { 6138 while (level >= 0) {
5405 if (path->slots[level] >=
5406 btrfs_header_nritems(path->nodes[level]))
5407 break;
5408
5409 ret = walk_down_proc(trans, root, path, wc, lookup_info); 6139 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5410 if (ret > 0) 6140 if (ret > 0)
5411 break; 6141 break;
@@ -5413,11 +6143,16 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5413 if (level == 0) 6143 if (level == 0)
5414 break; 6144 break;
5415 6145
6146 if (path->slots[level] >=
6147 btrfs_header_nritems(path->nodes[level]))
6148 break;
6149
5416 ret = do_walk_down(trans, root, path, wc, &lookup_info); 6150 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5417 if (ret > 0) { 6151 if (ret > 0) {
5418 path->slots[level]++; 6152 path->slots[level]++;
5419 continue; 6153 continue;
5420 } 6154 } else if (ret < 0)
6155 return ret;
5421 level = wc->level; 6156 level = wc->level;
5422 } 6157 }
5423 return 0; 6158 return 0;
@@ -5466,7 +6201,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5466 * also make sure backrefs for the shared block and all lower level 6201 * also make sure backrefs for the shared block and all lower level
5467 * blocks are properly updated. 6202 * blocks are properly updated.
5468 */ 6203 */
5469int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 6204int btrfs_drop_snapshot(struct btrfs_root *root,
6205 struct btrfs_block_rsv *block_rsv, int update_ref)
5470{ 6206{
5471 struct btrfs_path *path; 6207 struct btrfs_path *path;
5472 struct btrfs_trans_handle *trans; 6208 struct btrfs_trans_handle *trans;
@@ -5484,7 +6220,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5484 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6220 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5485 BUG_ON(!wc); 6221 BUG_ON(!wc);
5486 6222
5487 trans = btrfs_start_transaction(tree_root, 1); 6223 trans = btrfs_start_transaction(tree_root, 0);
6224 if (block_rsv)
6225 trans->block_rsv = block_rsv;
5488 6226
5489 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 6227 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5490 level = btrfs_header_level(root->node); 6228 level = btrfs_header_level(root->node);
@@ -5572,22 +6310,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5572 } 6310 }
5573 6311
5574 BUG_ON(wc->level == 0); 6312 BUG_ON(wc->level == 0);
5575 if (trans->transaction->in_commit || 6313 if (btrfs_should_end_transaction(trans, tree_root)) {
5576 trans->transaction->delayed_refs.flushing) {
5577 ret = btrfs_update_root(trans, tree_root, 6314 ret = btrfs_update_root(trans, tree_root,
5578 &root->root_key, 6315 &root->root_key,
5579 root_item); 6316 root_item);
5580 BUG_ON(ret); 6317 BUG_ON(ret);
5581 6318
5582 btrfs_end_transaction(trans, tree_root); 6319 btrfs_end_transaction_throttle(trans, tree_root);
5583 trans = btrfs_start_transaction(tree_root, 1); 6320 trans = btrfs_start_transaction(tree_root, 0);
5584 } else { 6321 if (block_rsv)
5585 unsigned long update; 6322 trans->block_rsv = block_rsv;
5586 update = trans->delayed_ref_updates;
5587 trans->delayed_ref_updates = 0;
5588 if (update)
5589 btrfs_run_delayed_refs(trans, tree_root,
5590 update);
5591 } 6323 }
5592 } 6324 }
5593 btrfs_release_path(root, path); 6325 btrfs_release_path(root, path);
@@ -5601,9 +6333,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5601 NULL, NULL); 6333 NULL, NULL);
5602 BUG_ON(ret < 0); 6334 BUG_ON(ret < 0);
5603 if (ret > 0) { 6335 if (ret > 0) {
5604 ret = btrfs_del_orphan_item(trans, tree_root, 6336 /* if we fail to delete the orphan item this time
5605 root->root_key.objectid); 6337 * around, it'll get picked up the next time.
5606 BUG_ON(ret); 6338 *
6339 * The most common failure here is just -ENOENT.
6340 */
6341 btrfs_del_orphan_item(trans, tree_root,
6342 root->root_key.objectid);
5607 } 6343 }
5608 } 6344 }
5609 6345
@@ -5615,7 +6351,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5615 kfree(root); 6351 kfree(root);
5616 } 6352 }
5617out: 6353out:
5618 btrfs_end_transaction(trans, tree_root); 6354 btrfs_end_transaction_throttle(trans, tree_root);
5619 kfree(wc); 6355 kfree(wc);
5620 btrfs_free_path(path); 6356 btrfs_free_path(path);
5621 return err; 6357 return err;
@@ -6561,6 +7297,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
6561 struct btrfs_key key; 7297 struct btrfs_key key;
6562 struct inode *inode = NULL; 7298 struct inode *inode = NULL;
6563 struct btrfs_file_extent_item *fi; 7299 struct btrfs_file_extent_item *fi;
7300 struct extent_state *cached_state = NULL;
6564 u64 num_bytes; 7301 u64 num_bytes;
6565 u64 skip_objectid = 0; 7302 u64 skip_objectid = 0;
6566 u32 nritems; 7303 u32 nritems;
@@ -6589,12 +7326,14 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
6589 } 7326 }
6590 num_bytes = btrfs_file_extent_num_bytes(leaf, fi); 7327 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6591 7328
6592 lock_extent(&BTRFS_I(inode)->io_tree, key.offset, 7329 lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
6593 key.offset + num_bytes - 1, GFP_NOFS); 7330 key.offset + num_bytes - 1, 0, &cached_state,
7331 GFP_NOFS);
6594 btrfs_drop_extent_cache(inode, key.offset, 7332 btrfs_drop_extent_cache(inode, key.offset,
6595 key.offset + num_bytes - 1, 1); 7333 key.offset + num_bytes - 1, 1);
6596 unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, 7334 unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
6597 key.offset + num_bytes - 1, GFP_NOFS); 7335 key.offset + num_bytes - 1, &cached_state,
7336 GFP_NOFS);
6598 cond_resched(); 7337 cond_resched();
6599 } 7338 }
6600 iput(inode); 7339 iput(inode);
@@ -7176,7 +7915,14 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7176 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 7915 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7177 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 7916 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7178 7917
7179 num_devices = root->fs_info->fs_devices->rw_devices; 7918 /*
7919 * we add in the count of missing devices because we want
7920 * to make sure that any RAID levels on a degraded FS
7921 * continue to be honored.
7922 */
7923 num_devices = root->fs_info->fs_devices->rw_devices +
7924 root->fs_info->fs_devices->missing_devices;
7925
7180 if (num_devices == 1) { 7926 if (num_devices == 1) {
7181 stripped |= BTRFS_BLOCK_GROUP_DUP; 7927 stripped |= BTRFS_BLOCK_GROUP_DUP;
7182 stripped = flags & ~stripped; 7928 stripped = flags & ~stripped;
@@ -7208,48 +7954,137 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7208 return flags; 7954 return flags;
7209} 7955}
7210 7956
7211static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7957static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7212 struct btrfs_block_group_cache *shrink_block_group, 7958{
7213 int force) 7959 struct btrfs_space_info *sinfo = cache->space_info;
7960 u64 num_bytes;
7961 int ret = -ENOSPC;
7962
7963 if (cache->ro)
7964 return 0;
7965
7966 spin_lock(&sinfo->lock);
7967 spin_lock(&cache->lock);
7968 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7969 cache->bytes_super - btrfs_block_group_used(&cache->item);
7970
7971 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7972 sinfo->bytes_may_use + sinfo->bytes_readonly +
7973 cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
7974 sinfo->bytes_readonly += num_bytes;
7975 sinfo->bytes_reserved += cache->reserved_pinned;
7976 cache->reserved_pinned = 0;
7977 cache->ro = 1;
7978 ret = 0;
7979 }
7980
7981 spin_unlock(&cache->lock);
7982 spin_unlock(&sinfo->lock);
7983 return ret;
7984}
7985
7986int btrfs_set_block_group_ro(struct btrfs_root *root,
7987 struct btrfs_block_group_cache *cache)
7988
7214{ 7989{
7215 struct btrfs_trans_handle *trans; 7990 struct btrfs_trans_handle *trans;
7216 u64 new_alloc_flags; 7991 u64 alloc_flags;
7217 u64 calc; 7992 int ret;
7218 7993
7219 spin_lock(&shrink_block_group->lock); 7994 BUG_ON(cache->ro);
7220 if (btrfs_block_group_used(&shrink_block_group->item) +
7221 shrink_block_group->reserved > 0) {
7222 spin_unlock(&shrink_block_group->lock);
7223 7995
7224 trans = btrfs_start_transaction(root, 1); 7996 trans = btrfs_join_transaction(root, 1);
7225 spin_lock(&shrink_block_group->lock); 7997 BUG_ON(IS_ERR(trans));
7226 7998
7227 new_alloc_flags = update_block_group_flags(root, 7999 alloc_flags = update_block_group_flags(root, cache->flags);
7228 shrink_block_group->flags); 8000 if (alloc_flags != cache->flags)
7229 if (new_alloc_flags != shrink_block_group->flags) { 8001 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7230 calc = 8002
7231 btrfs_block_group_used(&shrink_block_group->item); 8003 ret = set_block_group_ro(cache);
7232 } else { 8004 if (!ret)
7233 calc = shrink_block_group->key.offset; 8005 goto out;
8006 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
8007 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
8008 if (ret < 0)
8009 goto out;
8010 ret = set_block_group_ro(cache);
8011out:
8012 btrfs_end_transaction(trans, root);
8013 return ret;
8014}
8015
8016/*
8017 * helper to account the unused space of all the readonly block group in the
8018 * list. takes mirrors into account.
8019 */
8020static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
8021{
8022 struct btrfs_block_group_cache *block_group;
8023 u64 free_bytes = 0;
8024 int factor;
8025
8026 list_for_each_entry(block_group, groups_list, list) {
8027 spin_lock(&block_group->lock);
8028
8029 if (!block_group->ro) {
8030 spin_unlock(&block_group->lock);
8031 continue;
7234 } 8032 }
7235 spin_unlock(&shrink_block_group->lock);
7236 8033
7237 do_chunk_alloc(trans, root->fs_info->extent_root, 8034 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7238 calc + 2 * 1024 * 1024, new_alloc_flags, force); 8035 BTRFS_BLOCK_GROUP_RAID10 |
8036 BTRFS_BLOCK_GROUP_DUP))
8037 factor = 2;
8038 else
8039 factor = 1;
7239 8040
7240 btrfs_end_transaction(trans, root); 8041 free_bytes += (block_group->key.offset -
7241 } else 8042 btrfs_block_group_used(&block_group->item)) *
7242 spin_unlock(&shrink_block_group->lock); 8043 factor;
7243 return 0; 8044
8045 spin_unlock(&block_group->lock);
8046 }
8047
8048 return free_bytes;
7244} 8049}
7245 8050
8051/*
8052 * helper to account the unused space of all the readonly block group in the
8053 * space_info. takes mirrors into account.
8054 */
8055u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
8056{
8057 int i;
8058 u64 free_bytes = 0;
8059
8060 spin_lock(&sinfo->lock);
8061
8062 for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
8063 if (!list_empty(&sinfo->block_groups[i]))
8064 free_bytes += __btrfs_get_ro_block_group_free_space(
8065 &sinfo->block_groups[i]);
8066
8067 spin_unlock(&sinfo->lock);
7246 8068
7247int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 8069 return free_bytes;
7248 struct btrfs_block_group_cache *group) 8070}
7249 8071
8072int btrfs_set_block_group_rw(struct btrfs_root *root,
8073 struct btrfs_block_group_cache *cache)
7250{ 8074{
7251 __alloc_chunk_for_shrink(root, group, 1); 8075 struct btrfs_space_info *sinfo = cache->space_info;
7252 set_block_group_readonly(group); 8076 u64 num_bytes;
8077
8078 BUG_ON(!cache->ro);
8079
8080 spin_lock(&sinfo->lock);
8081 spin_lock(&cache->lock);
8082 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
8083 cache->bytes_super - btrfs_block_group_used(&cache->item);
8084 sinfo->bytes_readonly -= num_bytes;
8085 cache->ro = 0;
8086 spin_unlock(&cache->lock);
8087 spin_unlock(&sinfo->lock);
7253 return 0; 8088 return 0;
7254} 8089}
7255 8090
@@ -7314,7 +8149,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7314 mutex_lock(&root->fs_info->chunk_mutex); 8149 mutex_lock(&root->fs_info->chunk_mutex);
7315 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 8150 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7316 u64 min_free = btrfs_block_group_used(&block_group->item); 8151 u64 min_free = btrfs_block_group_used(&block_group->item);
7317 u64 dev_offset, max_avail; 8152 u64 dev_offset;
7318 8153
7319 /* 8154 /*
7320 * check to make sure we can actually find a chunk with enough 8155 * check to make sure we can actually find a chunk with enough
@@ -7322,7 +8157,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7322 */ 8157 */
7323 if (device->total_bytes > device->bytes_used + min_free) { 8158 if (device->total_bytes > device->bytes_used + min_free) {
7324 ret = find_free_dev_extent(NULL, device, min_free, 8159 ret = find_free_dev_extent(NULL, device, min_free,
7325 &dev_offset, &max_avail); 8160 &dev_offset, NULL);
7326 if (!ret) 8161 if (!ret)
7327 break; 8162 break;
7328 ret = -1; 8163 ret = -1;
@@ -7366,11 +8201,44 @@ static int find_first_block_group(struct btrfs_root *root,
7366 } 8201 }
7367 path->slots[0]++; 8202 path->slots[0]++;
7368 } 8203 }
7369 ret = -ENOENT;
7370out: 8204out:
7371 return ret; 8205 return ret;
7372} 8206}
7373 8207
8208void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
8209{
8210 struct btrfs_block_group_cache *block_group;
8211 u64 last = 0;
8212
8213 while (1) {
8214 struct inode *inode;
8215
8216 block_group = btrfs_lookup_first_block_group(info, last);
8217 while (block_group) {
8218 spin_lock(&block_group->lock);
8219 if (block_group->iref)
8220 break;
8221 spin_unlock(&block_group->lock);
8222 block_group = next_block_group(info->tree_root,
8223 block_group);
8224 }
8225 if (!block_group) {
8226 if (last == 0)
8227 break;
8228 last = 0;
8229 continue;
8230 }
8231
8232 inode = block_group->inode;
8233 block_group->iref = 0;
8234 block_group->inode = NULL;
8235 spin_unlock(&block_group->lock);
8236 iput(inode);
8237 last = block_group->key.objectid + block_group->key.offset;
8238 btrfs_put_block_group(block_group);
8239 }
8240}
8241
7374int btrfs_free_block_groups(struct btrfs_fs_info *info) 8242int btrfs_free_block_groups(struct btrfs_fs_info *info)
7375{ 8243{
7376 struct btrfs_block_group_cache *block_group; 8244 struct btrfs_block_group_cache *block_group;
@@ -7417,17 +8285,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7417 */ 8285 */
7418 synchronize_rcu(); 8286 synchronize_rcu();
7419 8287
8288 release_global_block_rsv(info);
8289
7420 while(!list_empty(&info->space_info)) { 8290 while(!list_empty(&info->space_info)) {
7421 space_info = list_entry(info->space_info.next, 8291 space_info = list_entry(info->space_info.next,
7422 struct btrfs_space_info, 8292 struct btrfs_space_info,
7423 list); 8293 list);
7424 8294 if (space_info->bytes_pinned > 0 ||
8295 space_info->bytes_reserved > 0) {
8296 WARN_ON(1);
8297 dump_space_info(space_info, 0, 0);
8298 }
7425 list_del(&space_info->list); 8299 list_del(&space_info->list);
7426 kfree(space_info); 8300 kfree(space_info);
7427 } 8301 }
7428 return 0; 8302 return 0;
7429} 8303}
7430 8304
8305static void __link_block_group(struct btrfs_space_info *space_info,
8306 struct btrfs_block_group_cache *cache)
8307{
8308 int index = get_block_group_index(cache);
8309
8310 down_write(&space_info->groups_sem);
8311 list_add_tail(&cache->list, &space_info->block_groups[index]);
8312 up_write(&space_info->groups_sem);
8313}
8314
7431int btrfs_read_block_groups(struct btrfs_root *root) 8315int btrfs_read_block_groups(struct btrfs_root *root)
7432{ 8316{
7433 struct btrfs_path *path; 8317 struct btrfs_path *path;
@@ -7438,6 +8322,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7438 struct btrfs_key key; 8322 struct btrfs_key key;
7439 struct btrfs_key found_key; 8323 struct btrfs_key found_key;
7440 struct extent_buffer *leaf; 8324 struct extent_buffer *leaf;
8325 int need_clear = 0;
8326 u64 cache_gen;
7441 8327
7442 root = info->extent_root; 8328 root = info->extent_root;
7443 key.objectid = 0; 8329 key.objectid = 0;
@@ -7447,21 +8333,27 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7447 if (!path) 8333 if (!path)
7448 return -ENOMEM; 8334 return -ENOMEM;
7449 8335
8336 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
8337 if (cache_gen != 0 &&
8338 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
8339 need_clear = 1;
8340 if (btrfs_test_opt(root, CLEAR_CACHE))
8341 need_clear = 1;
8342 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
8343 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
8344
7450 while (1) { 8345 while (1) {
7451 ret = find_first_block_group(root, path, &key); 8346 ret = find_first_block_group(root, path, &key);
7452 if (ret > 0) { 8347 if (ret > 0)
7453 ret = 0; 8348 break;
7454 goto error;
7455 }
7456 if (ret != 0) 8349 if (ret != 0)
7457 goto error; 8350 goto error;
7458
7459 leaf = path->nodes[0]; 8351 leaf = path->nodes[0];
7460 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 8352 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7461 cache = kzalloc(sizeof(*cache), GFP_NOFS); 8353 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7462 if (!cache) { 8354 if (!cache) {
7463 ret = -ENOMEM; 8355 ret = -ENOMEM;
7464 break; 8356 goto error;
7465 } 8357 }
7466 8358
7467 atomic_set(&cache->count, 1); 8359 atomic_set(&cache->count, 1);
@@ -7471,6 +8363,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7471 INIT_LIST_HEAD(&cache->list); 8363 INIT_LIST_HEAD(&cache->list);
7472 INIT_LIST_HEAD(&cache->cluster_list); 8364 INIT_LIST_HEAD(&cache->cluster_list);
7473 8365
8366 if (need_clear)
8367 cache->disk_cache_state = BTRFS_DC_CLEAR;
8368
7474 /* 8369 /*
7475 * we only want to have 32k of ram per block group for keeping 8370 * we only want to have 32k of ram per block group for keeping
7476 * track of free space, and if we pass 1/2 of that we want to 8371 * track of free space, and if we pass 1/2 of that we want to
@@ -7518,20 +8413,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7518 BUG_ON(ret); 8413 BUG_ON(ret);
7519 cache->space_info = space_info; 8414 cache->space_info = space_info;
7520 spin_lock(&cache->space_info->lock); 8415 spin_lock(&cache->space_info->lock);
7521 cache->space_info->bytes_super += cache->bytes_super; 8416 cache->space_info->bytes_readonly += cache->bytes_super;
7522 spin_unlock(&cache->space_info->lock); 8417 spin_unlock(&cache->space_info->lock);
7523 8418
7524 down_write(&space_info->groups_sem); 8419 __link_block_group(space_info, cache);
7525 list_add_tail(&cache->list, &space_info->block_groups);
7526 up_write(&space_info->groups_sem);
7527 8420
7528 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8421 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7529 BUG_ON(ret); 8422 BUG_ON(ret);
7530 8423
7531 set_avail_alloc_bits(root->fs_info, cache->flags); 8424 set_avail_alloc_bits(root->fs_info, cache->flags);
7532 if (btrfs_chunk_readonly(root, cache->key.objectid)) 8425 if (btrfs_chunk_readonly(root, cache->key.objectid))
7533 set_block_group_readonly(cache); 8426 set_block_group_ro(cache);
7534 } 8427 }
8428
8429 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
8430 if (!(get_alloc_profile(root, space_info->flags) &
8431 (BTRFS_BLOCK_GROUP_RAID10 |
8432 BTRFS_BLOCK_GROUP_RAID1 |
8433 BTRFS_BLOCK_GROUP_DUP)))
8434 continue;
8435 /*
8436 * avoid allocating from un-mirrored block group if there are
8437 * mirrored block groups.
8438 */
8439 list_for_each_entry(cache, &space_info->block_groups[3], list)
8440 set_block_group_ro(cache);
8441 list_for_each_entry(cache, &space_info->block_groups[4], list)
8442 set_block_group_ro(cache);
8443 }
8444
8445 init_global_block_rsv(info);
7535 ret = 0; 8446 ret = 0;
7536error: 8447error:
7537 btrfs_free_path(path); 8448 btrfs_free_path(path);
@@ -7559,6 +8470,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7559 cache->key.offset = size; 8470 cache->key.offset = size;
7560 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 8471 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7561 cache->sectorsize = root->sectorsize; 8472 cache->sectorsize = root->sectorsize;
8473 cache->fs_info = root->fs_info;
7562 8474
7563 /* 8475 /*
7564 * we only want to have 32k of ram per block group for keeping track 8476 * we only want to have 32k of ram per block group for keeping track
@@ -7592,12 +8504,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7592 BUG_ON(ret); 8504 BUG_ON(ret);
7593 8505
7594 spin_lock(&cache->space_info->lock); 8506 spin_lock(&cache->space_info->lock);
7595 cache->space_info->bytes_super += cache->bytes_super; 8507 cache->space_info->bytes_readonly += cache->bytes_super;
7596 spin_unlock(&cache->space_info->lock); 8508 spin_unlock(&cache->space_info->lock);
7597 8509
7598 down_write(&cache->space_info->groups_sem); 8510 __link_block_group(cache->space_info, cache);
7599 list_add_tail(&cache->list, &cache->space_info->block_groups);
7600 up_write(&cache->space_info->groups_sem);
7601 8511
7602 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8512 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7603 BUG_ON(ret); 8513 BUG_ON(ret);
@@ -7617,8 +8527,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7617 struct btrfs_path *path; 8527 struct btrfs_path *path;
7618 struct btrfs_block_group_cache *block_group; 8528 struct btrfs_block_group_cache *block_group;
7619 struct btrfs_free_cluster *cluster; 8529 struct btrfs_free_cluster *cluster;
8530 struct btrfs_root *tree_root = root->fs_info->tree_root;
7620 struct btrfs_key key; 8531 struct btrfs_key key;
8532 struct inode *inode;
7621 int ret; 8533 int ret;
8534 int factor;
7622 8535
7623 root = root->fs_info->extent_root; 8536 root = root->fs_info->extent_root;
7624 8537
@@ -7627,6 +8540,12 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7627 BUG_ON(!block_group->ro); 8540 BUG_ON(!block_group->ro);
7628 8541
7629 memcpy(&key, &block_group->key, sizeof(key)); 8542 memcpy(&key, &block_group->key, sizeof(key));
8543 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8544 BTRFS_BLOCK_GROUP_RAID1 |
8545 BTRFS_BLOCK_GROUP_RAID10))
8546 factor = 2;
8547 else
8548 factor = 1;
7630 8549
7631 /* make sure this block group isn't part of an allocation cluster */ 8550 /* make sure this block group isn't part of an allocation cluster */
7632 cluster = &root->fs_info->data_alloc_cluster; 8551 cluster = &root->fs_info->data_alloc_cluster;
@@ -7646,6 +8565,40 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7646 path = btrfs_alloc_path(); 8565 path = btrfs_alloc_path();
7647 BUG_ON(!path); 8566 BUG_ON(!path);
7648 8567
8568 inode = lookup_free_space_inode(root, block_group, path);
8569 if (!IS_ERR(inode)) {
8570 btrfs_orphan_add(trans, inode);
8571 clear_nlink(inode);
8572 /* One for the block groups ref */
8573 spin_lock(&block_group->lock);
8574 if (block_group->iref) {
8575 block_group->iref = 0;
8576 block_group->inode = NULL;
8577 spin_unlock(&block_group->lock);
8578 iput(inode);
8579 } else {
8580 spin_unlock(&block_group->lock);
8581 }
8582 /* One for our lookup ref */
8583 iput(inode);
8584 }
8585
8586 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8587 key.offset = block_group->key.objectid;
8588 key.type = 0;
8589
8590 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8591 if (ret < 0)
8592 goto out;
8593 if (ret > 0)
8594 btrfs_release_path(tree_root, path);
8595 if (ret == 0) {
8596 ret = btrfs_del_item(trans, tree_root, path);
8597 if (ret)
8598 goto out;
8599 btrfs_release_path(tree_root, path);
8600 }
8601
7649 spin_lock(&root->fs_info->block_group_cache_lock); 8602 spin_lock(&root->fs_info->block_group_cache_lock);
7650 rb_erase(&block_group->cache_node, 8603 rb_erase(&block_group->cache_node,
7651 &root->fs_info->block_group_cache_tree); 8604 &root->fs_info->block_group_cache_tree);
@@ -7667,8 +8620,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7667 spin_lock(&block_group->space_info->lock); 8620 spin_lock(&block_group->space_info->lock);
7668 block_group->space_info->total_bytes -= block_group->key.offset; 8621 block_group->space_info->total_bytes -= block_group->key.offset;
7669 block_group->space_info->bytes_readonly -= block_group->key.offset; 8622 block_group->space_info->bytes_readonly -= block_group->key.offset;
8623 block_group->space_info->disk_total -= block_group->key.offset * factor;
7670 spin_unlock(&block_group->space_info->lock); 8624 spin_unlock(&block_group->space_info->lock);
7671 8625
8626 memcpy(&key, &block_group->key, sizeof(key));
8627
7672 btrfs_clear_space_info_full(root->fs_info); 8628 btrfs_clear_space_info_full(root->fs_info);
7673 8629
7674 btrfs_put_block_group(block_group); 8630 btrfs_put_block_group(block_group);
@@ -7685,3 +8641,14 @@ out:
7685 btrfs_free_path(path); 8641 btrfs_free_path(path);
7686 return ret; 8642 return ret;
7687} 8643}
8644
8645int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8646{
8647 return unpin_extent_range(root, start, end);
8648}
8649
8650int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8651 u64 num_bytes)
8652{
8653 return btrfs_discard_extent(root, bytenr, num_bytes);
8654}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 96577e8bf9fd..2e993cf1766e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2,7 +2,6 @@
2#include <linux/slab.h> 2#include <linux/slab.h>
3#include <linux/bio.h> 3#include <linux/bio.h>
4#include <linux/mm.h> 4#include <linux/mm.h>
5#include <linux/gfp.h>
6#include <linux/pagemap.h> 5#include <linux/pagemap.h>
7#include <linux/page-flags.h> 6#include <linux/page-flags.h>
8#include <linux/module.h> 7#include <linux/module.h>
@@ -104,8 +103,8 @@ void extent_io_exit(void)
104void extent_io_tree_init(struct extent_io_tree *tree, 103void extent_io_tree_init(struct extent_io_tree *tree,
105 struct address_space *mapping, gfp_t mask) 104 struct address_space *mapping, gfp_t mask)
106{ 105{
107 tree->state.rb_node = NULL; 106 tree->state = RB_ROOT;
108 tree->buffer.rb_node = NULL; 107 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
109 tree->ops = NULL; 108 tree->ops = NULL;
110 tree->dirty_bytes = 0; 109 tree->dirty_bytes = 0;
111 spin_lock_init(&tree->lock); 110 spin_lock_init(&tree->lock);
@@ -136,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
136 return state; 135 return state;
137} 136}
138 137
139static void free_extent_state(struct extent_state *state) 138void free_extent_state(struct extent_state *state)
140{ 139{
141 if (!state) 140 if (!state)
142 return; 141 return;
@@ -236,50 +235,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
236 return ret; 235 return ret;
237} 236}
238 237
239static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
240 u64 offset, struct rb_node *node)
241{
242 struct rb_root *root = &tree->buffer;
243 struct rb_node **p = &root->rb_node;
244 struct rb_node *parent = NULL;
245 struct extent_buffer *eb;
246
247 while (*p) {
248 parent = *p;
249 eb = rb_entry(parent, struct extent_buffer, rb_node);
250
251 if (offset < eb->start)
252 p = &(*p)->rb_left;
253 else if (offset > eb->start)
254 p = &(*p)->rb_right;
255 else
256 return eb;
257 }
258
259 rb_link_node(node, parent, p);
260 rb_insert_color(node, root);
261 return NULL;
262}
263
264static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
265 u64 offset)
266{
267 struct rb_root *root = &tree->buffer;
268 struct rb_node *n = root->rb_node;
269 struct extent_buffer *eb;
270
271 while (n) {
272 eb = rb_entry(n, struct extent_buffer, rb_node);
273 if (offset < eb->start)
274 n = n->rb_left;
275 else if (offset > eb->start)
276 n = n->rb_right;
277 else
278 return eb;
279 }
280 return NULL;
281}
282
283static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 238static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
284 struct extent_state *other) 239 struct extent_state *other)
285{ 240{
@@ -336,21 +291,18 @@ static int merge_state(struct extent_io_tree *tree,
336} 291}
337 292
338static int set_state_cb(struct extent_io_tree *tree, 293static int set_state_cb(struct extent_io_tree *tree,
339 struct extent_state *state, 294 struct extent_state *state, int *bits)
340 unsigned long bits)
341{ 295{
342 if (tree->ops && tree->ops->set_bit_hook) { 296 if (tree->ops && tree->ops->set_bit_hook) {
343 return tree->ops->set_bit_hook(tree->mapping->host, 297 return tree->ops->set_bit_hook(tree->mapping->host,
344 state->start, state->end, 298 state, bits);
345 state->state, bits);
346 } 299 }
347 300
348 return 0; 301 return 0;
349} 302}
350 303
351static void clear_state_cb(struct extent_io_tree *tree, 304static void clear_state_cb(struct extent_io_tree *tree,
352 struct extent_state *state, 305 struct extent_state *state, int *bits)
353 unsigned long bits)
354{ 306{
355 if (tree->ops && tree->ops->clear_bit_hook) 307 if (tree->ops && tree->ops->clear_bit_hook)
356 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 308 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -368,9 +320,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
368 */ 320 */
369static int insert_state(struct extent_io_tree *tree, 321static int insert_state(struct extent_io_tree *tree,
370 struct extent_state *state, u64 start, u64 end, 322 struct extent_state *state, u64 start, u64 end,
371 int bits) 323 int *bits)
372{ 324{
373 struct rb_node *node; 325 struct rb_node *node;
326 int bits_to_set = *bits & ~EXTENT_CTLBITS;
374 int ret; 327 int ret;
375 328
376 if (end < start) { 329 if (end < start) {
@@ -385,9 +338,9 @@ static int insert_state(struct extent_io_tree *tree,
385 if (ret) 338 if (ret)
386 return ret; 339 return ret;
387 340
388 if (bits & EXTENT_DIRTY) 341 if (bits_to_set & EXTENT_DIRTY)
389 tree->dirty_bytes += end - start + 1; 342 tree->dirty_bytes += end - start + 1;
390 state->state |= bits; 343 state->state |= bits_to_set;
391 node = tree_insert(&tree->state, end, &state->rb_node); 344 node = tree_insert(&tree->state, end, &state->rb_node);
392 if (node) { 345 if (node) {
393 struct extent_state *found; 346 struct extent_state *found;
@@ -457,13 +410,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
457 * struct is freed and removed from the tree 410 * struct is freed and removed from the tree
458 */ 411 */
459static int clear_state_bit(struct extent_io_tree *tree, 412static int clear_state_bit(struct extent_io_tree *tree,
460 struct extent_state *state, int bits, int wake, 413 struct extent_state *state,
461 int delete) 414 int *bits, int wake)
462{ 415{
463 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; 416 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
464 int ret = state->state & bits_to_clear; 417 int ret = state->state & bits_to_clear;
465 418
466 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 419 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
467 u64 range = state->end - state->start + 1; 420 u64 range = state->end - state->start + 1;
468 WARN_ON(range > tree->dirty_bytes); 421 WARN_ON(range > tree->dirty_bytes);
469 tree->dirty_bytes -= range; 422 tree->dirty_bytes -= range;
@@ -472,9 +425,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
472 state->state &= ~bits_to_clear; 425 state->state &= ~bits_to_clear;
473 if (wake) 426 if (wake)
474 wake_up(&state->wq); 427 wake_up(&state->wq);
475 if (delete || state->state == 0) { 428 if (state->state == 0) {
476 if (state->tree) { 429 if (state->tree) {
477 clear_state_cb(tree, state, state->state);
478 rb_erase(&state->rb_node, &tree->state); 430 rb_erase(&state->rb_node, &tree->state);
479 state->tree = NULL; 431 state->tree = NULL;
480 free_extent_state(state); 432 free_extent_state(state);
@@ -513,7 +465,14 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
513 u64 last_end; 465 u64 last_end;
514 int err; 466 int err;
515 int set = 0; 467 int set = 0;
468 int clear = 0;
469
470 if (delete)
471 bits |= ~EXTENT_CTLBITS;
472 bits |= EXTENT_FIRST_DELALLOC;
516 473
474 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
475 clear = 1;
517again: 476again:
518 if (!prealloc && (mask & __GFP_WAIT)) { 477 if (!prealloc && (mask & __GFP_WAIT)) {
519 prealloc = alloc_extent_state(mask); 478 prealloc = alloc_extent_state(mask);
@@ -524,14 +483,20 @@ again:
524 spin_lock(&tree->lock); 483 spin_lock(&tree->lock);
525 if (cached_state) { 484 if (cached_state) {
526 cached = *cached_state; 485 cached = *cached_state;
527 *cached_state = NULL; 486
528 cached_state = NULL; 487 if (clear) {
488 *cached_state = NULL;
489 cached_state = NULL;
490 }
491
529 if (cached && cached->tree && cached->start == start) { 492 if (cached && cached->tree && cached->start == start) {
530 atomic_dec(&cached->refs); 493 if (clear)
494 atomic_dec(&cached->refs);
531 state = cached; 495 state = cached;
532 goto hit_next; 496 goto hit_next;
533 } 497 }
534 free_extent_state(cached); 498 if (clear)
499 free_extent_state(cached);
535 } 500 }
536 /* 501 /*
537 * this search will find the extents that end after 502 * this search will find the extents that end after
@@ -572,8 +537,7 @@ hit_next:
572 if (err) 537 if (err)
573 goto out; 538 goto out;
574 if (state->end <= end) { 539 if (state->end <= end) {
575 set |= clear_state_bit(tree, state, bits, wake, 540 set |= clear_state_bit(tree, state, &bits, wake);
576 delete);
577 if (last_end == (u64)-1) 541 if (last_end == (u64)-1)
578 goto out; 542 goto out;
579 start = last_end + 1; 543 start = last_end + 1;
@@ -594,7 +558,7 @@ hit_next:
594 if (wake) 558 if (wake)
595 wake_up(&state->wq); 559 wake_up(&state->wq);
596 560
597 set |= clear_state_bit(tree, prealloc, bits, wake, delete); 561 set |= clear_state_bit(tree, prealloc, &bits, wake);
598 562
599 prealloc = NULL; 563 prealloc = NULL;
600 goto out; 564 goto out;
@@ -605,7 +569,7 @@ hit_next:
605 else 569 else
606 next_node = NULL; 570 next_node = NULL;
607 571
608 set |= clear_state_bit(tree, state, bits, wake, delete); 572 set |= clear_state_bit(tree, state, &bits, wake);
609 if (last_end == (u64)-1) 573 if (last_end == (u64)-1)
610 goto out; 574 goto out;
611 start = last_end + 1; 575 start = last_end + 1;
@@ -698,19 +662,19 @@ out:
698 662
699static int set_state_bits(struct extent_io_tree *tree, 663static int set_state_bits(struct extent_io_tree *tree,
700 struct extent_state *state, 664 struct extent_state *state,
701 int bits) 665 int *bits)
702{ 666{
703 int ret; 667 int ret;
668 int bits_to_set = *bits & ~EXTENT_CTLBITS;
704 669
705 ret = set_state_cb(tree, state, bits); 670 ret = set_state_cb(tree, state, bits);
706 if (ret) 671 if (ret)
707 return ret; 672 return ret;
708 673 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
709 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
710 u64 range = state->end - state->start + 1; 674 u64 range = state->end - state->start + 1;
711 tree->dirty_bytes += range; 675 tree->dirty_bytes += range;
712 } 676 }
713 state->state |= bits; 677 state->state |= bits_to_set;
714 678
715 return 0; 679 return 0;
716} 680}
@@ -737,10 +701,9 @@ static void cache_state(struct extent_state *state,
737 * [start, end] is inclusive This takes the tree lock. 701 * [start, end] is inclusive This takes the tree lock.
738 */ 702 */
739 703
740static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 704int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
741 int bits, int exclusive_bits, u64 *failed_start, 705 int bits, int exclusive_bits, u64 *failed_start,
742 struct extent_state **cached_state, 706 struct extent_state **cached_state, gfp_t mask)
743 gfp_t mask)
744{ 707{
745 struct extent_state *state; 708 struct extent_state *state;
746 struct extent_state *prealloc = NULL; 709 struct extent_state *prealloc = NULL;
@@ -749,6 +712,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
749 u64 last_start; 712 u64 last_start;
750 u64 last_end; 713 u64 last_end;
751 714
715 bits |= EXTENT_FIRST_DELALLOC;
752again: 716again:
753 if (!prealloc && (mask & __GFP_WAIT)) { 717 if (!prealloc && (mask & __GFP_WAIT)) {
754 prealloc = alloc_extent_state(mask); 718 prealloc = alloc_extent_state(mask);
@@ -770,7 +734,7 @@ again:
770 */ 734 */
771 node = tree_search(tree, start); 735 node = tree_search(tree, start);
772 if (!node) { 736 if (!node) {
773 err = insert_state(tree, prealloc, start, end, bits); 737 err = insert_state(tree, prealloc, start, end, &bits);
774 prealloc = NULL; 738 prealloc = NULL;
775 BUG_ON(err == -EEXIST); 739 BUG_ON(err == -EEXIST);
776 goto out; 740 goto out;
@@ -794,7 +758,7 @@ hit_next:
794 goto out; 758 goto out;
795 } 759 }
796 760
797 err = set_state_bits(tree, state, bits); 761 err = set_state_bits(tree, state, &bits);
798 if (err) 762 if (err)
799 goto out; 763 goto out;
800 764
@@ -844,7 +808,7 @@ hit_next:
844 if (err) 808 if (err)
845 goto out; 809 goto out;
846 if (state->end <= end) { 810 if (state->end <= end) {
847 err = set_state_bits(tree, state, bits); 811 err = set_state_bits(tree, state, &bits);
848 if (err) 812 if (err)
849 goto out; 813 goto out;
850 cache_state(state, cached_state); 814 cache_state(state, cached_state);
@@ -869,7 +833,7 @@ hit_next:
869 else 833 else
870 this_end = last_start - 1; 834 this_end = last_start - 1;
871 err = insert_state(tree, prealloc, start, this_end, 835 err = insert_state(tree, prealloc, start, this_end,
872 bits); 836 &bits);
873 BUG_ON(err == -EEXIST); 837 BUG_ON(err == -EEXIST);
874 if (err) { 838 if (err) {
875 prealloc = NULL; 839 prealloc = NULL;
@@ -895,7 +859,7 @@ hit_next:
895 err = split_state(tree, state, prealloc, end + 1); 859 err = split_state(tree, state, prealloc, end + 1);
896 BUG_ON(err == -EEXIST); 860 BUG_ON(err == -EEXIST);
897 861
898 err = set_state_bits(tree, prealloc, bits); 862 err = set_state_bits(tree, prealloc, &bits);
899 if (err) { 863 if (err) {
900 prealloc = NULL; 864 prealloc = NULL;
901 goto out; 865 goto out;
@@ -946,11 +910,11 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
946} 910}
947 911
948int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 912int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
949 gfp_t mask) 913 struct extent_state **cached_state, gfp_t mask)
950{ 914{
951 return set_extent_bit(tree, start, end, 915 return set_extent_bit(tree, start, end,
952 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 916 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
953 0, NULL, NULL, mask); 917 0, NULL, cached_state, mask);
954} 918}
955 919
956int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 920int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -958,8 +922,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
958{ 922{
959 return clear_extent_bit(tree, start, end, 923 return clear_extent_bit(tree, start, end,
960 EXTENT_DIRTY | EXTENT_DELALLOC | 924 EXTENT_DIRTY | EXTENT_DELALLOC |
961 EXTENT_DO_ACCOUNTING, 0, 0, 925 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
962 NULL, mask);
963} 926}
964 927
965int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 928int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -984,10 +947,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
984} 947}
985 948
986static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 949static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
987 u64 end, gfp_t mask) 950 u64 end, struct extent_state **cached_state,
951 gfp_t mask)
988{ 952{
989 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, 953 return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
990 NULL, mask); 954 cached_state, mask);
991} 955}
992 956
993int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) 957int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1171,7 +1135,8 @@ out:
1171 * 1 is returned if we find something, 0 if nothing was in the tree 1135 * 1 is returned if we find something, 0 if nothing was in the tree
1172 */ 1136 */
1173static noinline u64 find_delalloc_range(struct extent_io_tree *tree, 1137static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1174 u64 *start, u64 *end, u64 max_bytes) 1138 u64 *start, u64 *end, u64 max_bytes,
1139 struct extent_state **cached_state)
1175{ 1140{
1176 struct rb_node *node; 1141 struct rb_node *node;
1177 struct extent_state *state; 1142 struct extent_state *state;
@@ -1203,8 +1168,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1203 *end = state->end; 1168 *end = state->end;
1204 goto out; 1169 goto out;
1205 } 1170 }
1206 if (!found) 1171 if (!found) {
1207 *start = state->start; 1172 *start = state->start;
1173 *cached_state = state;
1174 atomic_inc(&state->refs);
1175 }
1208 found++; 1176 found++;
1209 *end = state->end; 1177 *end = state->end;
1210 cur_start = state->end + 1; 1178 cur_start = state->end + 1;
@@ -1336,10 +1304,11 @@ again:
1336 delalloc_start = *start; 1304 delalloc_start = *start;
1337 delalloc_end = 0; 1305 delalloc_end = 0;
1338 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, 1306 found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1339 max_bytes); 1307 max_bytes, &cached_state);
1340 if (!found || delalloc_end <= *start) { 1308 if (!found || delalloc_end <= *start) {
1341 *start = delalloc_start; 1309 *start = delalloc_start;
1342 *end = delalloc_end; 1310 *end = delalloc_end;
1311 free_extent_state(cached_state);
1343 return found; 1312 return found;
1344 } 1313 }
1345 1314
@@ -1421,9 +1390,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1421 if (op & EXTENT_CLEAR_DELALLOC) 1390 if (op & EXTENT_CLEAR_DELALLOC)
1422 clear_bits |= EXTENT_DELALLOC; 1391 clear_bits |= EXTENT_DELALLOC;
1423 1392
1424 if (op & EXTENT_CLEAR_ACCOUNTING)
1425 clear_bits |= EXTENT_DO_ACCOUNTING;
1426
1427 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1393 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1428 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1394 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1429 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1395 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1722,7 +1688,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
1722 } 1688 }
1723 1689
1724 if (!uptodate) { 1690 if (!uptodate) {
1725 clear_extent_uptodate(tree, start, end, GFP_NOFS); 1691 clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
1726 ClearPageUptodate(page); 1692 ClearPageUptodate(page);
1727 SetPageError(page); 1693 SetPageError(page);
1728 } 1694 }
@@ -1750,7 +1716,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
1750static void end_bio_extent_readpage(struct bio *bio, int err) 1716static void end_bio_extent_readpage(struct bio *bio, int err)
1751{ 1717{
1752 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1718 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1753 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; 1719 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
1720 struct bio_vec *bvec = bio->bi_io_vec;
1754 struct extent_io_tree *tree; 1721 struct extent_io_tree *tree;
1755 u64 start; 1722 u64 start;
1756 u64 end; 1723 u64 end;
@@ -1773,7 +1740,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1773 else 1740 else
1774 whole_page = 0; 1741 whole_page = 0;
1775 1742
1776 if (--bvec >= bio->bi_io_vec) 1743 if (++bvec <= bvec_end)
1777 prefetchw(&bvec->bv_page->flags); 1744 prefetchw(&bvec->bv_page->flags);
1778 1745
1779 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1746 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
@@ -1818,7 +1785,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1818 } 1785 }
1819 check_page_locked(tree, page); 1786 check_page_locked(tree, page);
1820 } 1787 }
1821 } while (bvec >= bio->bi_io_vec); 1788 } while (bvec <= bvec_end);
1822 1789
1823 bio_put(bio); 1790 bio_put(bio);
1824} 1791}
@@ -1861,9 +1828,9 @@ static void end_bio_extent_preparewrite(struct bio *bio, int err)
1861 bio_put(bio); 1828 bio_put(bio);
1862} 1829}
1863 1830
1864static struct bio * 1831struct bio *
1865extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 1832btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1866 gfp_t gfp_flags) 1833 gfp_t gfp_flags)
1867{ 1834{
1868 struct bio *bio; 1835 struct bio *bio;
1869 1836
@@ -1890,10 +1857,8 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1890 struct page *page = bvec->bv_page; 1857 struct page *page = bvec->bv_page;
1891 struct extent_io_tree *tree = bio->bi_private; 1858 struct extent_io_tree *tree = bio->bi_private;
1892 u64 start; 1859 u64 start;
1893 u64 end;
1894 1860
1895 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1861 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1896 end = start + bvec->bv_len - 1;
1897 1862
1898 bio->bi_private = NULL; 1863 bio->bi_private = NULL;
1899 1864
@@ -1901,7 +1866,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1901 1866
1902 if (tree->ops && tree->ops->submit_bio_hook) 1867 if (tree->ops && tree->ops->submit_bio_hook)
1903 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1868 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1904 mirror_num, bio_flags); 1869 mirror_num, bio_flags, start);
1905 else 1870 else
1906 submit_bio(rw, bio); 1871 submit_bio(rw, bio);
1907 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1872 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1954,7 +1919,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1954 else 1919 else
1955 nr = bio_get_nr_vecs(bdev); 1920 nr = bio_get_nr_vecs(bdev);
1956 1921
1957 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1922 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1958 1923
1959 bio_add_page(bio, page, page_size, offset); 1924 bio_add_page(bio, page, page_size, offset);
1960 bio->bi_end_io = end_io_func; 1925 bio->bi_end_io = end_io_func;
@@ -2005,6 +1970,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2005 sector_t sector; 1970 sector_t sector;
2006 struct extent_map *em; 1971 struct extent_map *em;
2007 struct block_device *bdev; 1972 struct block_device *bdev;
1973 struct btrfs_ordered_extent *ordered;
2008 int ret; 1974 int ret;
2009 int nr = 0; 1975 int nr = 0;
2010 size_t page_offset = 0; 1976 size_t page_offset = 0;
@@ -2016,7 +1982,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2016 set_page_extent_mapped(page); 1982 set_page_extent_mapped(page);
2017 1983
2018 end = page_end; 1984 end = page_end;
2019 lock_extent(tree, start, end, GFP_NOFS); 1985 while (1) {
1986 lock_extent(tree, start, end, GFP_NOFS);
1987 ordered = btrfs_lookup_ordered_extent(inode, start);
1988 if (!ordered)
1989 break;
1990 unlock_extent(tree, start, end, GFP_NOFS);
1991 btrfs_start_ordered_extent(inode, ordered, 1);
1992 btrfs_put_ordered_extent(ordered);
1993 }
2020 1994
2021 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 1995 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2022 char *userpage; 1996 char *userpage;
@@ -2054,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2054 BUG_ON(extent_map_end(em) <= cur); 2028 BUG_ON(extent_map_end(em) <= cur);
2055 BUG_ON(end < cur); 2029 BUG_ON(end < cur);
2056 2030
2057 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2058 this_bio_flag = EXTENT_BIO_COMPRESSED; 2032 this_bio_flag = EXTENT_BIO_COMPRESSED;
2033 extent_set_compress_type(&this_bio_flag,
2034 em->compress_type);
2035 }
2059 2036
2060 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2037 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2061 cur_end = min(extent_map_end(em) - 1, end); 2038 cur_end = min(extent_map_end(em) - 1, end);
@@ -2184,7 +2161,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2184 u64 last_byte = i_size_read(inode); 2161 u64 last_byte = i_size_read(inode);
2185 u64 block_start; 2162 u64 block_start;
2186 u64 iosize; 2163 u64 iosize;
2187 u64 unlock_start;
2188 sector_t sector; 2164 sector_t sector;
2189 struct extent_state *cached_state = NULL; 2165 struct extent_state *cached_state = NULL;
2190 struct extent_map *em; 2166 struct extent_map *em;
@@ -2309,7 +2285,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2309 if (tree->ops && tree->ops->writepage_end_io_hook) 2285 if (tree->ops && tree->ops->writepage_end_io_hook)
2310 tree->ops->writepage_end_io_hook(page, start, 2286 tree->ops->writepage_end_io_hook(page, start,
2311 page_end, NULL, 1); 2287 page_end, NULL, 1);
2312 unlock_start = page_end + 1;
2313 goto done; 2288 goto done;
2314 } 2289 }
2315 2290
@@ -2320,7 +2295,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2320 if (tree->ops && tree->ops->writepage_end_io_hook) 2295 if (tree->ops && tree->ops->writepage_end_io_hook)
2321 tree->ops->writepage_end_io_hook(page, cur, 2296 tree->ops->writepage_end_io_hook(page, cur,
2322 page_end, NULL, 1); 2297 page_end, NULL, 1);
2323 unlock_start = page_end + 1;
2324 break; 2298 break;
2325 } 2299 }
2326 em = epd->get_extent(inode, page, pg_offset, cur, 2300 em = epd->get_extent(inode, page, pg_offset, cur,
@@ -2367,7 +2341,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2367 2341
2368 cur += iosize; 2342 cur += iosize;
2369 pg_offset += iosize; 2343 pg_offset += iosize;
2370 unlock_start = cur;
2371 continue; 2344 continue;
2372 } 2345 }
2373 /* leave this out until we have a page_mkwrite call */ 2346 /* leave this out until we have a page_mkwrite call */
@@ -2453,7 +2426,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2453 pgoff_t index; 2426 pgoff_t index;
2454 pgoff_t end; /* Inclusive */ 2427 pgoff_t end; /* Inclusive */
2455 int scanned = 0; 2428 int scanned = 0;
2456 int range_whole = 0;
2457 2429
2458 pagevec_init(&pvec, 0); 2430 pagevec_init(&pvec, 0);
2459 if (wbc->range_cyclic) { 2431 if (wbc->range_cyclic) {
@@ -2462,8 +2434,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2462 } else { 2434 } else {
2463 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2435 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2464 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2436 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2465 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2466 range_whole = 1;
2467 scanned = 1; 2437 scanned = 1;
2468 } 2438 }
2469retry: 2439retry:
@@ -2574,7 +2544,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2574 .sync_io = wbc->sync_mode == WB_SYNC_ALL, 2544 .sync_io = wbc->sync_mode == WB_SYNC_ALL,
2575 }; 2545 };
2576 struct writeback_control wbc_writepages = { 2546 struct writeback_control wbc_writepages = {
2577 .bdi = wbc->bdi,
2578 .sync_mode = wbc->sync_mode, 2547 .sync_mode = wbc->sync_mode,
2579 .older_than_this = NULL, 2548 .older_than_this = NULL,
2580 .nr_to_write = 64, 2549 .nr_to_write = 64,
@@ -2608,7 +2577,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2608 .sync_io = mode == WB_SYNC_ALL, 2577 .sync_io = mode == WB_SYNC_ALL,
2609 }; 2578 };
2610 struct writeback_control wbc_writepages = { 2579 struct writeback_control wbc_writepages = {
2611 .bdi = inode->i_mapping->backing_dev_info,
2612 .sync_mode = mode, 2580 .sync_mode = mode,
2613 .older_than_this = NULL, 2581 .older_than_this = NULL,
2614 .nr_to_write = nr_pages * 2, 2582 .nr_to_write = nr_pages * 2,
@@ -2663,33 +2631,20 @@ int extent_readpages(struct extent_io_tree *tree,
2663{ 2631{
2664 struct bio *bio = NULL; 2632 struct bio *bio = NULL;
2665 unsigned page_idx; 2633 unsigned page_idx;
2666 struct pagevec pvec;
2667 unsigned long bio_flags = 0; 2634 unsigned long bio_flags = 0;
2668 2635
2669 pagevec_init(&pvec, 0);
2670 for (page_idx = 0; page_idx < nr_pages; page_idx++) { 2636 for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2671 struct page *page = list_entry(pages->prev, struct page, lru); 2637 struct page *page = list_entry(pages->prev, struct page, lru);
2672 2638
2673 prefetchw(&page->flags); 2639 prefetchw(&page->flags);
2674 list_del(&page->lru); 2640 list_del(&page->lru);
2675 /* 2641 if (!add_to_page_cache_lru(page, mapping,
2676 * what we want to do here is call add_to_page_cache_lru,
2677 * but that isn't exported, so we reproduce it here
2678 */
2679 if (!add_to_page_cache(page, mapping,
2680 page->index, GFP_KERNEL)) { 2642 page->index, GFP_KERNEL)) {
2681
2682 /* open coding of lru_cache_add, also not exported */
2683 page_cache_get(page);
2684 if (!pagevec_add(&pvec, page))
2685 __pagevec_lru_add_file(&pvec);
2686 __extent_read_full_page(tree, page, get_extent, 2643 __extent_read_full_page(tree, page, get_extent,
2687 &bio, 0, &bio_flags); 2644 &bio, 0, &bio_flags);
2688 } 2645 }
2689 page_cache_release(page); 2646 page_cache_release(page);
2690 } 2647 }
2691 if (pagevec_count(&pvec))
2692 __pagevec_lru_add_file(&pvec);
2693 BUG_ON(!list_empty(pages)); 2648 BUG_ON(!list_empty(pages));
2694 if (bio) 2649 if (bio)
2695 submit_one_bio(READ, bio, 0, bio_flags); 2650 submit_one_bio(READ, bio, 0, bio_flags);
@@ -2704,6 +2659,7 @@ int extent_readpages(struct extent_io_tree *tree,
2704int extent_invalidatepage(struct extent_io_tree *tree, 2659int extent_invalidatepage(struct extent_io_tree *tree,
2705 struct page *page, unsigned long offset) 2660 struct page *page, unsigned long offset)
2706{ 2661{
2662 struct extent_state *cached_state = NULL;
2707 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); 2663 u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2708 u64 end = start + PAGE_CACHE_SIZE - 1; 2664 u64 end = start + PAGE_CACHE_SIZE - 1;
2709 size_t blocksize = page->mapping->host->i_sb->s_blocksize; 2665 size_t blocksize = page->mapping->host->i_sb->s_blocksize;
@@ -2712,12 +2668,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
2712 if (start > end) 2668 if (start > end)
2713 return 0; 2669 return 0;
2714 2670
2715 lock_extent(tree, start, end, GFP_NOFS); 2671 lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
2716 wait_on_page_writeback(page); 2672 wait_on_page_writeback(page);
2717 clear_extent_bit(tree, start, end, 2673 clear_extent_bit(tree, start, end,
2718 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 2674 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
2719 EXTENT_DO_ACCOUNTING, 2675 EXTENT_DO_ACCOUNTING,
2720 1, 1, NULL, GFP_NOFS); 2676 1, 1, &cached_state, GFP_NOFS);
2721 return 0; 2677 return 0;
2722} 2678}
2723 2679
@@ -2817,6 +2773,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
2817 NULL, 1, 2773 NULL, 1,
2818 end_bio_extent_preparewrite, 0, 2774 end_bio_extent_preparewrite, 0,
2819 0, 0); 2775 0, 0);
2776 if (ret && !err)
2777 err = ret;
2820 iocount++; 2778 iocount++;
2821 block_start = block_start + iosize; 2779 block_start = block_start + iosize;
2822 } else { 2780 } else {
@@ -2920,16 +2878,17 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2920 get_extent_t *get_extent) 2878 get_extent_t *get_extent)
2921{ 2879{
2922 struct inode *inode = mapping->host; 2880 struct inode *inode = mapping->host;
2881 struct extent_state *cached_state = NULL;
2923 u64 start = iblock << inode->i_blkbits; 2882 u64 start = iblock << inode->i_blkbits;
2924 sector_t sector = 0; 2883 sector_t sector = 0;
2925 size_t blksize = (1 << inode->i_blkbits); 2884 size_t blksize = (1 << inode->i_blkbits);
2926 struct extent_map *em; 2885 struct extent_map *em;
2927 2886
2928 lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2887 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2929 GFP_NOFS); 2888 0, &cached_state, GFP_NOFS);
2930 em = get_extent(inode, NULL, 0, start, blksize, 0); 2889 em = get_extent(inode, NULL, 0, start, blksize, 0);
2931 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2890 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
2932 GFP_NOFS); 2891 start + blksize - 1, &cached_state, GFP_NOFS);
2933 if (!em || IS_ERR(em)) 2892 if (!em || IS_ERR(em))
2934 return 0; 2893 return 0;
2935 2894
@@ -2945,22 +2904,55 @@ out:
2945int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2904int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2946 __u64 start, __u64 len, get_extent_t *get_extent) 2905 __u64 start, __u64 len, get_extent_t *get_extent)
2947{ 2906{
2948 int ret; 2907 int ret = 0;
2949 u64 off = start; 2908 u64 off = start;
2950 u64 max = start + len; 2909 u64 max = start + len;
2951 u32 flags = 0; 2910 u32 flags = 0;
2911 u32 found_type;
2912 u64 last;
2952 u64 disko = 0; 2913 u64 disko = 0;
2914 struct btrfs_key found_key;
2953 struct extent_map *em = NULL; 2915 struct extent_map *em = NULL;
2916 struct extent_state *cached_state = NULL;
2917 struct btrfs_path *path;
2918 struct btrfs_file_extent_item *item;
2954 int end = 0; 2919 int end = 0;
2955 u64 em_start = 0, em_len = 0; 2920 u64 em_start = 0, em_len = 0;
2956 unsigned long emflags; 2921 unsigned long emflags;
2957 ret = 0; 2922 int hole = 0;
2958 2923
2959 if (len == 0) 2924 if (len == 0)
2960 return -EINVAL; 2925 return -EINVAL;
2961 2926
2962 lock_extent(&BTRFS_I(inode)->io_tree, start, start + len, 2927 path = btrfs_alloc_path();
2963 GFP_NOFS); 2928 if (!path)
2929 return -ENOMEM;
2930 path->leave_spinning = 1;
2931
2932 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2933 path, inode->i_ino, -1, 0);
2934 if (ret < 0) {
2935 btrfs_free_path(path);
2936 return ret;
2937 }
2938 WARN_ON(!ret);
2939 path->slots[0]--;
2940 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2941 struct btrfs_file_extent_item);
2942 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2943 found_type = btrfs_key_type(&found_key);
2944
2945 /* No extents, just return */
2946 if (found_key.objectid != inode->i_ino ||
2947 found_type != BTRFS_EXTENT_DATA_KEY) {
2948 btrfs_free_path(path);
2949 return 0;
2950 }
2951 last = found_key.offset;
2952 btrfs_free_path(path);
2953
2954 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2955 &cached_state, GFP_NOFS);
2964 em = get_extent(inode, NULL, 0, off, max - off, 0); 2956 em = get_extent(inode, NULL, 0, off, max - off, 0);
2965 if (!em) 2957 if (!em)
2966 goto out; 2958 goto out;
@@ -2968,11 +2960,18 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2968 ret = PTR_ERR(em); 2960 ret = PTR_ERR(em);
2969 goto out; 2961 goto out;
2970 } 2962 }
2963
2971 while (!end) { 2964 while (!end) {
2965 hole = 0;
2972 off = em->start + em->len; 2966 off = em->start + em->len;
2973 if (off >= max) 2967 if (off >= max)
2974 end = 1; 2968 end = 1;
2975 2969
2970 if (em->block_start == EXTENT_MAP_HOLE) {
2971 hole = 1;
2972 goto next;
2973 }
2974
2976 em_start = em->start; 2975 em_start = em->start;
2977 em_len = em->len; 2976 em_len = em->len;
2978 2977
@@ -2982,8 +2981,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2982 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2981 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2983 end = 1; 2982 end = 1;
2984 flags |= FIEMAP_EXTENT_LAST; 2983 flags |= FIEMAP_EXTENT_LAST;
2985 } else if (em->block_start == EXTENT_MAP_HOLE) {
2986 flags |= FIEMAP_EXTENT_UNWRITTEN;
2987 } else if (em->block_start == EXTENT_MAP_INLINE) { 2984 } else if (em->block_start == EXTENT_MAP_INLINE) {
2988 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2985 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2989 FIEMAP_EXTENT_NOT_ALIGNED); 2986 FIEMAP_EXTENT_NOT_ALIGNED);
@@ -2996,10 +2993,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2996 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2993 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2997 flags |= FIEMAP_EXTENT_ENCODED; 2994 flags |= FIEMAP_EXTENT_ENCODED;
2998 2995
2996next:
2999 emflags = em->flags; 2997 emflags = em->flags;
3000 free_extent_map(em); 2998 free_extent_map(em);
3001 em = NULL; 2999 em = NULL;
3002
3003 if (!end) { 3000 if (!end) {
3004 em = get_extent(inode, NULL, 0, off, max - off, 0); 3001 em = get_extent(inode, NULL, 0, off, max - off, 0);
3005 if (!em) 3002 if (!em)
@@ -3010,21 +3007,29 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3010 } 3007 }
3011 emflags = em->flags; 3008 emflags = em->flags;
3012 } 3009 }
3010
3013 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { 3011 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
3014 flags |= FIEMAP_EXTENT_LAST; 3012 flags |= FIEMAP_EXTENT_LAST;
3015 end = 1; 3013 end = 1;
3016 } 3014 }
3017 3015
3018 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 3016 if (em_start == last) {
3019 em_len, flags); 3017 flags |= FIEMAP_EXTENT_LAST;
3020 if (ret) 3018 end = 1;
3021 goto out_free; 3019 }
3020
3021 if (!hole) {
3022 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3023 em_len, flags);
3024 if (ret)
3025 goto out_free;
3026 }
3022 } 3027 }
3023out_free: 3028out_free:
3024 free_extent_map(em); 3029 free_extent_map(em);
3025out: 3030out:
3026 unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len, 3031 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
3027 GFP_NOFS); 3032 &cached_state, GFP_NOFS);
3028 return ret; 3033 return ret;
3029} 3034}
3030 3035
@@ -3070,6 +3075,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3070#endif 3075#endif
3071 3076
3072 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 3077 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3078 if (eb == NULL)
3079 return NULL;
3073 eb->start = start; 3080 eb->start = start;
3074 eb->len = len; 3081 eb->len = len;
3075 spin_lock_init(&eb->lock); 3082 spin_lock_init(&eb->lock);
@@ -3096,6 +3103,39 @@ static void __free_extent_buffer(struct extent_buffer *eb)
3096 kmem_cache_free(extent_buffer_cache, eb); 3103 kmem_cache_free(extent_buffer_cache, eb);
3097} 3104}
3098 3105
3106/*
3107 * Helper for releasing extent buffer page.
3108 */
3109static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3110 unsigned long start_idx)
3111{
3112 unsigned long index;
3113 struct page *page;
3114
3115 if (!eb->first_page)
3116 return;
3117
3118 index = num_extent_pages(eb->start, eb->len);
3119 if (start_idx >= index)
3120 return;
3121
3122 do {
3123 index--;
3124 page = extent_buffer_page(eb, index);
3125 if (page)
3126 page_cache_release(page);
3127 } while (index != start_idx);
3128}
3129
3130/*
3131 * Helper for releasing the extent buffer.
3132 */
3133static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3134{
3135 btrfs_release_extent_buffer_page(eb, 0);
3136 __free_extent_buffer(eb);
3137}
3138
3099struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3139struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3100 u64 start, unsigned long len, 3140 u64 start, unsigned long len,
3101 struct page *page0, 3141 struct page *page0,
@@ -3109,16 +3149,16 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3109 struct page *p; 3149 struct page *p;
3110 struct address_space *mapping = tree->mapping; 3150 struct address_space *mapping = tree->mapping;
3111 int uptodate = 1; 3151 int uptodate = 1;
3152 int ret;
3112 3153
3113 spin_lock(&tree->buffer_lock); 3154 rcu_read_lock();
3114 eb = buffer_search(tree, start); 3155 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3115 if (eb) { 3156 if (eb && atomic_inc_not_zero(&eb->refs)) {
3116 atomic_inc(&eb->refs); 3157 rcu_read_unlock();
3117 spin_unlock(&tree->buffer_lock);
3118 mark_page_accessed(eb->first_page); 3158 mark_page_accessed(eb->first_page);
3119 return eb; 3159 return eb;
3120 } 3160 }
3121 spin_unlock(&tree->buffer_lock); 3161 rcu_read_unlock();
3122 3162
3123 eb = __alloc_extent_buffer(tree, start, len, mask); 3163 eb = __alloc_extent_buffer(tree, start, len, mask);
3124 if (!eb) 3164 if (!eb)
@@ -3157,27 +3197,31 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3157 if (uptodate) 3197 if (uptodate)
3158 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3198 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3159 3199
3200 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3201 if (ret)
3202 goto free_eb;
3203
3160 spin_lock(&tree->buffer_lock); 3204 spin_lock(&tree->buffer_lock);
3161 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3205 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3162 if (exists) { 3206 if (ret == -EEXIST) {
3207 exists = radix_tree_lookup(&tree->buffer,
3208 start >> PAGE_CACHE_SHIFT);
3163 /* add one reference for the caller */ 3209 /* add one reference for the caller */
3164 atomic_inc(&exists->refs); 3210 atomic_inc(&exists->refs);
3165 spin_unlock(&tree->buffer_lock); 3211 spin_unlock(&tree->buffer_lock);
3212 radix_tree_preload_end();
3166 goto free_eb; 3213 goto free_eb;
3167 } 3214 }
3168 spin_unlock(&tree->buffer_lock);
3169
3170 /* add one reference for the tree */ 3215 /* add one reference for the tree */
3171 atomic_inc(&eb->refs); 3216 atomic_inc(&eb->refs);
3217 spin_unlock(&tree->buffer_lock);
3218 radix_tree_preload_end();
3172 return eb; 3219 return eb;
3173 3220
3174free_eb: 3221free_eb:
3175 if (!atomic_dec_and_test(&eb->refs)) 3222 if (!atomic_dec_and_test(&eb->refs))
3176 return exists; 3223 return exists;
3177 for (index = 1; index < i; index++) 3224 btrfs_release_extent_buffer(eb);
3178 page_cache_release(extent_buffer_page(eb, index));
3179 page_cache_release(extent_buffer_page(eb, 0));
3180 __free_extent_buffer(eb);
3181 return exists; 3225 return exists;
3182} 3226}
3183 3227
@@ -3187,16 +3231,16 @@ struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3187{ 3231{
3188 struct extent_buffer *eb; 3232 struct extent_buffer *eb;
3189 3233
3190 spin_lock(&tree->buffer_lock); 3234 rcu_read_lock();
3191 eb = buffer_search(tree, start); 3235 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3192 if (eb) 3236 if (eb && atomic_inc_not_zero(&eb->refs)) {
3193 atomic_inc(&eb->refs); 3237 rcu_read_unlock();
3194 spin_unlock(&tree->buffer_lock);
3195
3196 if (eb)
3197 mark_page_accessed(eb->first_page); 3238 mark_page_accessed(eb->first_page);
3239 return eb;
3240 }
3241 rcu_read_unlock();
3198 3242
3199 return eb; 3243 return NULL;
3200} 3244}
3201 3245
3202void free_extent_buffer(struct extent_buffer *eb) 3246void free_extent_buffer(struct extent_buffer *eb)
@@ -3265,7 +3309,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
3265} 3309}
3266 3310
3267int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 3311int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3268 struct extent_buffer *eb) 3312 struct extent_buffer *eb,
3313 struct extent_state **cached_state)
3269{ 3314{
3270 unsigned long i; 3315 unsigned long i;
3271 struct page *page; 3316 struct page *page;
@@ -3275,7 +3320,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3275 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3320 clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3276 3321
3277 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3322 clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3278 GFP_NOFS); 3323 cached_state, GFP_NOFS);
3279 for (i = 0; i < num_pages; i++) { 3324 for (i = 0; i < num_pages; i++) {
3280 page = extent_buffer_page(eb, i); 3325 page = extent_buffer_page(eb, i);
3281 if (page) 3326 if (page)
@@ -3335,7 +3380,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
3335} 3380}
3336 3381
3337int extent_buffer_uptodate(struct extent_io_tree *tree, 3382int extent_buffer_uptodate(struct extent_io_tree *tree,
3338 struct extent_buffer *eb) 3383 struct extent_buffer *eb,
3384 struct extent_state *cached_state)
3339{ 3385{
3340 int ret = 0; 3386 int ret = 0;
3341 unsigned long num_pages; 3387 unsigned long num_pages;
@@ -3347,7 +3393,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3347 return 1; 3393 return 1;
3348 3394
3349 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, 3395 ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3350 EXTENT_UPTODATE, 1, NULL); 3396 EXTENT_UPTODATE, 1, cached_state);
3351 if (ret) 3397 if (ret)
3352 return ret; 3398 return ret;
3353 3399
@@ -3824,34 +3870,47 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3824 } 3870 }
3825} 3871}
3826 3872
3873static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
3874{
3875 struct extent_buffer *eb =
3876 container_of(head, struct extent_buffer, rcu_head);
3877
3878 btrfs_release_extent_buffer(eb);
3879}
3880
3827int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) 3881int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3828{ 3882{
3829 u64 start = page_offset(page); 3883 u64 start = page_offset(page);
3830 struct extent_buffer *eb; 3884 struct extent_buffer *eb;
3831 int ret = 1; 3885 int ret = 1;
3832 unsigned long i;
3833 unsigned long num_pages;
3834 3886
3835 spin_lock(&tree->buffer_lock); 3887 spin_lock(&tree->buffer_lock);
3836 eb = buffer_search(tree, start); 3888 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3837 if (!eb) 3889 if (!eb) {
3838 goto out; 3890 spin_unlock(&tree->buffer_lock);
3891 return ret;
3892 }
3839 3893
3840 if (atomic_read(&eb->refs) > 1) { 3894 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3841 ret = 0; 3895 ret = 0;
3842 goto out; 3896 goto out;
3843 } 3897 }
3844 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3898
3899 /*
3900 * set @eb->refs to 0 if it is already 1, and then release the @eb.
3901 * Or go back.
3902 */
3903 if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
3845 ret = 0; 3904 ret = 0;
3846 goto out; 3905 goto out;
3847 } 3906 }
3848 /* at this point we can safely release the extent buffer */ 3907
3849 num_pages = num_extent_pages(eb->start, eb->len); 3908 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3850 for (i = 0; i < num_pages; i++)
3851 page_cache_release(extent_buffer_page(eb, i));
3852 rb_erase(&eb->rb_node, &tree->buffer);
3853 __free_extent_buffer(eb);
3854out: 3909out:
3855 spin_unlock(&tree->buffer_lock); 3910 spin_unlock(&tree->buffer_lock);
3911
3912 /* at this point we can safely release the extent buffer */
3913 if (atomic_read(&eb->refs) == 0)
3914 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
3856 return ret; 3915 return ret;
3857} 3916}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 36de250a7b2b..7083cfafd061 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,10 +16,16 @@
16#define EXTENT_BOUNDARY (1 << 9) 16#define EXTENT_BOUNDARY (1 << 9)
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12)
19#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
20 22
21/* flags for bio submission */ 23/*
24 * flags for bio submission. The high bits indicate the compression
25 * type for this bio
26 */
22#define EXTENT_BIO_COMPRESSED 1 27#define EXTENT_BIO_COMPRESSED 1
28#define EXTENT_BIO_FLAG_SHIFT 16
23 29
24/* these are bit numbers for test/set bit */ 30/* these are bit numbers for test/set bit */
25#define EXTENT_BUFFER_UPTODATE 0 31#define EXTENT_BUFFER_UPTODATE 0
@@ -47,7 +53,7 @@ struct extent_state;
47 53
48typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, 54typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
49 struct bio *bio, int mirror_num, 55 struct bio *bio, int mirror_num,
50 unsigned long bio_flags); 56 unsigned long bio_flags, u64 bio_offset);
51struct extent_io_ops { 57struct extent_io_ops {
52 int (*fill_delalloc)(struct inode *inode, struct page *locked_page, 58 int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
53 u64 start, u64 end, int *page_started, 59 u64 start, u64 end, int *page_started,
@@ -69,10 +75,10 @@ struct extent_io_ops {
69 struct extent_state *state); 75 struct extent_state *state);
70 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 76 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
71 struct extent_state *state, int uptodate); 77 struct extent_state *state, int uptodate);
72 int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, 78 int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
73 unsigned long old, unsigned long bits); 79 int *bits);
74 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 80 int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
75 unsigned long bits); 81 int *bits);
76 int (*merge_extent_hook)(struct inode *inode, 82 int (*merge_extent_hook)(struct inode *inode,
77 struct extent_state *new, 83 struct extent_state *new,
78 struct extent_state *other); 84 struct extent_state *other);
@@ -83,7 +89,7 @@ struct extent_io_ops {
83 89
84struct extent_io_tree { 90struct extent_io_tree {
85 struct rb_root state; 91 struct rb_root state;
86 struct rb_root buffer; 92 struct radix_tree_root buffer;
87 struct address_space *mapping; 93 struct address_space *mapping;
88 u64 dirty_bytes; 94 u64 dirty_bytes;
89 spinlock_t lock; 95 spinlock_t lock;
@@ -121,7 +127,7 @@ struct extent_buffer {
121 unsigned long bflags; 127 unsigned long bflags;
122 atomic_t refs; 128 atomic_t refs;
123 struct list_head leak_list; 129 struct list_head leak_list;
124 struct rb_node rb_node; 130 struct rcu_head rcu_head;
125 131
126 /* the spinlock is used to protect most operations */ 132 /* the spinlock is used to protect most operations */
127 spinlock_t lock; 133 spinlock_t lock;
@@ -133,6 +139,17 @@ struct extent_buffer {
133 wait_queue_head_t lock_wq; 139 wait_queue_head_t lock_wq;
134}; 140};
135 141
142static inline void extent_set_compress_type(unsigned long *bio_flags,
143 int compress_type)
144{
145 *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
146}
147
148static inline int extent_compress_type(unsigned long bio_flags)
149{
150 return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
151}
152
136struct extent_map_tree; 153struct extent_map_tree;
137 154
138static inline struct extent_state *extent_state_next(struct extent_state *state) 155static inline struct extent_state *extent_state_next(struct extent_state *state)
@@ -163,6 +180,8 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
163int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 180int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
164 int bits, struct extent_state **cached, gfp_t mask); 181 int bits, struct extent_state **cached, gfp_t mask);
165int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); 182int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
183int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
184 struct extent_state **cached, gfp_t mask);
166int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 185int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
167 gfp_t mask); 186 gfp_t mask);
168int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 187int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
@@ -174,6 +193,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
174 u64 *start, u64 search_end, 193 u64 *start, u64 search_end,
175 u64 max_bytes, unsigned long bits); 194 u64 max_bytes, unsigned long bits);
176 195
196void free_extent_state(struct extent_state *state);
177int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 197int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
178 int bits, int filled, struct extent_state *cached_state); 198 int bits, int filled, struct extent_state *cached_state);
179int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 199int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -183,6 +203,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
183 gfp_t mask); 203 gfp_t mask);
184int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 204int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
185 int bits, gfp_t mask); 205 int bits, gfp_t mask);
206int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
207 int bits, int exclusive_bits, u64 *failed_start,
208 struct extent_state **cached_state, gfp_t mask);
186int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 209int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
187 gfp_t mask); 210 gfp_t mask);
188int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 211int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -196,7 +219,7 @@ int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
196int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start, 219int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
197 u64 end, gfp_t mask); 220 u64 end, gfp_t mask);
198int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 221int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
199 gfp_t mask); 222 struct extent_state **cached_state, gfp_t mask);
200int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, 223int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
201 gfp_t mask); 224 gfp_t mask);
202int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 225int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -281,9 +304,11 @@ int test_extent_buffer_dirty(struct extent_io_tree *tree,
281int set_extent_buffer_uptodate(struct extent_io_tree *tree, 304int set_extent_buffer_uptodate(struct extent_io_tree *tree,
282 struct extent_buffer *eb); 305 struct extent_buffer *eb);
283int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 306int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
284 struct extent_buffer *eb); 307 struct extent_buffer *eb,
308 struct extent_state **cached_state);
285int extent_buffer_uptodate(struct extent_io_tree *tree, 309int extent_buffer_uptodate(struct extent_io_tree *tree,
286 struct extent_buffer *eb); 310 struct extent_buffer *eb,
311 struct extent_state *cached_state);
287int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, 312int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
288 unsigned long min_len, char **token, char **map, 313 unsigned long min_len, char **token, char **map,
289 unsigned long *map_start, 314 unsigned long *map_start,
@@ -300,4 +325,7 @@ int extent_clear_unlock_delalloc(struct inode *inode,
300 struct extent_io_tree *tree, 325 struct extent_io_tree *tree,
301 u64 start, u64 end, struct page *locked_page, 326 u64 start, u64 end, struct page *locked_page,
302 unsigned long op); 327 unsigned long op);
328struct bio *
329btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
330 gfp_t gfp_flags);
303#endif 331#endif
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 428fcac45f90..b0e1fce12530 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -1,9 +1,9 @@
1#include <linux/err.h> 1#include <linux/err.h>
2#include <linux/gfp.h>
3#include <linux/slab.h> 2#include <linux/slab.h>
4#include <linux/module.h> 3#include <linux/module.h>
5#include <linux/spinlock.h> 4#include <linux/spinlock.h>
6#include <linux/hardirq.h> 5#include <linux/hardirq.h>
6#include "ctree.h"
7#include "extent_map.h" 7#include "extent_map.h"
8 8
9 9
@@ -35,7 +35,7 @@ void extent_map_exit(void)
35 */ 35 */
36void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) 36void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
37{ 37{
38 tree->map.rb_node = NULL; 38 tree->map = RB_ROOT;
39 rwlock_init(&tree->lock); 39 rwlock_init(&tree->lock);
40} 40}
41 41
@@ -55,6 +55,7 @@ struct extent_map *alloc_extent_map(gfp_t mask)
55 return em; 55 return em;
56 em->in_tree = 0; 56 em->in_tree = 0;
57 em->flags = 0; 57 em->flags = 0;
58 em->compress_type = BTRFS_COMPRESS_NONE;
58 atomic_set(&em->refs, 1); 59 atomic_set(&em->refs, 1);
59 return em; 60 return em;
60} 61}
@@ -336,7 +337,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
336 goto out; 337 goto out;
337 } 338 }
338 if (IS_ERR(rb_node)) { 339 if (IS_ERR(rb_node)) {
339 em = ERR_PTR(PTR_ERR(rb_node)); 340 em = ERR_CAST(rb_node);
340 goto out; 341 goto out;
341 } 342 }
342 em = rb_entry(rb_node, struct extent_map, rb_node); 343 em = rb_entry(rb_node, struct extent_map, rb_node);
@@ -385,7 +386,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
385 goto out; 386 goto out;
386 } 387 }
387 if (IS_ERR(rb_node)) { 388 if (IS_ERR(rb_node)) {
388 em = ERR_PTR(PTR_ERR(rb_node)); 389 em = ERR_CAST(rb_node);
389 goto out; 390 goto out;
390 } 391 }
391 em = rb_entry(rb_node, struct extent_map, rb_node); 392 em = rb_entry(rb_node, struct extent_map, rb_node);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e647..28b44dbd1e35 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
26 unsigned long flags; 26 unsigned long flags;
27 struct block_device *bdev; 27 struct block_device *bdev;
28 atomic_t refs; 28 atomic_t refs;
29 int in_tree; 29 unsigned int in_tree:1;
30 unsigned int compress_type:4;
30}; 31};
31 32
32struct extent_map_tree { 33struct extent_map_tree {
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 9b99886562d0..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/pagemap.h> 21#include <linux/pagemap.h>
21#include <linux/highmem.h> 22#include <linux/highmem.h>
22#include "ctree.h" 23#include "ctree.h"
@@ -148,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
148} 149}
149 150
150 151
151int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, 152static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
152 struct bio *bio, u32 *dst) 153 struct inode *inode, struct bio *bio,
154 u64 logical_offset, u32 *dst, int dio)
153{ 155{
154 u32 sum; 156 u32 sum;
155 struct bio_vec *bvec = bio->bi_io_vec; 157 struct bio_vec *bvec = bio->bi_io_vec;
156 int bio_index = 0; 158 int bio_index = 0;
157 u64 offset; 159 u64 offset = 0;
158 u64 item_start_offset = 0; 160 u64 item_start_offset = 0;
159 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
160 u64 disk_bytenr; 162 u64 disk_bytenr;
@@ -173,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
173 WARN_ON(bio->bi_vcnt <= 0); 175 WARN_ON(bio->bi_vcnt <= 0);
174 176
175 disk_bytenr = (u64)bio->bi_sector << 9; 177 disk_bytenr = (u64)bio->bi_sector << 9;
178 if (dio)
179 offset = logical_offset;
176 while (bio_index < bio->bi_vcnt) { 180 while (bio_index < bio->bi_vcnt) {
177 offset = page_offset(bvec->bv_page) + bvec->bv_offset; 181 if (!dio)
182 offset = page_offset(bvec->bv_page) + bvec->bv_offset;
178 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); 183 ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
179 if (ret == 0) 184 if (ret == 0)
180 goto found; 185 goto found;
@@ -237,6 +242,7 @@ found:
237 else 242 else
238 set_state_private(io_tree, offset, sum); 243 set_state_private(io_tree, offset, sum);
239 disk_bytenr += bvec->bv_len; 244 disk_bytenr += bvec->bv_len;
245 offset += bvec->bv_len;
240 bio_index++; 246 bio_index++;
241 bvec++; 247 bvec++;
242 } 248 }
@@ -244,6 +250,18 @@ found:
244 return 0; 250 return 0;
245} 251}
246 252
253int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
254 struct bio *bio, u32 *dst)
255{
256 return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
257}
258
259int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
260 struct bio *bio, u64 offset, u32 *dst)
261{
262 return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
263}
264
247int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
248 struct list_head *list) 266 struct list_head *list)
249{ 267{
@@ -656,6 +674,9 @@ again:
656 goto found; 674 goto found;
657 } 675 }
658 ret = PTR_ERR(item); 676 ret = PTR_ERR(item);
677 if (ret != -EFBIG && ret != -ENOENT)
678 goto fail_unlock;
679
659 if (ret == -EFBIG) { 680 if (ret == -EFBIG) {
660 u32 item_size; 681 u32 item_size;
661 /* we found one, but it isn't big enough yet */ 682 /* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c02033596f02..c800d58f3013 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,10 +24,12 @@
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/falloc.h>
27#include <linux/swap.h> 28#include <linux/swap.h>
28#include <linux/writeback.h> 29#include <linux/writeback.h>
29#include <linux/statfs.h> 30#include <linux/statfs.h>
30#include <linux/compat.h> 31#include <linux/compat.h>
32#include <linux/slab.h>
31#include "ctree.h" 33#include "ctree.h"
32#include "disk-io.h" 34#include "disk-io.h"
33#include "transaction.h" 35#include "transaction.h"
@@ -45,32 +47,46 @@
45static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 47static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
46 int write_bytes, 48 int write_bytes,
47 struct page **prepared_pages, 49 struct page **prepared_pages,
48 const char __user *buf) 50 struct iov_iter *i)
49{ 51{
50 long page_fault = 0; 52 size_t copied = 0;
51 int i; 53 int pg = 0;
52 int offset = pos & (PAGE_CACHE_SIZE - 1); 54 int offset = pos & (PAGE_CACHE_SIZE - 1);
55 int total_copied = 0;
53 56
54 for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { 57 while (write_bytes > 0) {
55 size_t count = min_t(size_t, 58 size_t count = min_t(size_t,
56 PAGE_CACHE_SIZE - offset, write_bytes); 59 PAGE_CACHE_SIZE - offset, write_bytes);
57 struct page *page = prepared_pages[i]; 60 struct page *page = prepared_pages[pg];
58 fault_in_pages_readable(buf, count); 61 /*
62 * Copy data from userspace to the current page
63 *
64 * Disable pagefault to avoid recursive lock since
65 * the pages are already locked
66 */
67 pagefault_disable();
68 copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
69 pagefault_enable();
59 70
60 /* Copy data from userspace to the current page */
61 kmap(page);
62 page_fault = __copy_from_user(page_address(page) + offset,
63 buf, count);
64 /* Flush processor's dcache for this page */ 71 /* Flush processor's dcache for this page */
65 flush_dcache_page(page); 72 flush_dcache_page(page);
66 kunmap(page); 73 iov_iter_advance(i, copied);
67 buf += count; 74 write_bytes -= copied;
68 write_bytes -= count; 75 total_copied += copied;
69 76
70 if (page_fault) 77 /* Return to btrfs_file_aio_write to fault page */
78 if (unlikely(copied == 0)) {
71 break; 79 break;
80 }
81
82 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
83 offset += copied;
84 } else {
85 pg++;
86 offset = 0;
87 }
72 } 88 }
73 return page_fault ? -EFAULT : 0; 89 return total_copied;
74} 90}
75 91
76/* 92/*
@@ -123,9 +139,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
123 root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 139 root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
124 140
125 end_of_last_block = start_pos + num_bytes - 1; 141 end_of_last_block = start_pos + num_bytes - 1;
126 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); 142 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
127 if (err) 143 NULL);
128 return err; 144 BUG_ON(err);
129 145
130 for (i = 0; i < num_pages; i++) { 146 for (i = 0; i < num_pages; i++) {
131 struct page *p = pages[i]; 147 struct page *p = pages[i];
@@ -140,7 +156,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
140 * at this time. 156 * at this time.
141 */ 157 */
142 } 158 }
143 return err; 159 return 0;
144} 160}
145 161
146/* 162/*
@@ -209,6 +225,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
209 225
210 split->bdev = em->bdev; 226 split->bdev = em->bdev;
211 split->flags = flags; 227 split->flags = flags;
228 split->compress_type = em->compress_type;
212 ret = add_extent_mapping(em_tree, split); 229 ret = add_extent_mapping(em_tree, split);
213 BUG_ON(ret); 230 BUG_ON(ret);
214 free_extent_map(split); 231 free_extent_map(split);
@@ -223,6 +240,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
223 split->len = em->start + em->len - (start + len); 240 split->len = em->start + em->len - (start + len);
224 split->bdev = em->bdev; 241 split->bdev = em->bdev;
225 split->flags = flags; 242 split->flags = flags;
243 split->compress_type = em->compress_type;
226 244
227 if (compressed) { 245 if (compressed) {
228 split->block_len = em->block_len; 246 split->block_len = em->block_len;
@@ -720,13 +738,15 @@ again:
720 inode->i_ino, orig_offset); 738 inode->i_ino, orig_offset);
721 BUG_ON(ret); 739 BUG_ON(ret);
722 } 740 }
723 fi = btrfs_item_ptr(leaf, path->slots[0],
724 struct btrfs_file_extent_item);
725 if (del_nr == 0) { 741 if (del_nr == 0) {
742 fi = btrfs_item_ptr(leaf, path->slots[0],
743 struct btrfs_file_extent_item);
726 btrfs_set_file_extent_type(leaf, fi, 744 btrfs_set_file_extent_type(leaf, fi,
727 BTRFS_FILE_EXTENT_REG); 745 BTRFS_FILE_EXTENT_REG);
728 btrfs_mark_buffer_dirty(leaf); 746 btrfs_mark_buffer_dirty(leaf);
729 } else { 747 } else {
748 fi = btrfs_item_ptr(leaf, del_slot - 1,
749 struct btrfs_file_extent_item);
730 btrfs_set_file_extent_type(leaf, fi, 750 btrfs_set_file_extent_type(leaf, fi,
731 BTRFS_FILE_EXTENT_REG); 751 BTRFS_FILE_EXTENT_REG);
732 btrfs_set_file_extent_num_bytes(leaf, fi, 752 btrfs_set_file_extent_num_bytes(leaf, fi,
@@ -751,6 +771,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
751 loff_t pos, unsigned long first_index, 771 loff_t pos, unsigned long first_index,
752 unsigned long last_index, size_t write_bytes) 772 unsigned long last_index, size_t write_bytes)
753{ 773{
774 struct extent_state *cached_state = NULL;
754 int i; 775 int i;
755 unsigned long index = pos >> PAGE_CACHE_SHIFT; 776 unsigned long index = pos >> PAGE_CACHE_SHIFT;
756 struct inode *inode = fdentry(file)->d_inode; 777 struct inode *inode = fdentry(file)->d_inode;
@@ -779,16 +800,18 @@ again:
779 } 800 }
780 if (start_pos < inode->i_size) { 801 if (start_pos < inode->i_size) {
781 struct btrfs_ordered_extent *ordered; 802 struct btrfs_ordered_extent *ordered;
782 lock_extent(&BTRFS_I(inode)->io_tree, 803 lock_extent_bits(&BTRFS_I(inode)->io_tree,
783 start_pos, last_pos - 1, GFP_NOFS); 804 start_pos, last_pos - 1, 0, &cached_state,
805 GFP_NOFS);
784 ordered = btrfs_lookup_first_ordered_extent(inode, 806 ordered = btrfs_lookup_first_ordered_extent(inode,
785 last_pos - 1); 807 last_pos - 1);
786 if (ordered && 808 if (ordered &&
787 ordered->file_offset + ordered->len > start_pos && 809 ordered->file_offset + ordered->len > start_pos &&
788 ordered->file_offset < last_pos) { 810 ordered->file_offset < last_pos) {
789 btrfs_put_ordered_extent(ordered); 811 btrfs_put_ordered_extent(ordered);
790 unlock_extent(&BTRFS_I(inode)->io_tree, 812 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
791 start_pos, last_pos - 1, GFP_NOFS); 813 start_pos, last_pos - 1,
814 &cached_state, GFP_NOFS);
792 for (i = 0; i < num_pages; i++) { 815 for (i = 0; i < num_pages; i++) {
793 unlock_page(pages[i]); 816 unlock_page(pages[i]);
794 page_cache_release(pages[i]); 817 page_cache_release(pages[i]);
@@ -800,12 +823,13 @@ again:
800 if (ordered) 823 if (ordered)
801 btrfs_put_ordered_extent(ordered); 824 btrfs_put_ordered_extent(ordered);
802 825
803 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, 826 clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
804 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC | 827 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
805 EXTENT_DO_ACCOUNTING, 828 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
806 GFP_NOFS); 829 GFP_NOFS);
807 unlock_extent(&BTRFS_I(inode)->io_tree, 830 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
808 start_pos, last_pos - 1, GFP_NOFS); 831 start_pos, last_pos - 1, &cached_state,
832 GFP_NOFS);
809 } 833 }
810 for (i = 0; i < num_pages; i++) { 834 for (i = 0; i < num_pages; i++) {
811 clear_page_dirty_for_io(pages[i]); 835 clear_page_dirty_for_io(pages[i]);
@@ -815,45 +839,48 @@ again:
815 return 0; 839 return 0;
816} 840}
817 841
818static ssize_t btrfs_file_write(struct file *file, const char __user *buf, 842static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
819 size_t count, loff_t *ppos) 843 const struct iovec *iov,
844 unsigned long nr_segs, loff_t pos)
820{ 845{
821 loff_t pos; 846 struct file *file = iocb->ki_filp;
847 struct inode *inode = fdentry(file)->d_inode;
848 struct btrfs_root *root = BTRFS_I(inode)->root;
849 struct page *pinned[2];
850 struct page **pages = NULL;
851 struct iov_iter i;
852 loff_t *ppos = &iocb->ki_pos;
822 loff_t start_pos; 853 loff_t start_pos;
823 ssize_t num_written = 0; 854 ssize_t num_written = 0;
824 ssize_t err = 0; 855 ssize_t err = 0;
856 size_t count;
857 size_t ocount;
825 int ret = 0; 858 int ret = 0;
826 struct inode *inode = fdentry(file)->d_inode;
827 struct btrfs_root *root = BTRFS_I(inode)->root;
828 struct page **pages = NULL;
829 int nrptrs; 859 int nrptrs;
830 struct page *pinned[2];
831 unsigned long first_index; 860 unsigned long first_index;
832 unsigned long last_index; 861 unsigned long last_index;
833 int will_write; 862 int will_write;
863 int buffered = 0;
864 int copied = 0;
865 int dirty_pages = 0;
834 866
835 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 867 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
836 (file->f_flags & O_DIRECT)); 868 (file->f_flags & O_DIRECT));
837 869
838 nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
839 PAGE_CACHE_SIZE / (sizeof(struct page *)));
840 pinned[0] = NULL; 870 pinned[0] = NULL;
841 pinned[1] = NULL; 871 pinned[1] = NULL;
842 872
843 pos = *ppos;
844 start_pos = pos; 873 start_pos = pos;
845 874
846 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 875 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
847 876
848 /* do the reserve before the mutex lock in case we have to do some
849 * flushing. We wouldn't deadlock, but this is more polite.
850 */
851 err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
852 if (err)
853 goto out_nolock;
854
855 mutex_lock(&inode->i_mutex); 877 mutex_lock(&inode->i_mutex);
856 878
879 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
880 if (err)
881 goto out;
882 count = ocount;
883
857 current->backing_dev_info = inode->i_mapping->backing_dev_info; 884 current->backing_dev_info = inode->i_mapping->backing_dev_info;
858 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 885 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
859 if (err) 886 if (err)
@@ -866,16 +893,65 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
866 if (err) 893 if (err)
867 goto out; 894 goto out;
868 895
896 /*
897 * If BTRFS flips readonly due to some impossible error
898 * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
899 * although we have opened a file as writable, we have
900 * to stop this write operation to ensure FS consistency.
901 */
902 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
903 err = -EROFS;
904 goto out;
905 }
906
869 file_update_time(file); 907 file_update_time(file);
908 BTRFS_I(inode)->sequence++;
909
910 if (unlikely(file->f_flags & O_DIRECT)) {
911 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
912 pos, ppos, count,
913 ocount);
914 /*
915 * the generic O_DIRECT will update in-memory i_size after the
916 * DIOs are done. But our endio handlers that update the on
917 * disk i_size never update past the in memory i_size. So we
918 * need one more update here to catch any additions to the
919 * file
920 */
921 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
922 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
923 mark_inode_dirty(inode);
924 }
925
926 if (num_written < 0) {
927 ret = num_written;
928 num_written = 0;
929 goto out;
930 } else if (num_written == count) {
931 /* pick up pos changes done by the generic code */
932 pos = *ppos;
933 goto out;
934 }
935 /*
936 * We are going to do buffered for the rest of the range, so we
937 * need to make sure to invalidate the buffered pages when we're
938 * done.
939 */
940 buffered = 1;
941 pos += num_written;
942 }
870 943
944 iov_iter_init(&i, iov, nr_segs, count, num_written);
945 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
946 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
947 (sizeof(struct page *)));
871 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); 948 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
872 949
873 /* generic_write_checks can change our pos */ 950 /* generic_write_checks can change our pos */
874 start_pos = pos; 951 start_pos = pos;
875 952
876 BTRFS_I(inode)->sequence++;
877 first_index = pos >> PAGE_CACHE_SHIFT; 953 first_index = pos >> PAGE_CACHE_SHIFT;
878 last_index = (pos + count) >> PAGE_CACHE_SHIFT; 954 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
879 955
880 /* 956 /*
881 * there are lots of better ways to do this, but this code 957 * there are lots of better ways to do this, but this code
@@ -892,7 +968,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
892 unlock_page(pinned[0]); 968 unlock_page(pinned[0]);
893 } 969 }
894 } 970 }
895 if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { 971 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
896 pinned[1] = grab_cache_page(inode->i_mapping, last_index); 972 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
897 if (!PageUptodate(pinned[1])) { 973 if (!PageUptodate(pinned[1])) {
898 ret = btrfs_readpage(NULL, pinned[1]); 974 ret = btrfs_readpage(NULL, pinned[1]);
@@ -903,10 +979,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
903 } 979 }
904 } 980 }
905 981
906 while (count > 0) { 982 while (iov_iter_count(&i) > 0) {
907 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 983 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
908 size_t write_bytes = min(count, nrptrs * 984 size_t write_bytes = min(iov_iter_count(&i),
909 (size_t)PAGE_CACHE_SIZE - 985 nrptrs * (size_t)PAGE_CACHE_SIZE -
910 offset); 986 offset);
911 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 987 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
912 PAGE_CACHE_SHIFT; 988 PAGE_CACHE_SHIFT;
@@ -914,7 +990,17 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
914 WARN_ON(num_pages > nrptrs); 990 WARN_ON(num_pages > nrptrs);
915 memset(pages, 0, sizeof(struct page *) * nrptrs); 991 memset(pages, 0, sizeof(struct page *) * nrptrs);
916 992
917 ret = btrfs_check_data_free_space(root, inode, write_bytes); 993 /*
994 * Fault pages before locking them in prepare_pages
995 * to avoid recursive lock
996 */
997 if (unlikely(iov_iter_fault_in_readable(&i, write_bytes))) {
998 ret = -EFAULT;
999 goto out;
1000 }
1001
1002 ret = btrfs_delalloc_reserve_space(inode,
1003 num_pages << PAGE_CACHE_SHIFT);
918 if (ret) 1004 if (ret)
919 goto out; 1005 goto out;
920 1006
@@ -922,45 +1008,49 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
922 pos, first_index, last_index, 1008 pos, first_index, last_index,
923 write_bytes); 1009 write_bytes);
924 if (ret) { 1010 if (ret) {
925 btrfs_free_reserved_data_space(root, inode, 1011 btrfs_delalloc_release_space(inode,
926 write_bytes); 1012 num_pages << PAGE_CACHE_SHIFT);
927 goto out; 1013 goto out;
928 } 1014 }
929 1015
930 ret = btrfs_copy_from_user(pos, num_pages, 1016 copied = btrfs_copy_from_user(pos, num_pages,
931 write_bytes, pages, buf); 1017 write_bytes, pages, &i);
932 if (ret) { 1018 dirty_pages = (copied + PAGE_CACHE_SIZE - 1) >>
933 btrfs_free_reserved_data_space(root, inode, 1019 PAGE_CACHE_SHIFT;
934 write_bytes); 1020
935 btrfs_drop_pages(pages, num_pages); 1021 if (num_pages > dirty_pages) {
936 goto out; 1022 if (copied > 0)
1023 atomic_inc(
1024 &BTRFS_I(inode)->outstanding_extents);
1025 btrfs_delalloc_release_space(inode,
1026 (num_pages - dirty_pages) <<
1027 PAGE_CACHE_SHIFT);
937 } 1028 }
938 1029
939 ret = dirty_and_release_pages(NULL, root, file, pages, 1030 if (copied > 0) {
940 num_pages, pos, write_bytes); 1031 dirty_and_release_pages(NULL, root, file, pages,
941 btrfs_drop_pages(pages, num_pages); 1032 dirty_pages, pos, copied);
942 if (ret) {
943 btrfs_free_reserved_data_space(root, inode,
944 write_bytes);
945 goto out;
946 } 1033 }
947 1034
948 if (will_write) { 1035 btrfs_drop_pages(pages, num_pages);
949 filemap_fdatawrite_range(inode->i_mapping, pos, 1036
950 pos + write_bytes - 1); 1037 if (copied > 0) {
951 } else { 1038 if (will_write) {
952 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1039 filemap_fdatawrite_range(inode->i_mapping, pos,
953 num_pages); 1040 pos + copied - 1);
954 if (num_pages < 1041 } else {
955 (root->leafsize >> PAGE_CACHE_SHIFT) + 1) 1042 balance_dirty_pages_ratelimited_nr(
956 btrfs_btree_balance_dirty(root, 1); 1043 inode->i_mapping,
957 btrfs_throttle(root); 1044 dirty_pages);
1045 if (dirty_pages <
1046 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1047 btrfs_btree_balance_dirty(root, 1);
1048 btrfs_throttle(root);
1049 }
958 } 1050 }
959 1051
960 buf += write_bytes; 1052 pos += copied;
961 count -= write_bytes; 1053 num_written += copied;
962 pos += write_bytes;
963 num_written += write_bytes;
964 1054
965 cond_resched(); 1055 cond_resched();
966 } 1056 }
@@ -968,9 +1058,7 @@ out:
968 mutex_unlock(&inode->i_mutex); 1058 mutex_unlock(&inode->i_mutex);
969 if (ret) 1059 if (ret)
970 err = ret; 1060 err = ret;
971 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
972 1061
973out_nolock:
974 kfree(pages); 1062 kfree(pages);
975 if (pinned[0]) 1063 if (pinned[0])
976 page_cache_release(pinned[0]); 1064 page_cache_release(pinned[0]);
@@ -1000,9 +1088,15 @@ out_nolock:
1000 num_written = err; 1088 num_written = err;
1001 1089
1002 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) { 1090 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1003 trans = btrfs_start_transaction(root, 1); 1091 trans = btrfs_start_transaction(root, 0);
1092 if (IS_ERR(trans)) {
1093 num_written = PTR_ERR(trans);
1094 goto done;
1095 }
1096 mutex_lock(&inode->i_mutex);
1004 ret = btrfs_log_dentry_safe(trans, root, 1097 ret = btrfs_log_dentry_safe(trans, root,
1005 file->f_dentry); 1098 file->f_dentry);
1099 mutex_unlock(&inode->i_mutex);
1006 if (ret == 0) { 1100 if (ret == 0) {
1007 ret = btrfs_sync_log(trans, root); 1101 ret = btrfs_sync_log(trans, root);
1008 if (ret == 0) 1102 if (ret == 0)
@@ -1015,12 +1109,13 @@ out_nolock:
1015 btrfs_end_transaction(trans, root); 1109 btrfs_end_transaction(trans, root);
1016 } 1110 }
1017 } 1111 }
1018 if (file->f_flags & O_DIRECT) { 1112 if (file->f_flags & O_DIRECT && buffered) {
1019 invalidate_mapping_pages(inode->i_mapping, 1113 invalidate_mapping_pages(inode->i_mapping,
1020 start_pos >> PAGE_CACHE_SHIFT, 1114 start_pos >> PAGE_CACHE_SHIFT,
1021 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); 1115 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1022 } 1116 }
1023 } 1117 }
1118done:
1024 current->backing_dev_info = NULL; 1119 current->backing_dev_info = NULL;
1025 return num_written ? num_written : err; 1120 return num_written ? num_written : err;
1026} 1121}
@@ -1055,8 +1150,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
1055 * important optimization for directories because holding the mutex prevents 1150 * important optimization for directories because holding the mutex prevents
1056 * new operations on the dir while we write to disk. 1151 * new operations on the dir while we write to disk.
1057 */ 1152 */
1058int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 1153int btrfs_sync_file(struct file *file, int datasync)
1059{ 1154{
1155 struct dentry *dentry = file->f_path.dentry;
1060 struct inode *inode = dentry->d_inode; 1156 struct inode *inode = dentry->d_inode;
1061 struct btrfs_root *root = BTRFS_I(inode)->root; 1157 struct btrfs_root *root = BTRFS_I(inode)->root;
1062 int ret = 0; 1158 int ret = 0;
@@ -1093,12 +1189,12 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1093 /* 1189 /*
1094 * ok we haven't committed the transaction yet, lets do a commit 1190 * ok we haven't committed the transaction yet, lets do a commit
1095 */ 1191 */
1096 if (file && file->private_data) 1192 if (file->private_data)
1097 btrfs_ioctl_trans_end(file); 1193 btrfs_ioctl_trans_end(file);
1098 1194
1099 trans = btrfs_start_transaction(root, 1); 1195 trans = btrfs_start_transaction(root, 0);
1100 if (!trans) { 1196 if (IS_ERR(trans)) {
1101 ret = -ENOMEM; 1197 ret = PTR_ERR(trans);
1102 goto out; 1198 goto out;
1103 } 1199 }
1104 1200
@@ -1133,7 +1229,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1133 } 1229 }
1134 mutex_lock(&dentry->d_inode->i_mutex); 1230 mutex_lock(&dentry->d_inode->i_mutex);
1135out: 1231out:
1136 return ret > 0 ? EIO : ret; 1232 return ret > 0 ? -EIO : ret;
1137} 1233}
1138 1234
1139static const struct vm_operations_struct btrfs_file_vm_ops = { 1235static const struct vm_operations_struct btrfs_file_vm_ops = {
@@ -1143,21 +1239,141 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
1143 1239
1144static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 1240static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1145{ 1241{
1146 vma->vm_ops = &btrfs_file_vm_ops; 1242 struct address_space *mapping = filp->f_mapping;
1243
1244 if (!mapping->a_ops->readpage)
1245 return -ENOEXEC;
1246
1147 file_accessed(filp); 1247 file_accessed(filp);
1248 vma->vm_ops = &btrfs_file_vm_ops;
1249 vma->vm_flags |= VM_CAN_NONLINEAR;
1250
1148 return 0; 1251 return 0;
1149} 1252}
1150 1253
1254static long btrfs_fallocate(struct file *file, int mode,
1255 loff_t offset, loff_t len)
1256{
1257 struct inode *inode = file->f_path.dentry->d_inode;
1258 struct extent_state *cached_state = NULL;
1259 u64 cur_offset;
1260 u64 last_byte;
1261 u64 alloc_start;
1262 u64 alloc_end;
1263 u64 alloc_hint = 0;
1264 u64 locked_end;
1265 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1266 struct extent_map *em;
1267 int ret;
1268
1269 alloc_start = offset & ~mask;
1270 alloc_end = (offset + len + mask) & ~mask;
1271
1272 /* We only support the FALLOC_FL_KEEP_SIZE mode */
1273 if (mode & ~FALLOC_FL_KEEP_SIZE)
1274 return -EOPNOTSUPP;
1275
1276 /*
1277 * wait for ordered IO before we have any locks. We'll loop again
1278 * below with the locks held.
1279 */
1280 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
1281
1282 mutex_lock(&inode->i_mutex);
1283 ret = inode_newsize_ok(inode, alloc_end);
1284 if (ret)
1285 goto out;
1286
1287 if (alloc_start > inode->i_size) {
1288 ret = btrfs_cont_expand(inode, alloc_start);
1289 if (ret)
1290 goto out;
1291 }
1292
1293 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1294 if (ret)
1295 goto out;
1296
1297 locked_end = alloc_end - 1;
1298 while (1) {
1299 struct btrfs_ordered_extent *ordered;
1300
1301 /* the extent lock is ordered inside the running
1302 * transaction
1303 */
1304 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
1305 locked_end, 0, &cached_state, GFP_NOFS);
1306 ordered = btrfs_lookup_first_ordered_extent(inode,
1307 alloc_end - 1);
1308 if (ordered &&
1309 ordered->file_offset + ordered->len > alloc_start &&
1310 ordered->file_offset < alloc_end) {
1311 btrfs_put_ordered_extent(ordered);
1312 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1313 alloc_start, locked_end,
1314 &cached_state, GFP_NOFS);
1315 /*
1316 * we can't wait on the range with the transaction
1317 * running or with the extent lock held
1318 */
1319 btrfs_wait_ordered_range(inode, alloc_start,
1320 alloc_end - alloc_start);
1321 } else {
1322 if (ordered)
1323 btrfs_put_ordered_extent(ordered);
1324 break;
1325 }
1326 }
1327
1328 cur_offset = alloc_start;
1329 while (1) {
1330 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1331 alloc_end - cur_offset, 0);
1332 BUG_ON(IS_ERR(em) || !em);
1333 last_byte = min(extent_map_end(em), alloc_end);
1334 last_byte = (last_byte + mask) & ~mask;
1335 if (em->block_start == EXTENT_MAP_HOLE ||
1336 (cur_offset >= inode->i_size &&
1337 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1338 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1339 last_byte - cur_offset,
1340 1 << inode->i_blkbits,
1341 offset + len,
1342 &alloc_hint);
1343 if (ret < 0) {
1344 free_extent_map(em);
1345 break;
1346 }
1347 }
1348 free_extent_map(em);
1349
1350 cur_offset = last_byte;
1351 if (cur_offset >= alloc_end) {
1352 ret = 0;
1353 break;
1354 }
1355 }
1356 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1357 &cached_state, GFP_NOFS);
1358
1359 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1360out:
1361 mutex_unlock(&inode->i_mutex);
1362 return ret;
1363}
1364
1151const struct file_operations btrfs_file_operations = { 1365const struct file_operations btrfs_file_operations = {
1152 .llseek = generic_file_llseek, 1366 .llseek = generic_file_llseek,
1153 .read = do_sync_read, 1367 .read = do_sync_read,
1368 .write = do_sync_write,
1154 .aio_read = generic_file_aio_read, 1369 .aio_read = generic_file_aio_read,
1155 .splice_read = generic_file_splice_read, 1370 .splice_read = generic_file_splice_read,
1156 .write = btrfs_file_write, 1371 .aio_write = btrfs_file_aio_write,
1157 .mmap = btrfs_file_mmap, 1372 .mmap = btrfs_file_mmap,
1158 .open = generic_file_open, 1373 .open = generic_file_open,
1159 .release = btrfs_release_file, 1374 .release = btrfs_release_file,
1160 .fsync = btrfs_sync_file, 1375 .fsync = btrfs_sync_file,
1376 .fallocate = btrfs_fallocate,
1161 .unlocked_ioctl = btrfs_ioctl, 1377 .unlocked_ioctl = btrfs_ioctl,
1162#ifdef CONFIG_COMPAT 1378#ifdef CONFIG_COMPAT
1163 .compat_ioctl = btrfs_ioctl, 1379 .compat_ioctl = btrfs_ioctl,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cb2849f03251..60d684266959 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -18,14 +18,768 @@
18 18
19#include <linux/pagemap.h> 19#include <linux/pagemap.h>
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h>
21#include <linux/math64.h> 22#include <linux/math64.h>
22#include "ctree.h" 23#include "ctree.h"
23#include "free-space-cache.h" 24#include "free-space-cache.h"
24#include "transaction.h" 25#include "transaction.h"
26#include "disk-io.h"
25 27
26#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 28#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
27#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 29#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
28 30
31static void recalculate_thresholds(struct btrfs_block_group_cache
32 *block_group);
33static int link_free_space(struct btrfs_block_group_cache *block_group,
34 struct btrfs_free_space *info);
35
36struct inode *lookup_free_space_inode(struct btrfs_root *root,
37 struct btrfs_block_group_cache
38 *block_group, struct btrfs_path *path)
39{
40 struct btrfs_key key;
41 struct btrfs_key location;
42 struct btrfs_disk_key disk_key;
43 struct btrfs_free_space_header *header;
44 struct extent_buffer *leaf;
45 struct inode *inode = NULL;
46 int ret;
47
48 spin_lock(&block_group->lock);
49 if (block_group->inode)
50 inode = igrab(block_group->inode);
51 spin_unlock(&block_group->lock);
52 if (inode)
53 return inode;
54
55 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
56 key.offset = block_group->key.objectid;
57 key.type = 0;
58
59 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
60 if (ret < 0)
61 return ERR_PTR(ret);
62 if (ret > 0) {
63 btrfs_release_path(root, path);
64 return ERR_PTR(-ENOENT);
65 }
66
67 leaf = path->nodes[0];
68 header = btrfs_item_ptr(leaf, path->slots[0],
69 struct btrfs_free_space_header);
70 btrfs_free_space_key(leaf, header, &disk_key);
71 btrfs_disk_key_to_cpu(&location, &disk_key);
72 btrfs_release_path(root, path);
73
74 inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
75 if (!inode)
76 return ERR_PTR(-ENOENT);
77 if (IS_ERR(inode))
78 return inode;
79 if (is_bad_inode(inode)) {
80 iput(inode);
81 return ERR_PTR(-ENOENT);
82 }
83
84 spin_lock(&block_group->lock);
85 if (!root->fs_info->closing) {
86 block_group->inode = igrab(inode);
87 block_group->iref = 1;
88 }
89 spin_unlock(&block_group->lock);
90
91 return inode;
92}
93
94int create_free_space_inode(struct btrfs_root *root,
95 struct btrfs_trans_handle *trans,
96 struct btrfs_block_group_cache *block_group,
97 struct btrfs_path *path)
98{
99 struct btrfs_key key;
100 struct btrfs_disk_key disk_key;
101 struct btrfs_free_space_header *header;
102 struct btrfs_inode_item *inode_item;
103 struct extent_buffer *leaf;
104 u64 objectid;
105 int ret;
106
107 ret = btrfs_find_free_objectid(trans, root, 0, &objectid);
108 if (ret < 0)
109 return ret;
110
111 ret = btrfs_insert_empty_inode(trans, root, path, objectid);
112 if (ret)
113 return ret;
114
115 leaf = path->nodes[0];
116 inode_item = btrfs_item_ptr(leaf, path->slots[0],
117 struct btrfs_inode_item);
118 btrfs_item_key(leaf, &disk_key, path->slots[0]);
119 memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
120 sizeof(*inode_item));
121 btrfs_set_inode_generation(leaf, inode_item, trans->transid);
122 btrfs_set_inode_size(leaf, inode_item, 0);
123 btrfs_set_inode_nbytes(leaf, inode_item, 0);
124 btrfs_set_inode_uid(leaf, inode_item, 0);
125 btrfs_set_inode_gid(leaf, inode_item, 0);
126 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
127 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
128 BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
129 btrfs_set_inode_nlink(leaf, inode_item, 1);
130 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
131 btrfs_set_inode_block_group(leaf, inode_item,
132 block_group->key.objectid);
133 btrfs_mark_buffer_dirty(leaf);
134 btrfs_release_path(root, path);
135
136 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
137 key.offset = block_group->key.objectid;
138 key.type = 0;
139
140 ret = btrfs_insert_empty_item(trans, root, path, &key,
141 sizeof(struct btrfs_free_space_header));
142 if (ret < 0) {
143 btrfs_release_path(root, path);
144 return ret;
145 }
146 leaf = path->nodes[0];
147 header = btrfs_item_ptr(leaf, path->slots[0],
148 struct btrfs_free_space_header);
149 memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
150 btrfs_set_free_space_key(leaf, header, &disk_key);
151 btrfs_mark_buffer_dirty(leaf);
152 btrfs_release_path(root, path);
153
154 return 0;
155}
156
157int btrfs_truncate_free_space_cache(struct btrfs_root *root,
158 struct btrfs_trans_handle *trans,
159 struct btrfs_path *path,
160 struct inode *inode)
161{
162 loff_t oldsize;
163 int ret = 0;
164
165 trans->block_rsv = root->orphan_block_rsv;
166 ret = btrfs_block_rsv_check(trans, root,
167 root->orphan_block_rsv,
168 0, 5);
169 if (ret)
170 return ret;
171
172 oldsize = i_size_read(inode);
173 btrfs_i_size_write(inode, 0);
174 truncate_pagecache(inode, oldsize, 0);
175
176 /*
177 * We don't need an orphan item because truncating the free space cache
178 * will never be split across transactions.
179 */
180 ret = btrfs_truncate_inode_items(trans, root, inode,
181 0, BTRFS_EXTENT_DATA_KEY);
182 if (ret) {
183 WARN_ON(1);
184 return ret;
185 }
186
187 return btrfs_update_inode(trans, root, inode);
188}
189
190static int readahead_cache(struct inode *inode)
191{
192 struct file_ra_state *ra;
193 unsigned long last_index;
194
195 ra = kzalloc(sizeof(*ra), GFP_NOFS);
196 if (!ra)
197 return -ENOMEM;
198
199 file_ra_state_init(ra, inode->i_mapping);
200 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
201
202 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
203
204 kfree(ra);
205
206 return 0;
207}
208
209int load_free_space_cache(struct btrfs_fs_info *fs_info,
210 struct btrfs_block_group_cache *block_group)
211{
212 struct btrfs_root *root = fs_info->tree_root;
213 struct inode *inode;
214 struct btrfs_free_space_header *header;
215 struct extent_buffer *leaf;
216 struct page *page;
217 struct btrfs_path *path;
218 u32 *checksums = NULL, *crc;
219 char *disk_crcs = NULL;
220 struct btrfs_key key;
221 struct list_head bitmaps;
222 u64 num_entries;
223 u64 num_bitmaps;
224 u64 generation;
225 u32 cur_crc = ~(u32)0;
226 pgoff_t index = 0;
227 unsigned long first_page_offset;
228 int num_checksums;
229 int ret = 0;
230
231 /*
232 * If we're unmounting then just return, since this does a search on the
233 * normal root and not the commit root and we could deadlock.
234 */
235 smp_mb();
236 if (fs_info->closing)
237 return 0;
238
239 /*
240 * If this block group has been marked to be cleared for one reason or
241 * another then we can't trust the on disk cache, so just return.
242 */
243 spin_lock(&block_group->lock);
244 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
245 spin_unlock(&block_group->lock);
246 return 0;
247 }
248 spin_unlock(&block_group->lock);
249
250 INIT_LIST_HEAD(&bitmaps);
251
252 path = btrfs_alloc_path();
253 if (!path)
254 return 0;
255
256 inode = lookup_free_space_inode(root, block_group, path);
257 if (IS_ERR(inode)) {
258 btrfs_free_path(path);
259 return 0;
260 }
261
262 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) {
264 btrfs_free_path(path);
265 goto out;
266 }
267
268 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
269 key.offset = block_group->key.objectid;
270 key.type = 0;
271
272 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
273 if (ret) {
274 btrfs_free_path(path);
275 goto out;
276 }
277
278 leaf = path->nodes[0];
279 header = btrfs_item_ptr(leaf, path->slots[0],
280 struct btrfs_free_space_header);
281 num_entries = btrfs_free_space_entries(leaf, header);
282 num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
283 generation = btrfs_free_space_generation(leaf, header);
284 btrfs_free_path(path);
285
286 if (BTRFS_I(inode)->generation != generation) {
287 printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
288 " not match free space cache generation (%llu) for "
289 "block group %llu\n",
290 (unsigned long long)BTRFS_I(inode)->generation,
291 (unsigned long long)generation,
292 (unsigned long long)block_group->key.objectid);
293 goto free_cache;
294 }
295
296 if (!num_entries)
297 goto out;
298
299 /* Setup everything for doing checksumming */
300 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
301 checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
302 if (!checksums)
303 goto out;
304 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
305 disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
306 if (!disk_crcs)
307 goto out;
308
309 ret = readahead_cache(inode);
310 if (ret) {
311 ret = 0;
312 goto out;
313 }
314
315 while (1) {
316 struct btrfs_free_space_entry *entry;
317 struct btrfs_free_space *e;
318 void *addr;
319 unsigned long offset = 0;
320 unsigned long start_offset = 0;
321 int need_loop = 0;
322
323 if (!num_entries && !num_bitmaps)
324 break;
325
326 if (index == 0) {
327 start_offset = first_page_offset;
328 offset = start_offset;
329 }
330
331 page = grab_cache_page(inode->i_mapping, index);
332 if (!page) {
333 ret = 0;
334 goto free_cache;
335 }
336
337 if (!PageUptodate(page)) {
338 btrfs_readpage(NULL, page);
339 lock_page(page);
340 if (!PageUptodate(page)) {
341 unlock_page(page);
342 page_cache_release(page);
343 printk(KERN_ERR "btrfs: error reading free "
344 "space cache: %llu\n",
345 (unsigned long long)
346 block_group->key.objectid);
347 goto free_cache;
348 }
349 }
350 addr = kmap(page);
351
352 if (index == 0) {
353 u64 *gen;
354
355 memcpy(disk_crcs, addr, first_page_offset);
356 gen = addr + (sizeof(u32) * num_checksums);
357 if (*gen != BTRFS_I(inode)->generation) {
358 printk(KERN_ERR "btrfs: space cache generation"
359 " (%llu) does not match inode (%llu) "
360 "for block group %llu\n",
361 (unsigned long long)*gen,
362 (unsigned long long)
363 BTRFS_I(inode)->generation,
364 (unsigned long long)
365 block_group->key.objectid);
366 kunmap(page);
367 unlock_page(page);
368 page_cache_release(page);
369 goto free_cache;
370 }
371 crc = (u32 *)disk_crcs;
372 }
373 entry = addr + start_offset;
374
375 /* First lets check our crc before we do anything fun */
376 cur_crc = ~(u32)0;
377 cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
378 PAGE_CACHE_SIZE - start_offset);
379 btrfs_csum_final(cur_crc, (char *)&cur_crc);
380 if (cur_crc != *crc) {
381 printk(KERN_ERR "btrfs: crc mismatch for page %lu in "
382 "block group %llu\n", index,
383 (unsigned long long)block_group->key.objectid);
384 kunmap(page);
385 unlock_page(page);
386 page_cache_release(page);
387 goto free_cache;
388 }
389 crc++;
390
391 while (1) {
392 if (!num_entries)
393 break;
394
395 need_loop = 1;
396 e = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
397 if (!e) {
398 kunmap(page);
399 unlock_page(page);
400 page_cache_release(page);
401 goto free_cache;
402 }
403
404 e->offset = le64_to_cpu(entry->offset);
405 e->bytes = le64_to_cpu(entry->bytes);
406 if (!e->bytes) {
407 kunmap(page);
408 kfree(e);
409 unlock_page(page);
410 page_cache_release(page);
411 goto free_cache;
412 }
413
414 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
415 spin_lock(&block_group->tree_lock);
416 ret = link_free_space(block_group, e);
417 spin_unlock(&block_group->tree_lock);
418 BUG_ON(ret);
419 } else {
420 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
421 if (!e->bitmap) {
422 kunmap(page);
423 kfree(e);
424 unlock_page(page);
425 page_cache_release(page);
426 goto free_cache;
427 }
428 spin_lock(&block_group->tree_lock);
429 ret = link_free_space(block_group, e);
430 block_group->total_bitmaps++;
431 recalculate_thresholds(block_group);
432 spin_unlock(&block_group->tree_lock);
433 list_add_tail(&e->list, &bitmaps);
434 }
435
436 num_entries--;
437 offset += sizeof(struct btrfs_free_space_entry);
438 if (offset + sizeof(struct btrfs_free_space_entry) >=
439 PAGE_CACHE_SIZE)
440 break;
441 entry++;
442 }
443
444 /*
445 * We read an entry out of this page, we need to move on to the
446 * next page.
447 */
448 if (need_loop) {
449 kunmap(page);
450 goto next;
451 }
452
453 /*
454 * We add the bitmaps at the end of the entries in order that
455 * the bitmap entries are added to the cache.
456 */
457 e = list_entry(bitmaps.next, struct btrfs_free_space, list);
458 list_del_init(&e->list);
459 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
460 kunmap(page);
461 num_bitmaps--;
462next:
463 unlock_page(page);
464 page_cache_release(page);
465 index++;
466 }
467
468 ret = 1;
469out:
470 kfree(checksums);
471 kfree(disk_crcs);
472 iput(inode);
473 return ret;
474
475free_cache:
476 /* This cache is bogus, make sure it gets cleared */
477 spin_lock(&block_group->lock);
478 block_group->disk_cache_state = BTRFS_DC_CLEAR;
479 spin_unlock(&block_group->lock);
480 btrfs_remove_free_space_cache(block_group);
481 goto out;
482}
483
484int btrfs_write_out_cache(struct btrfs_root *root,
485 struct btrfs_trans_handle *trans,
486 struct btrfs_block_group_cache *block_group,
487 struct btrfs_path *path)
488{
489 struct btrfs_free_space_header *header;
490 struct extent_buffer *leaf;
491 struct inode *inode;
492 struct rb_node *node;
493 struct list_head *pos, *n;
494 struct page *page;
495 struct extent_state *cached_state = NULL;
496 struct list_head bitmap_list;
497 struct btrfs_key key;
498 u64 bytes = 0;
499 u32 *crc, *checksums;
500 pgoff_t index = 0, last_index = 0;
501 unsigned long first_page_offset;
502 int num_checksums;
503 int entries = 0;
504 int bitmaps = 0;
505 int ret = 0;
506
507 root = root->fs_info->tree_root;
508
509 INIT_LIST_HEAD(&bitmap_list);
510
511 spin_lock(&block_group->lock);
512 if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
513 spin_unlock(&block_group->lock);
514 return 0;
515 }
516 spin_unlock(&block_group->lock);
517
518 inode = lookup_free_space_inode(root, block_group, path);
519 if (IS_ERR(inode))
520 return 0;
521
522 if (!i_size_read(inode)) {
523 iput(inode);
524 return 0;
525 }
526
527 node = rb_first(&block_group->free_space_offset);
528 if (!node) {
529 iput(inode);
530 return 0;
531 }
532
533 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
534 filemap_write_and_wait(inode->i_mapping);
535 btrfs_wait_ordered_range(inode, inode->i_size &
536 ~(root->sectorsize - 1), (u64)-1);
537
538 /* We need a checksum per page. */
539 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
540 crc = checksums = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
541 if (!crc) {
542 iput(inode);
543 return 0;
544 }
545
546 /* Since the first page has all of our checksums and our generation we
547 * need to calculate the offset into the page that we can start writing
548 * our entries.
549 */
550 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
551
552 /*
553 * Lock all pages first so we can lock the extent safely.
554 *
555 * NOTE: Because we hold the ref the entire time we're going to write to
556 * the page find_get_page should never fail, so we don't do a check
557 * after find_get_page at this point. Just putting this here so people
558 * know and don't freak out.
559 */
560 while (index <= last_index) {
561 page = grab_cache_page(inode->i_mapping, index);
562 if (!page) {
563 pgoff_t i = 0;
564
565 while (i < index) {
566 page = find_get_page(inode->i_mapping, i);
567 unlock_page(page);
568 page_cache_release(page);
569 page_cache_release(page);
570 i++;
571 }
572 goto out_free;
573 }
574 index++;
575 }
576
577 index = 0;
578 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
579 0, &cached_state, GFP_NOFS);
580
581 /* Write out the extent entries */
582 do {
583 struct btrfs_free_space_entry *entry;
584 void *addr;
585 unsigned long offset = 0;
586 unsigned long start_offset = 0;
587
588 if (index == 0) {
589 start_offset = first_page_offset;
590 offset = start_offset;
591 }
592
593 page = find_get_page(inode->i_mapping, index);
594
595 addr = kmap(page);
596 entry = addr + start_offset;
597
598 memset(addr, 0, PAGE_CACHE_SIZE);
599 while (1) {
600 struct btrfs_free_space *e;
601
602 e = rb_entry(node, struct btrfs_free_space, offset_index);
603 entries++;
604
605 entry->offset = cpu_to_le64(e->offset);
606 entry->bytes = cpu_to_le64(e->bytes);
607 if (e->bitmap) {
608 entry->type = BTRFS_FREE_SPACE_BITMAP;
609 list_add_tail(&e->list, &bitmap_list);
610 bitmaps++;
611 } else {
612 entry->type = BTRFS_FREE_SPACE_EXTENT;
613 }
614 node = rb_next(node);
615 if (!node)
616 break;
617 offset += sizeof(struct btrfs_free_space_entry);
618 if (offset + sizeof(struct btrfs_free_space_entry) >=
619 PAGE_CACHE_SIZE)
620 break;
621 entry++;
622 }
623 *crc = ~(u32)0;
624 *crc = btrfs_csum_data(root, addr + start_offset, *crc,
625 PAGE_CACHE_SIZE - start_offset);
626 kunmap(page);
627
628 btrfs_csum_final(*crc, (char *)crc);
629 crc++;
630
631 bytes += PAGE_CACHE_SIZE;
632
633 ClearPageChecked(page);
634 set_page_extent_mapped(page);
635 SetPageUptodate(page);
636 set_page_dirty(page);
637
638 /*
639 * We need to release our reference we got for grab_cache_page,
640 * except for the first page which will hold our checksums, we
641 * do that below.
642 */
643 if (index != 0) {
644 unlock_page(page);
645 page_cache_release(page);
646 }
647
648 page_cache_release(page);
649
650 index++;
651 } while (node);
652
653 /* Write out the bitmaps */
654 list_for_each_safe(pos, n, &bitmap_list) {
655 void *addr;
656 struct btrfs_free_space *entry =
657 list_entry(pos, struct btrfs_free_space, list);
658
659 page = find_get_page(inode->i_mapping, index);
660
661 addr = kmap(page);
662 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
663 *crc = ~(u32)0;
664 *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
665 kunmap(page);
666 btrfs_csum_final(*crc, (char *)crc);
667 crc++;
668 bytes += PAGE_CACHE_SIZE;
669
670 ClearPageChecked(page);
671 set_page_extent_mapped(page);
672 SetPageUptodate(page);
673 set_page_dirty(page);
674 unlock_page(page);
675 page_cache_release(page);
676 page_cache_release(page);
677 list_del_init(&entry->list);
678 index++;
679 }
680
681 /* Zero out the rest of the pages just to make sure */
682 while (index <= last_index) {
683 void *addr;
684
685 page = find_get_page(inode->i_mapping, index);
686
687 addr = kmap(page);
688 memset(addr, 0, PAGE_CACHE_SIZE);
689 kunmap(page);
690 ClearPageChecked(page);
691 set_page_extent_mapped(page);
692 SetPageUptodate(page);
693 set_page_dirty(page);
694 unlock_page(page);
695 page_cache_release(page);
696 page_cache_release(page);
697 bytes += PAGE_CACHE_SIZE;
698 index++;
699 }
700
701 btrfs_set_extent_delalloc(inode, 0, bytes - 1, &cached_state);
702
703 /* Write the checksums and trans id to the first page */
704 {
705 void *addr;
706 u64 *gen;
707
708 page = find_get_page(inode->i_mapping, 0);
709
710 addr = kmap(page);
711 memcpy(addr, checksums, sizeof(u32) * num_checksums);
712 gen = addr + (sizeof(u32) * num_checksums);
713 *gen = trans->transid;
714 kunmap(page);
715 ClearPageChecked(page);
716 set_page_extent_mapped(page);
717 SetPageUptodate(page);
718 set_page_dirty(page);
719 unlock_page(page);
720 page_cache_release(page);
721 page_cache_release(page);
722 }
723 BTRFS_I(inode)->generation = trans->transid;
724
725 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
726 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
727
728 filemap_write_and_wait(inode->i_mapping);
729
730 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
731 key.offset = block_group->key.objectid;
732 key.type = 0;
733
734 ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
735 if (ret < 0) {
736 ret = 0;
737 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
738 EXTENT_DIRTY | EXTENT_DELALLOC |
739 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
740 goto out_free;
741 }
742 leaf = path->nodes[0];
743 if (ret > 0) {
744 struct btrfs_key found_key;
745 BUG_ON(!path->slots[0]);
746 path->slots[0]--;
747 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
748 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
749 found_key.offset != block_group->key.objectid) {
750 ret = 0;
751 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
752 EXTENT_DIRTY | EXTENT_DELALLOC |
753 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
754 GFP_NOFS);
755 btrfs_release_path(root, path);
756 goto out_free;
757 }
758 }
759 header = btrfs_item_ptr(leaf, path->slots[0],
760 struct btrfs_free_space_header);
761 btrfs_set_free_space_entries(leaf, header, entries);
762 btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
763 btrfs_set_free_space_generation(leaf, header, trans->transid);
764 btrfs_mark_buffer_dirty(leaf);
765 btrfs_release_path(root, path);
766
767 ret = 1;
768
769out_free:
770 if (ret == 0) {
771 invalidate_inode_pages2_range(inode->i_mapping, 0, index);
772 spin_lock(&block_group->lock);
773 block_group->disk_cache_state = BTRFS_DC_ERROR;
774 spin_unlock(&block_group->lock);
775 BTRFS_I(inode)->generation = 0;
776 }
777 kfree(checksums);
778 btrfs_update_inode(trans, root, inode);
779 iput(inode);
780 return ret;
781}
782
29static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, 783static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
30 u64 offset) 784 u64 offset)
31{ 785{
@@ -870,7 +1624,7 @@ __btrfs_return_cluster_to_free_space(
870 tree_insert_offset(&block_group->free_space_offset, 1624 tree_insert_offset(&block_group->free_space_offset,
871 entry->offset, &entry->offset_index, 0); 1625 entry->offset, &entry->offset_index, 0);
872 } 1626 }
873 cluster->root.rb_node = NULL; 1627 cluster->root = RB_ROOT;
874 1628
875out: 1629out:
876 spin_unlock(&cluster->lock); 1630 spin_unlock(&cluster->lock);
@@ -1355,7 +2109,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
1355{ 2109{
1356 spin_lock_init(&cluster->lock); 2110 spin_lock_init(&cluster->lock);
1357 spin_lock_init(&cluster->refill_lock); 2111 spin_lock_init(&cluster->refill_lock);
1358 cluster->root.rb_node = NULL; 2112 cluster->root = RB_ROOT;
1359 cluster->max_size = 0; 2113 cluster->max_size = 0;
1360 cluster->points_to_bitmap = false; 2114 cluster->points_to_bitmap = false;
1361 INIT_LIST_HEAD(&cluster->block_group_list); 2115 INIT_LIST_HEAD(&cluster->block_group_list);
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 890a8e79011b..e49ca5c321b5 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -27,6 +27,24 @@ struct btrfs_free_space {
27 struct list_head list; 27 struct list_head list;
28}; 28};
29 29
30struct inode *lookup_free_space_inode(struct btrfs_root *root,
31 struct btrfs_block_group_cache
32 *block_group, struct btrfs_path *path);
33int create_free_space_inode(struct btrfs_root *root,
34 struct btrfs_trans_handle *trans,
35 struct btrfs_block_group_cache *block_group,
36 struct btrfs_path *path);
37
38int btrfs_truncate_free_space_cache(struct btrfs_root *root,
39 struct btrfs_trans_handle *trans,
40 struct btrfs_path *path,
41 struct inode *inode);
42int load_free_space_cache(struct btrfs_fs_info *fs_info,
43 struct btrfs_block_group_cache *block_group);
44int btrfs_write_out_cache(struct btrfs_root *root,
45 struct btrfs_trans_handle *trans,
46 struct btrfs_block_group_cache *block_group,
47 struct btrfs_path *path);
30int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 48int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
31 u64 bytenr, u64 size); 49 u64 bytenr, u64 size);
32int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 50int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
49 return 0; 49 return 0;
50} 50}
51 51
52struct btrfs_inode_ref *
53btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
54 struct btrfs_root *root,
55 struct btrfs_path *path,
56 const char *name, int name_len,
57 u64 inode_objectid, u64 ref_objectid, int mod)
58{
59 struct btrfs_key key;
60 struct btrfs_inode_ref *ref;
61 int ins_len = mod < 0 ? -1 : 0;
62 int cow = mod != 0;
63 int ret;
64
65 key.objectid = inode_objectid;
66 key.type = BTRFS_INODE_REF_KEY;
67 key.offset = ref_objectid;
68
69 ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
70 if (ret < 0)
71 return ERR_PTR(ret);
72 if (ret > 0)
73 return NULL;
74 if (!find_name_in_backref(path, name, name_len, &ref))
75 return NULL;
76 return ref;
77}
78
52int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, 79int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
53 struct btrfs_root *root, 80 struct btrfs_root *root,
54 const char *name, int name_len, 81 const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8cd109972fa6..160b55b3e132 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -36,6 +36,7 @@
36#include <linux/xattr.h> 36#include <linux/xattr.h>
37#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h>
39#include "compat.h" 40#include "compat.h"
40#include "ctree.h" 41#include "ctree.h"
41#include "disk-io.h" 42#include "disk-io.h"
@@ -121,10 +122,10 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
121 size_t cur_size = size; 122 size_t cur_size = size;
122 size_t datasize; 123 size_t datasize;
123 unsigned long offset; 124 unsigned long offset;
124 int use_compress = 0; 125 int compress_type = BTRFS_COMPRESS_NONE;
125 126
126 if (compressed_size && compressed_pages) { 127 if (compressed_size && compressed_pages) {
127 use_compress = 1; 128 compress_type = root->fs_info->compress_type;
128 cur_size = compressed_size; 129 cur_size = compressed_size;
129 } 130 }
130 131
@@ -158,7 +159,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
158 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 159 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
159 ptr = btrfs_file_extent_inline_start(ei); 160 ptr = btrfs_file_extent_inline_start(ei);
160 161
161 if (use_compress) { 162 if (compress_type != BTRFS_COMPRESS_NONE) {
162 struct page *cpage; 163 struct page *cpage;
163 int i = 0; 164 int i = 0;
164 while (compressed_size > 0) { 165 while (compressed_size > 0) {
@@ -175,7 +176,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
175 compressed_size -= cur_size; 176 compressed_size -= cur_size;
176 } 177 }
177 btrfs_set_file_extent_compression(leaf, ei, 178 btrfs_set_file_extent_compression(leaf, ei,
178 BTRFS_COMPRESS_ZLIB); 179 compress_type);
179 } else { 180 } else {
180 page = find_get_page(inode->i_mapping, 181 page = find_get_page(inode->i_mapping,
181 start >> PAGE_CACHE_SHIFT); 182 start >> PAGE_CACHE_SHIFT);
@@ -251,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
251 inline_len, compressed_size, 252 inline_len, compressed_size,
252 compressed_pages); 253 compressed_pages);
253 BUG_ON(ret); 254 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start);
254 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
255 return 0; 257 return 0;
256} 258}
@@ -261,6 +263,7 @@ struct async_extent {
261 u64 compressed_size; 263 u64 compressed_size;
262 struct page **pages; 264 struct page **pages;
263 unsigned long nr_pages; 265 unsigned long nr_pages;
266 int compress_type;
264 struct list_head list; 267 struct list_head list;
265}; 268};
266 269
@@ -278,7 +281,8 @@ static noinline int add_async_extent(struct async_cow *cow,
278 u64 start, u64 ram_size, 281 u64 start, u64 ram_size,
279 u64 compressed_size, 282 u64 compressed_size,
280 struct page **pages, 283 struct page **pages,
281 unsigned long nr_pages) 284 unsigned long nr_pages,
285 int compress_type)
282{ 286{
283 struct async_extent *async_extent; 287 struct async_extent *async_extent;
284 288
@@ -288,6 +292,7 @@ static noinline int add_async_extent(struct async_cow *cow,
288 async_extent->compressed_size = compressed_size; 292 async_extent->compressed_size = compressed_size;
289 async_extent->pages = pages; 293 async_extent->pages = pages;
290 async_extent->nr_pages = nr_pages; 294 async_extent->nr_pages = nr_pages;
295 async_extent->compress_type = compress_type;
291 list_add_tail(&async_extent->list, &cow->extents); 296 list_add_tail(&async_extent->list, &cow->extents);
292 return 0; 297 return 0;
293} 298}
@@ -317,8 +322,6 @@ static noinline int compress_file_range(struct inode *inode,
317 struct btrfs_root *root = BTRFS_I(inode)->root; 322 struct btrfs_root *root = BTRFS_I(inode)->root;
318 struct btrfs_trans_handle *trans; 323 struct btrfs_trans_handle *trans;
319 u64 num_bytes; 324 u64 num_bytes;
320 u64 orig_start;
321 u64 disk_num_bytes;
322 u64 blocksize = root->sectorsize; 325 u64 blocksize = root->sectorsize;
323 u64 actual_end; 326 u64 actual_end;
324 u64 isize = i_size_read(inode); 327 u64 isize = i_size_read(inode);
@@ -332,8 +335,7 @@ static noinline int compress_file_range(struct inode *inode,
332 unsigned long max_uncompressed = 128 * 1024; 335 unsigned long max_uncompressed = 128 * 1024;
333 int i; 336 int i;
334 int will_compress; 337 int will_compress;
335 338 int compress_type = root->fs_info->compress_type;
336 orig_start = start;
337 339
338 actual_end = min_t(u64, isize, end + 1); 340 actual_end = min_t(u64, isize, end + 1);
339again: 341again:
@@ -369,7 +371,6 @@ again:
369 total_compressed = min(total_compressed, max_uncompressed); 371 total_compressed = min(total_compressed, max_uncompressed);
370 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 372 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
371 num_bytes = max(blocksize, num_bytes); 373 num_bytes = max(blocksize, num_bytes);
372 disk_num_bytes = num_bytes;
373 total_in = 0; 374 total_in = 0;
374 ret = 0; 375 ret = 0;
375 376
@@ -379,16 +380,21 @@ again:
379 * change at any time if we discover bad compression ratios. 380 * change at any time if we discover bad compression ratios.
380 */ 381 */
381 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 382 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
382 btrfs_test_opt(root, COMPRESS)) { 383 (btrfs_test_opt(root, COMPRESS) ||
384 (BTRFS_I(inode)->force_compress))) {
383 WARN_ON(pages); 385 WARN_ON(pages);
384 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 386 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
385 387
386 ret = btrfs_zlib_compress_pages(inode->i_mapping, start, 388 if (BTRFS_I(inode)->force_compress)
387 total_compressed, pages, 389 compress_type = BTRFS_I(inode)->force_compress;
388 nr_pages, &nr_pages_ret, 390
389 &total_in, 391 ret = btrfs_compress_pages(compress_type,
390 &total_compressed, 392 inode->i_mapping, start,
391 max_compressed); 393 total_compressed, pages,
394 nr_pages, &nr_pages_ret,
395 &total_in,
396 &total_compressed,
397 max_compressed);
392 398
393 if (!ret) { 399 if (!ret) {
394 unsigned long offset = total_compressed & 400 unsigned long offset = total_compressed &
@@ -412,6 +418,7 @@ again:
412 trans = btrfs_join_transaction(root, 1); 418 trans = btrfs_join_transaction(root, 1);
413 BUG_ON(!trans); 419 BUG_ON(!trans);
414 btrfs_set_trans_block_group(trans, inode); 420 btrfs_set_trans_block_group(trans, inode);
421 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
415 422
416 /* lets try to make an inline extent */ 423 /* lets try to make an inline extent */
417 if (ret || total_in < (actual_end - start)) { 424 if (ret || total_in < (actual_end - start)) {
@@ -437,7 +444,6 @@ again:
437 start, end, NULL, 444 start, end, NULL,
438 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 445 EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
439 EXTENT_CLEAR_DELALLOC | 446 EXTENT_CLEAR_DELALLOC |
440 EXTENT_CLEAR_ACCOUNTING |
441 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); 447 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
442 448
443 btrfs_end_transaction(trans, root); 449 btrfs_end_transaction(trans, root);
@@ -464,7 +470,6 @@ again:
464 if (total_compressed >= total_in) { 470 if (total_compressed >= total_in) {
465 will_compress = 0; 471 will_compress = 0;
466 } else { 472 } else {
467 disk_num_bytes = total_compressed;
468 num_bytes = total_in; 473 num_bytes = total_in;
469 } 474 }
470 } 475 }
@@ -483,8 +488,10 @@ again:
483 nr_pages_ret = 0; 488 nr_pages_ret = 0;
484 489
485 /* flag the file so we don't compress in the future */ 490 /* flag the file so we don't compress in the future */
486 if (!btrfs_test_opt(root, FORCE_COMPRESS)) 491 if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
492 !(BTRFS_I(inode)->force_compress)) {
487 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; 493 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
494 }
488 } 495 }
489 if (will_compress) { 496 if (will_compress) {
490 *num_added += 1; 497 *num_added += 1;
@@ -494,9 +501,10 @@ again:
494 * and will submit them to the elevator. 501 * and will submit them to the elevator.
495 */ 502 */
496 add_async_extent(async_cow, start, num_bytes, 503 add_async_extent(async_cow, start, num_bytes,
497 total_compressed, pages, nr_pages_ret); 504 total_compressed, pages, nr_pages_ret,
505 compress_type);
498 506
499 if (start + num_bytes < end && start + num_bytes < actual_end) { 507 if (start + num_bytes < end) {
500 start += num_bytes; 508 start += num_bytes;
501 pages = NULL; 509 pages = NULL;
502 cond_resched(); 510 cond_resched();
@@ -516,7 +524,8 @@ cleanup_and_bail_uncompressed:
516 __set_page_dirty_nobuffers(locked_page); 524 __set_page_dirty_nobuffers(locked_page);
517 /* unlocked later on in the async handlers */ 525 /* unlocked later on in the async handlers */
518 } 526 }
519 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); 527 add_async_extent(async_cow, start, end - start + 1,
528 0, NULL, 0, BTRFS_COMPRESS_NONE);
520 *num_added += 1; 529 *num_added += 1;
521 } 530 }
522 531
@@ -570,8 +579,8 @@ retry:
570 unsigned long nr_written = 0; 579 unsigned long nr_written = 0;
571 580
572 lock_extent(io_tree, async_extent->start, 581 lock_extent(io_tree, async_extent->start,
573 async_extent->start + 582 async_extent->start +
574 async_extent->ram_size - 1, GFP_NOFS); 583 async_extent->ram_size - 1, GFP_NOFS);
575 584
576 /* allocate blocks */ 585 /* allocate blocks */
577 ret = cow_file_range(inode, async_cow->locked_page, 586 ret = cow_file_range(inode, async_cow->locked_page,
@@ -641,6 +650,7 @@ retry:
641 em->block_start = ins.objectid; 650 em->block_start = ins.objectid;
642 em->block_len = ins.offset; 651 em->block_len = ins.offset;
643 em->bdev = root->fs_info->fs_devices->latest_bdev; 652 em->bdev = root->fs_info->fs_devices->latest_bdev;
653 em->compress_type = async_extent->compress_type;
644 set_bit(EXTENT_FLAG_PINNED, &em->flags); 654 set_bit(EXTENT_FLAG_PINNED, &em->flags);
645 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 655 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
646 656
@@ -657,11 +667,13 @@ retry:
657 async_extent->ram_size - 1, 0); 667 async_extent->ram_size - 1, 0);
658 } 668 }
659 669
660 ret = btrfs_add_ordered_extent(inode, async_extent->start, 670 ret = btrfs_add_ordered_extent_compress(inode,
661 ins.objectid, 671 async_extent->start,
662 async_extent->ram_size, 672 ins.objectid,
663 ins.offset, 673 async_extent->ram_size,
664 BTRFS_ORDERED_COMPRESSED); 674 ins.offset,
675 BTRFS_ORDERED_COMPRESSED,
676 async_extent->compress_type);
665 BUG_ON(ret); 677 BUG_ON(ret);
666 678
667 /* 679 /*
@@ -693,6 +705,38 @@ retry:
693 return 0; 705 return 0;
694} 706}
695 707
/*
 * Pick a disk block number to use as the allocation hint for a new
 * extent covering [@start, @start + @num_bytes) in @inode.
 *
 * Prefer the on-disk location of an existing mapping overlapping the
 * range; when that mapping's block_start is a sentinel value
 * (>= EXTENT_MAP_LAST_BYTE, i.e. not a real block number) fall back to
 * the first mapping in the file.  Returns 0 when no usable hint exists.
 *
 * The extent map tree's read lock is held across both lookups so the
 * mappings cannot be torn down underneath us.
 */
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint. If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			/* drop the reference taken by the fallback lookup */
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}
739
696/* 740/*
697 * when extent_io.c finds a delayed allocation range in the file, 741 * when extent_io.c finds a delayed allocation range in the file,
698 * the call backs end up in this code. The basic idea is to 742 * the call backs end up in this code. The basic idea is to
@@ -720,18 +764,16 @@ static noinline int cow_file_range(struct inode *inode,
720 u64 disk_num_bytes; 764 u64 disk_num_bytes;
721 u64 cur_alloc_size; 765 u64 cur_alloc_size;
722 u64 blocksize = root->sectorsize; 766 u64 blocksize = root->sectorsize;
723 u64 actual_end;
724 u64 isize = i_size_read(inode);
725 struct btrfs_key ins; 767 struct btrfs_key ins;
726 struct extent_map *em; 768 struct extent_map *em;
727 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 769 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
728 int ret = 0; 770 int ret = 0;
729 771
772 BUG_ON(root == root->fs_info->tree_root);
730 trans = btrfs_join_transaction(root, 1); 773 trans = btrfs_join_transaction(root, 1);
731 BUG_ON(!trans); 774 BUG_ON(!trans);
732 btrfs_set_trans_block_group(trans, inode); 775 btrfs_set_trans_block_group(trans, inode);
733 776 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
734 actual_end = min_t(u64, isize, end + 1);
735 777
736 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 778 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
737 num_bytes = max(blocksize, num_bytes); 779 num_bytes = max(blocksize, num_bytes);
@@ -749,7 +791,6 @@ static noinline int cow_file_range(struct inode *inode,
749 EXTENT_CLEAR_UNLOCK_PAGE | 791 EXTENT_CLEAR_UNLOCK_PAGE |
750 EXTENT_CLEAR_UNLOCK | 792 EXTENT_CLEAR_UNLOCK |
751 EXTENT_CLEAR_DELALLOC | 793 EXTENT_CLEAR_DELALLOC |
752 EXTENT_CLEAR_ACCOUNTING |
753 EXTENT_CLEAR_DIRTY | 794 EXTENT_CLEAR_DIRTY |
754 EXTENT_SET_WRITEBACK | 795 EXTENT_SET_WRITEBACK |
755 EXTENT_END_WRITEBACK); 796 EXTENT_END_WRITEBACK);
@@ -765,35 +806,13 @@ static noinline int cow_file_range(struct inode *inode,
765 BUG_ON(disk_num_bytes > 806 BUG_ON(disk_num_bytes >
766 btrfs_super_total_bytes(&root->fs_info->super_copy)); 807 btrfs_super_total_bytes(&root->fs_info->super_copy));
767 808
768 809 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
769 read_lock(&BTRFS_I(inode)->extent_tree.lock);
770 em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
771 start, num_bytes);
772 if (em) {
773 /*
774 * if block start isn't an actual block number then find the
775 * first block in this inode and use that as a hint. If that
776 * block is also bogus then just don't worry about it.
777 */
778 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
779 free_extent_map(em);
780 em = search_extent_mapping(em_tree, 0, 0);
781 if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
782 alloc_hint = em->block_start;
783 if (em)
784 free_extent_map(em);
785 } else {
786 alloc_hint = em->block_start;
787 free_extent_map(em);
788 }
789 }
790 read_unlock(&BTRFS_I(inode)->extent_tree.lock);
791 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 810 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
792 811
793 while (disk_num_bytes > 0) { 812 while (disk_num_bytes > 0) {
794 unsigned long op; 813 unsigned long op;
795 814
796 cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); 815 cur_alloc_size = disk_num_bytes;
797 ret = btrfs_reserve_extent(trans, root, cur_alloc_size, 816 ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
798 root->sectorsize, 0, alloc_hint, 817 root->sectorsize, 0, alloc_hint,
799 (u64)-1, &ins, 1); 818 (u64)-1, &ins, 1);
@@ -1020,10 +1039,16 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1020 int type; 1039 int type;
1021 int nocow; 1040 int nocow;
1022 int check_prev = 1; 1041 int check_prev = 1;
1042 bool nolock = false;
1023 1043
1024 path = btrfs_alloc_path(); 1044 path = btrfs_alloc_path();
1025 BUG_ON(!path); 1045 BUG_ON(!path);
1026 trans = btrfs_join_transaction(root, 1); 1046 if (root == root->fs_info->tree_root) {
1047 nolock = true;
1048 trans = btrfs_join_transaction_nolock(root, 1);
1049 } else {
1050 trans = btrfs_join_transaction(root, 1);
1051 }
1027 BUG_ON(!trans); 1052 BUG_ON(!trans);
1028 1053
1029 cow_start = (u64)-1; 1054 cow_start = (u64)-1;
@@ -1170,6 +1195,13 @@ out_check:
1170 num_bytes, num_bytes, type); 1195 num_bytes, num_bytes, type);
1171 BUG_ON(ret); 1196 BUG_ON(ret);
1172 1197
1198 if (root->root_key.objectid ==
1199 BTRFS_DATA_RELOC_TREE_OBJECTID) {
1200 ret = btrfs_reloc_clone_csums(inode, cur_offset,
1201 num_bytes);
1202 BUG_ON(ret);
1203 }
1204
1173 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, 1205 extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
1174 cur_offset, cur_offset + num_bytes - 1, 1206 cur_offset, cur_offset + num_bytes - 1,
1175 locked_page, EXTENT_CLEAR_UNLOCK_PAGE | 1207 locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1189,8 +1221,13 @@ out_check:
1189 BUG_ON(ret); 1221 BUG_ON(ret);
1190 } 1222 }
1191 1223
1192 ret = btrfs_end_transaction(trans, root); 1224 if (nolock) {
1193 BUG_ON(ret); 1225 ret = btrfs_end_transaction_nolock(trans, root);
1226 BUG_ON(ret);
1227 } else {
1228 ret = btrfs_end_transaction(trans, root);
1229 BUG_ON(ret);
1230 }
1194 btrfs_free_path(path); 1231 btrfs_free_path(path);
1195 return 0; 1232 return 0;
1196} 1233}
@@ -1211,7 +1248,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1211 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) 1248 else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
1212 ret = run_delalloc_nocow(inode, locked_page, start, end, 1249 ret = run_delalloc_nocow(inode, locked_page, start, end,
1213 page_started, 0, nr_written); 1250 page_started, 0, nr_written);
1214 else if (!btrfs_test_opt(root, COMPRESS)) 1251 else if (!btrfs_test_opt(root, COMPRESS) &&
1252 !(BTRFS_I(inode)->force_compress))
1215 ret = cow_file_range(inode, locked_page, start, end, 1253 ret = cow_file_range(inode, locked_page, start, end,
1216 page_started, nr_written, 1); 1254 page_started, nr_written, 1);
1217 else 1255 else
@@ -1221,36 +1259,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1221} 1259}
1222 1260
1223static int btrfs_split_extent_hook(struct inode *inode, 1261static int btrfs_split_extent_hook(struct inode *inode,
1224 struct extent_state *orig, u64 split) 1262 struct extent_state *orig, u64 split)
1225{ 1263{
1226 struct btrfs_root *root = BTRFS_I(inode)->root; 1264 /* not delalloc, ignore it */
1227 u64 size;
1228
1229 if (!(orig->state & EXTENT_DELALLOC)) 1265 if (!(orig->state & EXTENT_DELALLOC))
1230 return 0; 1266 return 0;
1231 1267
1232 size = orig->end - orig->start + 1; 1268 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1233 if (size > root->fs_info->max_extent) {
1234 u64 num_extents;
1235 u64 new_size;
1236
1237 new_size = orig->end - split + 1;
1238 num_extents = div64_u64(size + root->fs_info->max_extent - 1,
1239 root->fs_info->max_extent);
1240
1241 /*
1242 * if we break a large extent up then leave oustanding_extents
1243 * be, since we've already accounted for the large extent.
1244 */
1245 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1246 root->fs_info->max_extent) < num_extents)
1247 return 0;
1248 }
1249
1250 spin_lock(&BTRFS_I(inode)->accounting_lock);
1251 BTRFS_I(inode)->outstanding_extents++;
1252 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1253
1254 return 0; 1269 return 0;
1255} 1270}
1256 1271
@@ -1264,42 +1279,11 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1264 struct extent_state *new, 1279 struct extent_state *new,
1265 struct extent_state *other) 1280 struct extent_state *other)
1266{ 1281{
1267 struct btrfs_root *root = BTRFS_I(inode)->root;
1268 u64 new_size, old_size;
1269 u64 num_extents;
1270
1271 /* not delalloc, ignore it */ 1282 /* not delalloc, ignore it */
1272 if (!(other->state & EXTENT_DELALLOC)) 1283 if (!(other->state & EXTENT_DELALLOC))
1273 return 0; 1284 return 0;
1274 1285
1275 old_size = other->end - other->start + 1; 1286 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1276 if (new->start < other->start)
1277 new_size = other->end - new->start + 1;
1278 else
1279 new_size = new->end - other->start + 1;
1280
1281 /* we're not bigger than the max, unreserve the space and go */
1282 if (new_size <= root->fs_info->max_extent) {
1283 spin_lock(&BTRFS_I(inode)->accounting_lock);
1284 BTRFS_I(inode)->outstanding_extents--;
1285 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1286 return 0;
1287 }
1288
1289 /*
1290 * If we grew by another max_extent, just return, we want to keep that
1291 * reserved amount.
1292 */
1293 num_extents = div64_u64(old_size + root->fs_info->max_extent - 1,
1294 root->fs_info->max_extent);
1295 if (div64_u64(new_size + root->fs_info->max_extent - 1,
1296 root->fs_info->max_extent) > num_extents)
1297 return 0;
1298
1299 spin_lock(&BTRFS_I(inode)->accounting_lock);
1300 BTRFS_I(inode)->outstanding_extents--;
1301 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1302
1303 return 0; 1287 return 0;
1304} 1288}
1305 1289
@@ -1308,8 +1292,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
1308 * bytes in this file, and to maintain the list of inodes that 1292 * bytes in this file, and to maintain the list of inodes that
1309 * have pending delalloc work to be done. 1293 * have pending delalloc work to be done.
1310 */ 1294 */
1311static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, 1295static int btrfs_set_bit_hook(struct inode *inode,
1312 unsigned long old, unsigned long bits) 1296 struct extent_state *state, int *bits)
1313{ 1297{
1314 1298
1315 /* 1299 /*
@@ -1317,17 +1301,21 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1317 * but in this case, we are only testeing for the DELALLOC 1301 * but in this case, we are only testeing for the DELALLOC
1318 * bit, which is only set or cleared with irqs on 1302 * bit, which is only set or cleared with irqs on
1319 */ 1303 */
1320 if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1304 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1321 struct btrfs_root *root = BTRFS_I(inode)->root; 1305 struct btrfs_root *root = BTRFS_I(inode)->root;
1306 u64 len = state->end + 1 - state->start;
1307 int do_list = (root->root_key.objectid !=
1308 BTRFS_ROOT_TREE_OBJECTID);
1309
1310 if (*bits & EXTENT_FIRST_DELALLOC)
1311 *bits &= ~EXTENT_FIRST_DELALLOC;
1312 else
1313 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
1322 1314
1323 spin_lock(&BTRFS_I(inode)->accounting_lock);
1324 BTRFS_I(inode)->outstanding_extents++;
1325 spin_unlock(&BTRFS_I(inode)->accounting_lock);
1326 btrfs_delalloc_reserve_space(root, inode, end - start + 1);
1327 spin_lock(&root->fs_info->delalloc_lock); 1315 spin_lock(&root->fs_info->delalloc_lock);
1328 BTRFS_I(inode)->delalloc_bytes += end - start + 1; 1316 BTRFS_I(inode)->delalloc_bytes += len;
1329 root->fs_info->delalloc_bytes += end - start + 1; 1317 root->fs_info->delalloc_bytes += len;
1330 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1318 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1331 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1319 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1332 &root->fs_info->delalloc_inodes); 1320 &root->fs_info->delalloc_inodes);
1333 } 1321 }
@@ -1340,45 +1328,36 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
1340 * extent_io.c clear_bit_hook, see set_bit_hook for why 1328 * extent_io.c clear_bit_hook, see set_bit_hook for why
1341 */ 1329 */
1342static int btrfs_clear_bit_hook(struct inode *inode, 1330static int btrfs_clear_bit_hook(struct inode *inode,
1343 struct extent_state *state, unsigned long bits) 1331 struct extent_state *state, int *bits)
1344{ 1332{
1345 /* 1333 /*
1346 * set_bit and clear bit hooks normally require _irqsave/restore 1334 * set_bit and clear bit hooks normally require _irqsave/restore
1347 * but in this case, we are only testeing for the DELALLOC 1335 * but in this case, we are only testeing for the DELALLOC
1348 * bit, which is only set or cleared with irqs on 1336 * bit, which is only set or cleared with irqs on
1349 */ 1337 */
1350 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { 1338 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1351 struct btrfs_root *root = BTRFS_I(inode)->root; 1339 struct btrfs_root *root = BTRFS_I(inode)->root;
1340 u64 len = state->end + 1 - state->start;
1341 int do_list = (root->root_key.objectid !=
1342 BTRFS_ROOT_TREE_OBJECTID);
1352 1343
1353 if (bits & EXTENT_DO_ACCOUNTING) { 1344 if (*bits & EXTENT_FIRST_DELALLOC)
1354 spin_lock(&BTRFS_I(inode)->accounting_lock); 1345 *bits &= ~EXTENT_FIRST_DELALLOC;
1355 BTRFS_I(inode)->outstanding_extents--; 1346 else if (!(*bits & EXTENT_DO_ACCOUNTING))
1356 spin_unlock(&BTRFS_I(inode)->accounting_lock); 1347 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
1357 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 1348
1358 } 1349 if (*bits & EXTENT_DO_ACCOUNTING)
1350 btrfs_delalloc_release_metadata(inode, len);
1351
1352 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1353 && do_list)
1354 btrfs_free_reserved_data_space(inode, len);
1359 1355
1360 spin_lock(&root->fs_info->delalloc_lock); 1356 spin_lock(&root->fs_info->delalloc_lock);
1361 if (state->end - state->start + 1 > 1357 root->fs_info->delalloc_bytes -= len;
1362 root->fs_info->delalloc_bytes) { 1358 BTRFS_I(inode)->delalloc_bytes -= len;
1363 printk(KERN_INFO "btrfs warning: delalloc account " 1359
1364 "%llu %llu\n", 1360 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1365 (unsigned long long)
1366 state->end - state->start + 1,
1367 (unsigned long long)
1368 root->fs_info->delalloc_bytes);
1369 btrfs_delalloc_free_space(root, inode, (u64)-1);
1370 root->fs_info->delalloc_bytes = 0;
1371 BTRFS_I(inode)->delalloc_bytes = 0;
1372 } else {
1373 btrfs_delalloc_free_space(root, inode,
1374 state->end -
1375 state->start + 1);
1376 root->fs_info->delalloc_bytes -= state->end -
1377 state->start + 1;
1378 BTRFS_I(inode)->delalloc_bytes -= state->end -
1379 state->start + 1;
1380 }
1381 if (BTRFS_I(inode)->delalloc_bytes == 0 &&
1382 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1361 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1383 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1362 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1384 } 1363 }
@@ -1413,7 +1392,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1413 1392
1414 if (map_length < length + size) 1393 if (map_length < length + size)
1415 return 1; 1394 return 1;
1416 return 0; 1395 return ret;
1417} 1396}
1418 1397
1419/* 1398/*
@@ -1426,7 +1405,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1426 */ 1405 */
1427static int __btrfs_submit_bio_start(struct inode *inode, int rw, 1406static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1428 struct bio *bio, int mirror_num, 1407 struct bio *bio, int mirror_num,
1429 unsigned long bio_flags) 1408 unsigned long bio_flags,
1409 u64 bio_offset)
1430{ 1410{
1431 struct btrfs_root *root = BTRFS_I(inode)->root; 1411 struct btrfs_root *root = BTRFS_I(inode)->root;
1432 int ret = 0; 1412 int ret = 0;
@@ -1445,7 +1425,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1445 * are inserted into the btree 1425 * are inserted into the btree
1446 */ 1426 */
1447static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, 1427static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1448 int mirror_num, unsigned long bio_flags) 1428 int mirror_num, unsigned long bio_flags,
1429 u64 bio_offset)
1449{ 1430{
1450 struct btrfs_root *root = BTRFS_I(inode)->root; 1431 struct btrfs_root *root = BTRFS_I(inode)->root;
1451 return btrfs_map_bio(root, rw, bio, mirror_num, 1); 1432 return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1456,7 +1437,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1456 * on write, or reading the csums from the tree before a read 1437 * on write, or reading the csums from the tree before a read
1457 */ 1438 */
1458static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, 1439static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1459 int mirror_num, unsigned long bio_flags) 1440 int mirror_num, unsigned long bio_flags,
1441 u64 bio_offset)
1460{ 1442{
1461 struct btrfs_root *root = BTRFS_I(inode)->root; 1443 struct btrfs_root *root = BTRFS_I(inode)->root;
1462 int ret = 0; 1444 int ret = 0;
@@ -1464,10 +1446,13 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1464 1446
1465 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1447 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1466 1448
1467 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1449 if (root == root->fs_info->tree_root)
1450 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
1451 else
1452 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1468 BUG_ON(ret); 1453 BUG_ON(ret);
1469 1454
1470 if (!(rw & (1 << BIO_RW))) { 1455 if (!(rw & REQ_WRITE)) {
1471 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1456 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1472 return btrfs_submit_compressed_read(inode, bio, 1457 return btrfs_submit_compressed_read(inode, bio,
1473 mirror_num, bio_flags); 1458 mirror_num, bio_flags);
@@ -1481,7 +1466,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1481 /* we're doing a write, do the async checksumming */ 1466 /* we're doing a write, do the async checksumming */
1482 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, 1467 return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1483 inode, rw, bio, mirror_num, 1468 inode, rw, bio, mirror_num,
1484 bio_flags, __btrfs_submit_bio_start, 1469 bio_flags, bio_offset,
1470 __btrfs_submit_bio_start,
1485 __btrfs_submit_bio_done); 1471 __btrfs_submit_bio_done);
1486 } 1472 }
1487 1473
@@ -1508,12 +1494,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1508 return 0; 1494 return 0;
1509} 1495}
1510 1496
1511int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) 1497int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1498 struct extent_state **cached_state)
1512{ 1499{
1513 if ((end & (PAGE_CACHE_SIZE - 1)) == 0) 1500 if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
1514 WARN_ON(1); 1501 WARN_ON(1);
1515 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, 1502 return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1516 GFP_NOFS); 1503 cached_state, GFP_NOFS);
1517} 1504}
1518 1505
1519/* see btrfs_writepage_start_hook for details on why this is required */ 1506/* see btrfs_writepage_start_hook for details on why this is required */
@@ -1526,6 +1513,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1526{ 1513{
1527 struct btrfs_writepage_fixup *fixup; 1514 struct btrfs_writepage_fixup *fixup;
1528 struct btrfs_ordered_extent *ordered; 1515 struct btrfs_ordered_extent *ordered;
1516 struct extent_state *cached_state = NULL;
1529 struct page *page; 1517 struct page *page;
1530 struct inode *inode; 1518 struct inode *inode;
1531 u64 page_start; 1519 u64 page_start;
@@ -1544,7 +1532,8 @@ again:
1544 page_start = page_offset(page); 1532 page_start = page_offset(page);
1545 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; 1533 page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1546 1534
1547 lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); 1535 lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1536 &cached_state, GFP_NOFS);
1548 1537
1549 /* already ordered? We're done */ 1538 /* already ordered? We're done */
1550 if (PagePrivate2(page)) 1539 if (PagePrivate2(page))
@@ -1552,17 +1541,19 @@ again:
1552 1541
1553 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1542 ordered = btrfs_lookup_ordered_extent(inode, page_start);
1554 if (ordered) { 1543 if (ordered) {
1555 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, 1544 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1556 page_end, GFP_NOFS); 1545 page_end, &cached_state, GFP_NOFS);
1557 unlock_page(page); 1546 unlock_page(page);
1558 btrfs_start_ordered_extent(inode, ordered, 1); 1547 btrfs_start_ordered_extent(inode, ordered, 1);
1559 goto again; 1548 goto again;
1560 } 1549 }
1561 1550
1562 btrfs_set_extent_delalloc(inode, page_start, page_end); 1551 BUG();
1552 btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1563 ClearPageChecked(page); 1553 ClearPageChecked(page);
1564out: 1554out:
1565 unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); 1555 unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1556 &cached_state, GFP_NOFS);
1566out_page: 1557out_page:
1567 unlock_page(page); 1558 unlock_page(page);
1568 page_cache_release(page); 1559 page_cache_release(page);
@@ -1681,24 +1672,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1681 * before we start the transaction. It limits the amount of btree 1672 * before we start the transaction. It limits the amount of btree
1682 * reads required while inside the transaction. 1673 * reads required while inside the transaction.
1683 */ 1674 */
1684static noinline void reada_csum(struct btrfs_root *root,
1685 struct btrfs_path *path,
1686 struct btrfs_ordered_extent *ordered_extent)
1687{
1688 struct btrfs_ordered_sum *sum;
1689 u64 bytenr;
1690
1691 sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
1692 list);
1693 bytenr = sum->sums[0].bytenr;
1694
1695 /*
1696 * we don't care about the results, the point of this search is
1697 * just to get the btree leaves into ram
1698 */
1699 btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
1700}
1701
1702/* as ordered data IO finishes, this gets called so we can finish 1675/* as ordered data IO finishes, this gets called so we can finish
1703 * an ordered extent if the range of bytes in the file it covers are 1676 * an ordered extent if the range of bytes in the file it covers are
1704 * fully written. 1677 * fully written.
@@ -1706,96 +1679,94 @@ static noinline void reada_csum(struct btrfs_root *root,
1706static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) 1679static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1707{ 1680{
1708 struct btrfs_root *root = BTRFS_I(inode)->root; 1681 struct btrfs_root *root = BTRFS_I(inode)->root;
1709 struct btrfs_trans_handle *trans; 1682 struct btrfs_trans_handle *trans = NULL;
1710 struct btrfs_ordered_extent *ordered_extent = NULL; 1683 struct btrfs_ordered_extent *ordered_extent = NULL;
1711 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1684 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1712 struct btrfs_path *path; 1685 struct extent_state *cached_state = NULL;
1713 int compressed = 0; 1686 int compress_type = 0;
1714 int ret; 1687 int ret;
1688 bool nolock = false;
1715 1689
1716 ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1); 1690 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1691 end - start + 1);
1717 if (!ret) 1692 if (!ret)
1718 return 0; 1693 return 0;
1694 BUG_ON(!ordered_extent);
1719 1695
1720 /* 1696 nolock = (root == root->fs_info->tree_root);
1721 * before we join the transaction, try to do some of our IO.
1722 * This will limit the amount of IO that we have to do with
1723 * the transaction running. We're unlikely to need to do any
1724 * IO if the file extents are new, the disk_i_size checks
1725 * covers the most common case.
1726 */
1727 if (start < BTRFS_I(inode)->disk_i_size) {
1728 path = btrfs_alloc_path();
1729 if (path) {
1730 ret = btrfs_lookup_file_extent(NULL, root, path,
1731 inode->i_ino,
1732 start, 0);
1733 ordered_extent = btrfs_lookup_ordered_extent(inode,
1734 start);
1735 if (!list_empty(&ordered_extent->list)) {
1736 btrfs_release_path(root, path);
1737 reada_csum(root, path, ordered_extent);
1738 }
1739 btrfs_free_path(path);
1740 }
1741 }
1742 1697
1743 if (!ordered_extent)
1744 ordered_extent = btrfs_lookup_ordered_extent(inode, start);
1745 BUG_ON(!ordered_extent);
1746 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1698 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1747 BUG_ON(!list_empty(&ordered_extent->list)); 1699 BUG_ON(!list_empty(&ordered_extent->list));
1748 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1700 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1749 if (!ret) { 1701 if (!ret) {
1750 trans = btrfs_join_transaction(root, 1); 1702 if (nolock)
1703 trans = btrfs_join_transaction_nolock(root, 1);
1704 else
1705 trans = btrfs_join_transaction(root, 1);
1706 BUG_ON(!trans);
1707 btrfs_set_trans_block_group(trans, inode);
1708 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1751 ret = btrfs_update_inode(trans, root, inode); 1709 ret = btrfs_update_inode(trans, root, inode);
1752 BUG_ON(ret); 1710 BUG_ON(ret);
1753 btrfs_end_transaction(trans, root);
1754 } 1711 }
1755 goto out; 1712 goto out;
1756 } 1713 }
1757 1714
1758 lock_extent(io_tree, ordered_extent->file_offset, 1715 lock_extent_bits(io_tree, ordered_extent->file_offset,
1759 ordered_extent->file_offset + ordered_extent->len - 1, 1716 ordered_extent->file_offset + ordered_extent->len - 1,
1760 GFP_NOFS); 1717 0, &cached_state, GFP_NOFS);
1761 1718
1762 trans = btrfs_join_transaction(root, 1); 1719 if (nolock)
1720 trans = btrfs_join_transaction_nolock(root, 1);
1721 else
1722 trans = btrfs_join_transaction(root, 1);
1723 btrfs_set_trans_block_group(trans, inode);
1724 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1763 1725
1764 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1726 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1765 compressed = 1; 1727 compress_type = ordered_extent->compress_type;
1766 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1728 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1767 BUG_ON(compressed); 1729 BUG_ON(compress_type);
1768 ret = btrfs_mark_extent_written(trans, inode, 1730 ret = btrfs_mark_extent_written(trans, inode,
1769 ordered_extent->file_offset, 1731 ordered_extent->file_offset,
1770 ordered_extent->file_offset + 1732 ordered_extent->file_offset +
1771 ordered_extent->len); 1733 ordered_extent->len);
1772 BUG_ON(ret); 1734 BUG_ON(ret);
1773 } else { 1735 } else {
1736 BUG_ON(root == root->fs_info->tree_root);
1774 ret = insert_reserved_file_extent(trans, inode, 1737 ret = insert_reserved_file_extent(trans, inode,
1775 ordered_extent->file_offset, 1738 ordered_extent->file_offset,
1776 ordered_extent->start, 1739 ordered_extent->start,
1777 ordered_extent->disk_len, 1740 ordered_extent->disk_len,
1778 ordered_extent->len, 1741 ordered_extent->len,
1779 ordered_extent->len, 1742 ordered_extent->len,
1780 compressed, 0, 0, 1743 compress_type, 0, 0,
1781 BTRFS_FILE_EXTENT_REG); 1744 BTRFS_FILE_EXTENT_REG);
1782 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1745 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1783 ordered_extent->file_offset, 1746 ordered_extent->file_offset,
1784 ordered_extent->len); 1747 ordered_extent->len);
1785 BUG_ON(ret); 1748 BUG_ON(ret);
1786 } 1749 }
1787 unlock_extent(io_tree, ordered_extent->file_offset, 1750 unlock_extent_cached(io_tree, ordered_extent->file_offset,
1788 ordered_extent->file_offset + ordered_extent->len - 1, 1751 ordered_extent->file_offset +
1789 GFP_NOFS); 1752 ordered_extent->len - 1, &cached_state, GFP_NOFS);
1753
1790 add_pending_csums(trans, inode, ordered_extent->file_offset, 1754 add_pending_csums(trans, inode, ordered_extent->file_offset,
1791 &ordered_extent->list); 1755 &ordered_extent->list);
1792 1756
1793 /* this also removes the ordered extent from the tree */
1794 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1757 btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1795 ret = btrfs_update_inode(trans, root, inode); 1758 ret = btrfs_update_inode(trans, root, inode);
1796 BUG_ON(ret); 1759 BUG_ON(ret);
1797 btrfs_end_transaction(trans, root);
1798out: 1760out:
1761 if (nolock) {
1762 if (trans)
1763 btrfs_end_transaction_nolock(trans, root);
1764 } else {
1765 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1766 if (trans)
1767 btrfs_end_transaction(trans, root);
1768 }
1769
1799 /* once for us */ 1770 /* once for us */
1800 btrfs_put_ordered_extent(ordered_extent); 1771 btrfs_put_ordered_extent(ordered_extent);
1801 /* once for the tree */ 1772 /* once for the tree */
@@ -1871,6 +1842,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1871 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 1842 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1872 logical = em->block_start; 1843 logical = em->block_start;
1873 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 1844 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1845 extent_set_compress_type(&failrec->bio_flags,
1846 em->compress_type);
1874 } 1847 }
1875 failrec->logical = logical; 1848 failrec->logical = logical;
1876 free_extent_map(em); 1849 free_extent_map(em);
@@ -1910,14 +1883,14 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1910 bio->bi_size = 0; 1883 bio->bi_size = 0;
1911 1884
1912 bio_add_page(bio, page, failrec->len, start - page_offset(page)); 1885 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1913 if (failed_bio->bi_rw & (1 << BIO_RW)) 1886 if (failed_bio->bi_rw & REQ_WRITE)
1914 rw = WRITE; 1887 rw = WRITE;
1915 else 1888 else
1916 rw = READ; 1889 rw = READ;
1917 1890
1918 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1891 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1919 failrec->last_mirror, 1892 failrec->last_mirror,
1920 failrec->bio_flags); 1893 failrec->bio_flags, 0);
1921 return 0; 1894 return 0;
1922} 1895}
1923 1896
@@ -2072,32 +2045,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2072} 2045}
2073 2046
2074/* 2047/*
2048 * calculate extra metadata reservation when snapshotting a subvolume
2049 * contains orphan files.
2050 */
2051void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2052 struct btrfs_pending_snapshot *pending,
2053 u64 *bytes_to_reserve)
2054{
2055 struct btrfs_root *root;
2056 struct btrfs_block_rsv *block_rsv;
2057 u64 num_bytes;
2058 int index;
2059
2060 root = pending->root;
2061 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2062 return;
2063
2064 block_rsv = root->orphan_block_rsv;
2065
2066 /* orphan block reservation for the snapshot */
2067 num_bytes = block_rsv->size;
2068
2069 /*
2070 * after the snapshot is created, COWing tree blocks may use more
2071 * space than it frees. So we should make sure there is enough
2072 * reserved space.
2073 */
2074 index = trans->transid & 0x1;
2075 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2076 num_bytes += block_rsv->size -
2077 (block_rsv->reserved + block_rsv->freed[index]);
2078 }
2079
2080 *bytes_to_reserve += num_bytes;
2081}
2082
2083void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2084 struct btrfs_pending_snapshot *pending)
2085{
2086 struct btrfs_root *root = pending->root;
2087 struct btrfs_root *snap = pending->snap;
2088 struct btrfs_block_rsv *block_rsv;
2089 u64 num_bytes;
2090 int index;
2091 int ret;
2092
2093 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2094 return;
2095
2096 /* refill source subvolume's orphan block reservation */
2097 block_rsv = root->orphan_block_rsv;
2098 index = trans->transid & 0x1;
2099 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2100 num_bytes = block_rsv->size -
2101 (block_rsv->reserved + block_rsv->freed[index]);
2102 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2103 root->orphan_block_rsv,
2104 num_bytes);
2105 BUG_ON(ret);
2106 }
2107
2108 /* setup orphan block reservation for the snapshot */
2109 block_rsv = btrfs_alloc_block_rsv(snap);
2110 BUG_ON(!block_rsv);
2111
2112 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2113 snap->orphan_block_rsv = block_rsv;
2114
2115 num_bytes = root->orphan_block_rsv->size;
2116 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2117 block_rsv, num_bytes);
2118 BUG_ON(ret);
2119
2120#if 0
2121 /* insert orphan item for the snapshot */
2122 WARN_ON(!root->orphan_item_inserted);
2123 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2124 snap->root_key.objectid);
2125 BUG_ON(ret);
2126 snap->orphan_item_inserted = 1;
2127#endif
2128}
2129
2130enum btrfs_orphan_cleanup_state {
2131 ORPHAN_CLEANUP_STARTED = 1,
2132 ORPHAN_CLEANUP_DONE = 2,
2133};
2134
2135/*
2136 * This is called in transaction commmit time. If there are no orphan
2137 * files in the subvolume, it removes orphan item and frees block_rsv
2138 * structure.
2139 */
2140void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2141 struct btrfs_root *root)
2142{
2143 int ret;
2144
2145 if (!list_empty(&root->orphan_list) ||
2146 root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2147 return;
2148
2149 if (root->orphan_item_inserted &&
2150 btrfs_root_refs(&root->root_item) > 0) {
2151 ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2152 root->root_key.objectid);
2153 BUG_ON(ret);
2154 root->orphan_item_inserted = 0;
2155 }
2156
2157 if (root->orphan_block_rsv) {
2158 WARN_ON(root->orphan_block_rsv->size > 0);
2159 btrfs_free_block_rsv(root, root->orphan_block_rsv);
2160 root->orphan_block_rsv = NULL;
2161 }
2162}
2163
2164/*
2075 * This creates an orphan entry for the given inode in case something goes 2165 * This creates an orphan entry for the given inode in case something goes
2076 * wrong in the middle of an unlink/truncate. 2166 * wrong in the middle of an unlink/truncate.
2167 *
2168 * NOTE: caller of this function should reserve 5 units of metadata for
2169 * this function.
2077 */ 2170 */
2078int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) 2171int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2079{ 2172{
2080 struct btrfs_root *root = BTRFS_I(inode)->root; 2173 struct btrfs_root *root = BTRFS_I(inode)->root;
2081 int ret = 0; 2174 struct btrfs_block_rsv *block_rsv = NULL;
2175 int reserve = 0;
2176 int insert = 0;
2177 int ret;
2082 2178
2083 spin_lock(&root->list_lock); 2179 if (!root->orphan_block_rsv) {
2180 block_rsv = btrfs_alloc_block_rsv(root);
2181 BUG_ON(!block_rsv);
2182 }
2084 2183
2085 /* already on the orphan list, we're good */ 2184 spin_lock(&root->orphan_lock);
2086 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 2185 if (!root->orphan_block_rsv) {
2087 spin_unlock(&root->list_lock); 2186 root->orphan_block_rsv = block_rsv;
2088 return 0; 2187 } else if (block_rsv) {
2188 btrfs_free_block_rsv(root, block_rsv);
2189 block_rsv = NULL;
2190 }
2191
2192 if (list_empty(&BTRFS_I(inode)->i_orphan)) {
2193 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2194#if 0
2195 /*
2196 * For proper ENOSPC handling, we should do orphan
2197 * cleanup when mounting. But this introduces backward
2198 * compatibility issue.
2199 */
2200 if (!xchg(&root->orphan_item_inserted, 1))
2201 insert = 2;
2202 else
2203 insert = 1;
2204#endif
2205 insert = 1;
2206 } else {
2207 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2089 } 2208 }
2090 2209
2091 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2210 if (!BTRFS_I(inode)->orphan_meta_reserved) {
2211 BTRFS_I(inode)->orphan_meta_reserved = 1;
2212 reserve = 1;
2213 }
2214 spin_unlock(&root->orphan_lock);
2092 2215
2093 spin_unlock(&root->list_lock); 2216 if (block_rsv)
2217 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2094 2218
2095 /* 2219 /* grab metadata reservation from transaction handle */
2096 * insert an orphan item to track this unlinked/truncated file 2220 if (reserve) {
2097 */ 2221 ret = btrfs_orphan_reserve_metadata(trans, inode);
2098 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); 2222 BUG_ON(ret);
2223 }
2099 2224
2100 return ret; 2225 /* insert an orphan item to track this unlinked/truncated file */
2226 if (insert >= 1) {
2227 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
2228 BUG_ON(ret);
2229 }
2230
2231 /* insert an orphan item to track subvolume contains orphan files */
2232 if (insert >= 2) {
2233 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2234 root->root_key.objectid);
2235 BUG_ON(ret);
2236 }
2237 return 0;
2101} 2238}
2102 2239
2103/* 2240/*
@@ -2107,26 +2244,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2107int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) 2244int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2108{ 2245{
2109 struct btrfs_root *root = BTRFS_I(inode)->root; 2246 struct btrfs_root *root = BTRFS_I(inode)->root;
2247 int delete_item = 0;
2248 int release_rsv = 0;
2110 int ret = 0; 2249 int ret = 0;
2111 2250
2112 spin_lock(&root->list_lock); 2251 spin_lock(&root->orphan_lock);
2113 2252 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
2114 if (list_empty(&BTRFS_I(inode)->i_orphan)) { 2253 list_del_init(&BTRFS_I(inode)->i_orphan);
2115 spin_unlock(&root->list_lock); 2254 delete_item = 1;
2116 return 0;
2117 } 2255 }
2118 2256
2119 list_del_init(&BTRFS_I(inode)->i_orphan); 2257 if (BTRFS_I(inode)->orphan_meta_reserved) {
2120 if (!trans) { 2258 BTRFS_I(inode)->orphan_meta_reserved = 0;
2121 spin_unlock(&root->list_lock); 2259 release_rsv = 1;
2122 return 0;
2123 } 2260 }
2261 spin_unlock(&root->orphan_lock);
2124 2262
2125 spin_unlock(&root->list_lock); 2263 if (trans && delete_item) {
2264 ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
2265 BUG_ON(ret);
2266 }
2126 2267
2127 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2268 if (release_rsv)
2269 btrfs_orphan_release_metadata(inode);
2128 2270
2129 return ret; 2271 return 0;
2130} 2272}
2131 2273
2132/* 2274/*
@@ -2137,13 +2279,12 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2137{ 2279{
2138 struct btrfs_path *path; 2280 struct btrfs_path *path;
2139 struct extent_buffer *leaf; 2281 struct extent_buffer *leaf;
2140 struct btrfs_item *item;
2141 struct btrfs_key key, found_key; 2282 struct btrfs_key key, found_key;
2142 struct btrfs_trans_handle *trans; 2283 struct btrfs_trans_handle *trans;
2143 struct inode *inode; 2284 struct inode *inode;
2144 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2285 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2145 2286
2146 if (!xchg(&root->clean_orphans, 0)) 2287 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2147 return; 2288 return;
2148 2289
2149 path = btrfs_alloc_path(); 2290 path = btrfs_alloc_path();
@@ -2175,7 +2316,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2175 2316
2176 /* pull out the item */ 2317 /* pull out the item */
2177 leaf = path->nodes[0]; 2318 leaf = path->nodes[0];
2178 item = btrfs_item_nr(leaf, path->slots[0]);
2179 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2319 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2180 2320
2181 /* make sure the item matches what we want */ 2321 /* make sure the item matches what we want */
@@ -2195,17 +2335,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2195 found_key.objectid = found_key.offset; 2335 found_key.objectid = found_key.offset;
2196 found_key.type = BTRFS_INODE_ITEM_KEY; 2336 found_key.type = BTRFS_INODE_ITEM_KEY;
2197 found_key.offset = 0; 2337 found_key.offset = 0;
2198 inode = btrfs_iget(root->fs_info->sb, &found_key, root); 2338 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2199 if (IS_ERR(inode)) 2339 BUG_ON(IS_ERR(inode));
2200 break;
2201 2340
2202 /* 2341 /*
2203 * add this inode to the orphan list so btrfs_orphan_del does 2342 * add this inode to the orphan list so btrfs_orphan_del does
2204 * the proper thing when we hit it 2343 * the proper thing when we hit it
2205 */ 2344 */
2206 spin_lock(&root->list_lock); 2345 spin_lock(&root->orphan_lock);
2207 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2346 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2208 spin_unlock(&root->list_lock); 2347 spin_unlock(&root->orphan_lock);
2209 2348
2210 /* 2349 /*
2211 * if this is a bad inode, means we actually succeeded in 2350 * if this is a bad inode, means we actually succeeded in
@@ -2214,7 +2353,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2214 * do a destroy_inode 2353 * do a destroy_inode
2215 */ 2354 */
2216 if (is_bad_inode(inode)) { 2355 if (is_bad_inode(inode)) {
2217 trans = btrfs_start_transaction(root, 1); 2356 trans = btrfs_start_transaction(root, 0);
2218 btrfs_orphan_del(trans, inode); 2357 btrfs_orphan_del(trans, inode);
2219 btrfs_end_transaction(trans, root); 2358 btrfs_end_transaction(trans, root);
2220 iput(inode); 2359 iput(inode);
@@ -2232,13 +2371,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2232 /* this will do delete_inode and everything for us */ 2371 /* this will do delete_inode and everything for us */
2233 iput(inode); 2372 iput(inode);
2234 } 2373 }
2374 btrfs_free_path(path);
2375
2376 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2377
2378 if (root->orphan_block_rsv)
2379 btrfs_block_rsv_release(root, root->orphan_block_rsv,
2380 (u64)-1);
2381
2382 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2383 trans = btrfs_join_transaction(root, 1);
2384 btrfs_end_transaction(trans, root);
2385 }
2235 2386
2236 if (nr_unlink) 2387 if (nr_unlink)
2237 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2388 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2238 if (nr_truncate) 2389 if (nr_truncate)
2239 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2390 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2240
2241 btrfs_free_path(path);
2242} 2391}
2243 2392
2244/* 2393/*
@@ -2542,7 +2691,8 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2542 2691
2543 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2692 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2544 dir, index); 2693 dir, index);
2545 BUG_ON(ret); 2694 if (ret == -ENOENT)
2695 ret = 0;
2546err: 2696err:
2547 btrfs_free_path(path); 2697 btrfs_free_path(path);
2548 if (ret) 2698 if (ret)
@@ -2557,29 +2707,201 @@ out:
2557 return ret; 2707 return ret;
2558} 2708}
2559 2709
2560static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2710/* helper to check if there is any shared block in the path */
2711static int check_path_shared(struct btrfs_root *root,
2712 struct btrfs_path *path)
2713{
2714 struct extent_buffer *eb;
2715 int level;
2716 u64 refs = 1;
2717 int uninitialized_var(ret);
2718
2719 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2720 if (!path->nodes[level])
2721 break;
2722 eb = path->nodes[level];
2723 if (!btrfs_block_can_be_shared(root, eb))
2724 continue;
2725 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2726 &refs, NULL);
2727 if (refs > 1)
2728 return 1;
2729 }
2730 return ret; /* XXX callers? */
2731}
2732
2733/*
2734 * helper to start transaction for unlink and rmdir.
2735 *
2736 * unlink and rmdir are special in btrfs, they do not always free space.
2737 * so in enospc case, we should make sure they will free space before
2738 * allowing them to use the global metadata reservation.
2739 */
2740static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2741 struct dentry *dentry)
2561{ 2742{
2562 struct btrfs_root *root;
2563 struct btrfs_trans_handle *trans; 2743 struct btrfs_trans_handle *trans;
2744 struct btrfs_root *root = BTRFS_I(dir)->root;
2745 struct btrfs_path *path;
2746 struct btrfs_inode_ref *ref;
2747 struct btrfs_dir_item *di;
2564 struct inode *inode = dentry->d_inode; 2748 struct inode *inode = dentry->d_inode;
2749 u64 index;
2750 int check_link = 1;
2751 int err = -ENOSPC;
2565 int ret; 2752 int ret;
2566 unsigned long nr = 0;
2567 2753
2568 root = BTRFS_I(dir)->root; 2754 trans = btrfs_start_transaction(root, 10);
2755 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2756 return trans;
2569 2757
2570 /* 2758 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2571 * 5 items for unlink inode 2759 return ERR_PTR(-ENOSPC);
2572 * 1 for orphan 2760
2573 */ 2761 /* check if there is someone else holds reference */
2574 ret = btrfs_reserve_metadata_space(root, 6); 2762 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2575 if (ret) 2763 return ERR_PTR(-ENOSPC);
2576 return ret; 2764
2765 if (atomic_read(&inode->i_count) > 2)
2766 return ERR_PTR(-ENOSPC);
2577 2767
2578 trans = btrfs_start_transaction(root, 1); 2768 if (xchg(&root->fs_info->enospc_unlink, 1))
2769 return ERR_PTR(-ENOSPC);
2770
2771 path = btrfs_alloc_path();
2772 if (!path) {
2773 root->fs_info->enospc_unlink = 0;
2774 return ERR_PTR(-ENOMEM);
2775 }
2776
2777 trans = btrfs_start_transaction(root, 0);
2579 if (IS_ERR(trans)) { 2778 if (IS_ERR(trans)) {
2580 btrfs_unreserve_metadata_space(root, 6); 2779 btrfs_free_path(path);
2581 return PTR_ERR(trans); 2780 root->fs_info->enospc_unlink = 0;
2781 return trans;
2782 }
2783
2784 path->skip_locking = 1;
2785 path->search_commit_root = 1;
2786
2787 ret = btrfs_lookup_inode(trans, root, path,
2788 &BTRFS_I(dir)->location, 0);
2789 if (ret < 0) {
2790 err = ret;
2791 goto out;
2792 }
2793 if (ret == 0) {
2794 if (check_path_shared(root, path))
2795 goto out;
2796 } else {
2797 check_link = 0;
2798 }
2799 btrfs_release_path(root, path);
2800
2801 ret = btrfs_lookup_inode(trans, root, path,
2802 &BTRFS_I(inode)->location, 0);
2803 if (ret < 0) {
2804 err = ret;
2805 goto out;
2806 }
2807 if (ret == 0) {
2808 if (check_path_shared(root, path))
2809 goto out;
2810 } else {
2811 check_link = 0;
2812 }
2813 btrfs_release_path(root, path);
2814
2815 if (ret == 0 && S_ISREG(inode->i_mode)) {
2816 ret = btrfs_lookup_file_extent(trans, root, path,
2817 inode->i_ino, (u64)-1, 0);
2818 if (ret < 0) {
2819 err = ret;
2820 goto out;
2821 }
2822 BUG_ON(ret == 0);
2823 if (check_path_shared(root, path))
2824 goto out;
2825 btrfs_release_path(root, path);
2826 }
2827
2828 if (!check_link) {
2829 err = 0;
2830 goto out;
2831 }
2832
2833 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2834 dentry->d_name.name, dentry->d_name.len, 0);
2835 if (IS_ERR(di)) {
2836 err = PTR_ERR(di);
2837 goto out;
2582 } 2838 }
2839 if (di) {
2840 if (check_path_shared(root, path))
2841 goto out;
2842 } else {
2843 err = 0;
2844 goto out;
2845 }
2846 btrfs_release_path(root, path);
2847
2848 ref = btrfs_lookup_inode_ref(trans, root, path,
2849 dentry->d_name.name, dentry->d_name.len,
2850 inode->i_ino, dir->i_ino, 0);
2851 if (IS_ERR(ref)) {
2852 err = PTR_ERR(ref);
2853 goto out;
2854 }
2855 BUG_ON(!ref);
2856 if (check_path_shared(root, path))
2857 goto out;
2858 index = btrfs_inode_ref_index(path->nodes[0], ref);
2859 btrfs_release_path(root, path);
2860
2861 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
2862 dentry->d_name.name, dentry->d_name.len, 0);
2863 if (IS_ERR(di)) {
2864 err = PTR_ERR(di);
2865 goto out;
2866 }
2867 BUG_ON(ret == -ENOENT);
2868 if (check_path_shared(root, path))
2869 goto out;
2870
2871 err = 0;
2872out:
2873 btrfs_free_path(path);
2874 if (err) {
2875 btrfs_end_transaction(trans, root);
2876 root->fs_info->enospc_unlink = 0;
2877 return ERR_PTR(err);
2878 }
2879
2880 trans->block_rsv = &root->fs_info->global_block_rsv;
2881 return trans;
2882}
2883
2884static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2885 struct btrfs_root *root)
2886{
2887 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2888 BUG_ON(!root->fs_info->enospc_unlink);
2889 root->fs_info->enospc_unlink = 0;
2890 }
2891 btrfs_end_transaction_throttle(trans, root);
2892}
2893
2894static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2895{
2896 struct btrfs_root *root = BTRFS_I(dir)->root;
2897 struct btrfs_trans_handle *trans;
2898 struct inode *inode = dentry->d_inode;
2899 int ret;
2900 unsigned long nr = 0;
2901
2902 trans = __unlink_start_trans(dir, dentry);
2903 if (IS_ERR(trans))
2904 return PTR_ERR(trans);
2583 2905
2584 btrfs_set_trans_block_group(trans, dir); 2906 btrfs_set_trans_block_group(trans, dir);
2585 2907
@@ -2587,14 +2909,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2587 2909
2588 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2910 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2589 dentry->d_name.name, dentry->d_name.len); 2911 dentry->d_name.name, dentry->d_name.len);
2912 BUG_ON(ret);
2590 2913
2591 if (inode->i_nlink == 0) 2914 if (inode->i_nlink == 0) {
2592 ret = btrfs_orphan_add(trans, inode); 2915 ret = btrfs_orphan_add(trans, inode);
2916 BUG_ON(ret);
2917 }
2593 2918
2594 nr = trans->blocks_used; 2919 nr = trans->blocks_used;
2595 2920 __unlink_end_trans(trans, root);
2596 btrfs_end_transaction_throttle(trans, root);
2597 btrfs_unreserve_metadata_space(root, 6);
2598 btrfs_btree_balance_dirty(root, nr); 2921 btrfs_btree_balance_dirty(root, nr);
2599 return ret; 2922 return ret;
2600} 2923}
@@ -2656,7 +2979,6 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2656 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2979 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2657 ret = btrfs_update_inode(trans, root, dir); 2980 ret = btrfs_update_inode(trans, root, dir);
2658 BUG_ON(ret); 2981 BUG_ON(ret);
2659 dir->i_sb->s_dirt = 1;
2660 2982
2661 btrfs_free_path(path); 2983 btrfs_free_path(path);
2662 return 0; 2984 return 0;
@@ -2666,7 +2988,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2666{ 2988{
2667 struct inode *inode = dentry->d_inode; 2989 struct inode *inode = dentry->d_inode;
2668 int err = 0; 2990 int err = 0;
2669 int ret;
2670 struct btrfs_root *root = BTRFS_I(dir)->root; 2991 struct btrfs_root *root = BTRFS_I(dir)->root;
2671 struct btrfs_trans_handle *trans; 2992 struct btrfs_trans_handle *trans;
2672 unsigned long nr = 0; 2993 unsigned long nr = 0;
@@ -2675,15 +2996,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2675 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2996 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2676 return -ENOTEMPTY; 2997 return -ENOTEMPTY;
2677 2998
2678 ret = btrfs_reserve_metadata_space(root, 5); 2999 trans = __unlink_start_trans(dir, dentry);
2679 if (ret) 3000 if (IS_ERR(trans))
2680 return ret;
2681
2682 trans = btrfs_start_transaction(root, 1);
2683 if (IS_ERR(trans)) {
2684 btrfs_unreserve_metadata_space(root, 5);
2685 return PTR_ERR(trans); 3001 return PTR_ERR(trans);
2686 }
2687 3002
2688 btrfs_set_trans_block_group(trans, dir); 3003 btrfs_set_trans_block_group(trans, dir);
2689 3004
@@ -2706,12 +3021,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2706 btrfs_i_size_write(inode, 0); 3021 btrfs_i_size_write(inode, 0);
2707out: 3022out:
2708 nr = trans->blocks_used; 3023 nr = trans->blocks_used;
2709 ret = btrfs_end_transaction_throttle(trans, root); 3024 __unlink_end_trans(trans, root);
2710 btrfs_unreserve_metadata_space(root, 5);
2711 btrfs_btree_balance_dirty(root, nr); 3025 btrfs_btree_balance_dirty(root, nr);
2712 3026
2713 if (ret && !err)
2714 err = ret;
2715 return err; 3027 return err;
2716} 3028}
2717 3029
@@ -2925,7 +3237,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2925 3237
2926 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3238 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
2927 3239
2928 if (root->ref_cows) 3240 if (root->ref_cows || root == root->fs_info->tree_root)
2929 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3241 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
2930 3242
2931 path = btrfs_alloc_path(); 3243 path = btrfs_alloc_path();
@@ -3073,7 +3385,8 @@ delete:
3073 } else { 3385 } else {
3074 break; 3386 break;
3075 } 3387 }
3076 if (found_extent && root->ref_cows) { 3388 if (found_extent && (root->ref_cows ||
3389 root == root->fs_info->tree_root)) {
3077 btrfs_set_path_blocking(path); 3390 btrfs_set_path_blocking(path);
3078 ret = btrfs_free_extent(trans, root, extent_start, 3391 ret = btrfs_free_extent(trans, root, extent_start,
3079 extent_num_bytes, 0, 3392 extent_num_bytes, 0,
@@ -3108,6 +3421,7 @@ out:
3108 if (pending_del_nr) { 3421 if (pending_del_nr) {
3109 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3422 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3110 pending_del_nr); 3423 pending_del_nr);
3424 BUG_ON(ret);
3111 } 3425 }
3112 btrfs_free_path(path); 3426 btrfs_free_path(path);
3113 return err; 3427 return err;
@@ -3123,6 +3437,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3123 struct btrfs_root *root = BTRFS_I(inode)->root; 3437 struct btrfs_root *root = BTRFS_I(inode)->root;
3124 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3438 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3125 struct btrfs_ordered_extent *ordered; 3439 struct btrfs_ordered_extent *ordered;
3440 struct extent_state *cached_state = NULL;
3126 char *kaddr; 3441 char *kaddr;
3127 u32 blocksize = root->sectorsize; 3442 u32 blocksize = root->sectorsize;
3128 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3443 pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -3134,11 +3449,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3134 3449
3135 if ((offset & (blocksize - 1)) == 0) 3450 if ((offset & (blocksize - 1)) == 0)
3136 goto out; 3451 goto out;
3137 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3452 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3138 if (ret)
3139 goto out;
3140
3141 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3142 if (ret) 3453 if (ret)
3143 goto out; 3454 goto out;
3144 3455
@@ -3146,8 +3457,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3146again: 3457again:
3147 page = grab_cache_page(mapping, index); 3458 page = grab_cache_page(mapping, index);
3148 if (!page) { 3459 if (!page) {
3149 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3460 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3150 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3151 goto out; 3461 goto out;
3152 } 3462 }
3153 3463
@@ -3169,12 +3479,14 @@ again:
3169 } 3479 }
3170 wait_on_page_writeback(page); 3480 wait_on_page_writeback(page);
3171 3481
3172 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 3482 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
3483 GFP_NOFS);
3173 set_page_extent_mapped(page); 3484 set_page_extent_mapped(page);
3174 3485
3175 ordered = btrfs_lookup_ordered_extent(inode, page_start); 3486 ordered = btrfs_lookup_ordered_extent(inode, page_start);
3176 if (ordered) { 3487 if (ordered) {
3177 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3488 unlock_extent_cached(io_tree, page_start, page_end,
3489 &cached_state, GFP_NOFS);
3178 unlock_page(page); 3490 unlock_page(page);
3179 page_cache_release(page); 3491 page_cache_release(page);
3180 btrfs_start_ordered_extent(inode, ordered, 1); 3492 btrfs_start_ordered_extent(inode, ordered, 1);
@@ -3182,13 +3494,15 @@ again:
3182 goto again; 3494 goto again;
3183 } 3495 }
3184 3496
3185 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 3497 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
3186 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 3498 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
3187 GFP_NOFS); 3499 0, 0, &cached_state, GFP_NOFS);
3188 3500
3189 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 3501 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
3502 &cached_state);
3190 if (ret) { 3503 if (ret) {
3191 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3504 unlock_extent_cached(io_tree, page_start, page_end,
3505 &cached_state, GFP_NOFS);
3192 goto out_unlock; 3506 goto out_unlock;
3193 } 3507 }
3194 3508
@@ -3201,12 +3515,12 @@ again:
3201 } 3515 }
3202 ClearPageChecked(page); 3516 ClearPageChecked(page);
3203 set_page_dirty(page); 3517 set_page_dirty(page);
3204 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 3518 unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
3519 GFP_NOFS);
3205 3520
3206out_unlock: 3521out_unlock:
3207 if (ret) 3522 if (ret)
3208 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3523 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3209 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3210 unlock_page(page); 3524 unlock_page(page);
3211 page_cache_release(page); 3525 page_cache_release(page);
3212out: 3526out:
@@ -3218,7 +3532,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3218 struct btrfs_trans_handle *trans; 3532 struct btrfs_trans_handle *trans;
3219 struct btrfs_root *root = BTRFS_I(inode)->root; 3533 struct btrfs_root *root = BTRFS_I(inode)->root;
3220 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3534 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3221 struct extent_map *em; 3535 struct extent_map *em = NULL;
3536 struct extent_state *cached_state = NULL;
3222 u64 mask = root->sectorsize - 1; 3537 u64 mask = root->sectorsize - 1;
3223 u64 hole_start = (inode->i_size + mask) & ~mask; 3538 u64 hole_start = (inode->i_size + mask) & ~mask;
3224 u64 block_end = (size + mask) & ~mask; 3539 u64 block_end = (size + mask) & ~mask;
@@ -3234,11 +3549,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3234 struct btrfs_ordered_extent *ordered; 3549 struct btrfs_ordered_extent *ordered;
3235 btrfs_wait_ordered_range(inode, hole_start, 3550 btrfs_wait_ordered_range(inode, hole_start,
3236 block_end - hole_start); 3551 block_end - hole_start);
3237 lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3552 lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
3553 &cached_state, GFP_NOFS);
3238 ordered = btrfs_lookup_ordered_extent(inode, hole_start); 3554 ordered = btrfs_lookup_ordered_extent(inode, hole_start);
3239 if (!ordered) 3555 if (!ordered)
3240 break; 3556 break;
3241 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3557 unlock_extent_cached(io_tree, hole_start, block_end - 1,
3558 &cached_state, GFP_NOFS);
3242 btrfs_put_ordered_extent(ordered); 3559 btrfs_put_ordered_extent(ordered);
3243 } 3560 }
3244 3561
@@ -3253,11 +3570,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3253 u64 hint_byte = 0; 3570 u64 hint_byte = 0;
3254 hole_size = last_byte - cur_offset; 3571 hole_size = last_byte - cur_offset;
3255 3572
3256 err = btrfs_reserve_metadata_space(root, 2); 3573 trans = btrfs_start_transaction(root, 2);
3257 if (err) 3574 if (IS_ERR(trans)) {
3575 err = PTR_ERR(trans);
3258 break; 3576 break;
3259 3577 }
3260 trans = btrfs_start_transaction(root, 1);
3261 btrfs_set_trans_block_group(trans, inode); 3578 btrfs_set_trans_block_group(trans, inode);
3262 3579
3263 err = btrfs_drop_extents(trans, inode, cur_offset, 3580 err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3275,15 +3592,17 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3275 last_byte - 1, 0); 3592 last_byte - 1, 0);
3276 3593
3277 btrfs_end_transaction(trans, root); 3594 btrfs_end_transaction(trans, root);
3278 btrfs_unreserve_metadata_space(root, 2);
3279 } 3595 }
3280 free_extent_map(em); 3596 free_extent_map(em);
3597 em = NULL;
3281 cur_offset = last_byte; 3598 cur_offset = last_byte;
3282 if (cur_offset >= block_end) 3599 if (cur_offset >= block_end)
3283 break; 3600 break;
3284 } 3601 }
3285 3602
3286 unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); 3603 free_extent_map(em);
3604 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3605 GFP_NOFS);
3287 return err; 3606 return err;
3288} 3607}
3289 3608
@@ -3308,11 +3627,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3308 } 3627 }
3309 } 3628 }
3310 3629
3311 ret = btrfs_reserve_metadata_space(root, 1); 3630 trans = btrfs_start_transaction(root, 5);
3312 if (ret) 3631 if (IS_ERR(trans))
3313 return ret; 3632 return PTR_ERR(trans);
3314 3633
3315 trans = btrfs_start_transaction(root, 1);
3316 btrfs_set_trans_block_group(trans, inode); 3634 btrfs_set_trans_block_group(trans, inode);
3317 3635
3318 ret = btrfs_orphan_add(trans, inode); 3636 ret = btrfs_orphan_add(trans, inode);
@@ -3320,7 +3638,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3320 3638
3321 nr = trans->blocks_used; 3639 nr = trans->blocks_used;
3322 btrfs_end_transaction(trans, root); 3640 btrfs_end_transaction(trans, root);
3323 btrfs_unreserve_metadata_space(root, 1);
3324 btrfs_btree_balance_dirty(root, nr); 3641 btrfs_btree_balance_dirty(root, nr);
3325 3642
3326 if (attr->ia_size > inode->i_size) { 3643 if (attr->ia_size > inode->i_size) {
@@ -3333,8 +3650,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3333 i_size_write(inode, attr->ia_size); 3650 i_size_write(inode, attr->ia_size);
3334 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3651 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3335 3652
3336 trans = btrfs_start_transaction(root, 1); 3653 trans = btrfs_start_transaction(root, 0);
3654 BUG_ON(IS_ERR(trans));
3337 btrfs_set_trans_block_group(trans, inode); 3655 btrfs_set_trans_block_group(trans, inode);
3656 trans->block_rsv = root->orphan_block_rsv;
3657 BUG_ON(!trans->block_rsv);
3338 3658
3339 ret = btrfs_update_inode(trans, root, inode); 3659 ret = btrfs_update_inode(trans, root, inode);
3340 BUG_ON(ret); 3660 BUG_ON(ret);
@@ -3366,8 +3686,12 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3366static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3686static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3367{ 3687{
3368 struct inode *inode = dentry->d_inode; 3688 struct inode *inode = dentry->d_inode;
3689 struct btrfs_root *root = BTRFS_I(inode)->root;
3369 int err; 3690 int err;
3370 3691
3692 if (btrfs_root_readonly(root))
3693 return -EROFS;
3694
3371 err = inode_change_ok(inode, attr); 3695 err = inode_change_ok(inode, attr);
3372 if (err) 3696 if (err)
3373 return err; 3697 return err;
@@ -3377,17 +3701,19 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3377 if (err) 3701 if (err)
3378 return err; 3702 return err;
3379 } 3703 }
3380 attr->ia_valid &= ~ATTR_SIZE;
3381 3704
3382 if (attr->ia_valid) 3705 if (attr->ia_valid) {
3383 err = inode_setattr(inode, attr); 3706 setattr_copy(inode, attr);
3707 mark_inode_dirty(inode);
3708
3709 if (attr->ia_valid & ATTR_MODE)
3710 err = btrfs_acl_chmod(inode);
3711 }
3384 3712
3385 if (!err && ((attr->ia_valid & ATTR_MODE)))
3386 err = btrfs_acl_chmod(inode);
3387 return err; 3713 return err;
3388} 3714}
3389 3715
3390void btrfs_delete_inode(struct inode *inode) 3716void btrfs_evict_inode(struct inode *inode)
3391{ 3717{
3392 struct btrfs_trans_handle *trans; 3718 struct btrfs_trans_handle *trans;
3393 struct btrfs_root *root = BTRFS_I(inode)->root; 3719 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -3395,10 +3721,15 @@ void btrfs_delete_inode(struct inode *inode)
3395 int ret; 3721 int ret;
3396 3722
3397 truncate_inode_pages(&inode->i_data, 0); 3723 truncate_inode_pages(&inode->i_data, 0);
3724 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3725 root == root->fs_info->tree_root))
3726 goto no_delete;
3727
3398 if (is_bad_inode(inode)) { 3728 if (is_bad_inode(inode)) {
3399 btrfs_orphan_del(NULL, inode); 3729 btrfs_orphan_del(NULL, inode);
3400 goto no_delete; 3730 goto no_delete;
3401 } 3731 }
3732 /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
3402 btrfs_wait_ordered_range(inode, 0, (u64)-1); 3733 btrfs_wait_ordered_range(inode, 0, (u64)-1);
3403 3734
3404 if (root->fs_info->log_root_recovering) { 3735 if (root->fs_info->log_root_recovering) {
@@ -3414,10 +3745,21 @@ void btrfs_delete_inode(struct inode *inode)
3414 btrfs_i_size_write(inode, 0); 3745 btrfs_i_size_write(inode, 0);
3415 3746
3416 while (1) { 3747 while (1) {
3417 trans = btrfs_start_transaction(root, 1); 3748 trans = btrfs_start_transaction(root, 0);
3749 BUG_ON(IS_ERR(trans));
3418 btrfs_set_trans_block_group(trans, inode); 3750 btrfs_set_trans_block_group(trans, inode);
3419 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3751 trans->block_rsv = root->orphan_block_rsv;
3752
3753 ret = btrfs_block_rsv_check(trans, root,
3754 root->orphan_block_rsv, 0, 5);
3755 if (ret) {
3756 BUG_ON(ret != -EAGAIN);
3757 ret = btrfs_commit_transaction(trans, root);
3758 BUG_ON(ret);
3759 continue;
3760 }
3420 3761
3762 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3421 if (ret != -EAGAIN) 3763 if (ret != -EAGAIN)
3422 break; 3764 break;
3423 3765
@@ -3425,6 +3767,7 @@ void btrfs_delete_inode(struct inode *inode)
3425 btrfs_end_transaction(trans, root); 3767 btrfs_end_transaction(trans, root);
3426 trans = NULL; 3768 trans = NULL;
3427 btrfs_btree_balance_dirty(root, nr); 3769 btrfs_btree_balance_dirty(root, nr);
3770
3428 } 3771 }
3429 3772
3430 if (ret == 0) { 3773 if (ret == 0) {
@@ -3436,7 +3779,7 @@ void btrfs_delete_inode(struct inode *inode)
3436 btrfs_end_transaction(trans, root); 3779 btrfs_end_transaction(trans, root);
3437 btrfs_btree_balance_dirty(root, nr); 3780 btrfs_btree_balance_dirty(root, nr);
3438no_delete: 3781no_delete:
3439 clear_inode(inode); 3782 end_writeback(inode);
3440 return; 3783 return;
3441} 3784}
3442 3785
@@ -3553,7 +3896,7 @@ again:
3553 p = &root->inode_tree.rb_node; 3896 p = &root->inode_tree.rb_node;
3554 parent = NULL; 3897 parent = NULL;
3555 3898
3556 if (hlist_unhashed(&inode->i_hash)) 3899 if (inode_unhashed(inode))
3557 return; 3900 return;
3558 3901
3559 spin_lock(&root->inode_lock); 3902 spin_lock(&root->inode_lock);
@@ -3567,7 +3910,7 @@ again:
3567 p = &parent->rb_right; 3910 p = &parent->rb_right;
3568 else { 3911 else {
3569 WARN_ON(!(entry->vfs_inode.i_state & 3912 WARN_ON(!(entry->vfs_inode.i_state &
3570 (I_WILL_FREE | I_FREEING | I_CLEAR))); 3913 (I_WILL_FREE | I_FREEING)));
3571 rb_erase(parent, &root->inode_tree); 3914 rb_erase(parent, &root->inode_tree);
3572 RB_CLEAR_NODE(parent); 3915 RB_CLEAR_NODE(parent);
3573 spin_unlock(&root->inode_lock); 3916 spin_unlock(&root->inode_lock);
@@ -3592,7 +3935,14 @@ static void inode_tree_del(struct inode *inode)
3592 } 3935 }
3593 spin_unlock(&root->inode_lock); 3936 spin_unlock(&root->inode_lock);
3594 3937
3595 if (empty && btrfs_root_refs(&root->root_item) == 0) { 3938 /*
3939 * Free space cache has inodes in the tree root, but the tree root has a
3940 * root_refs of 0, so this could end up dropping the tree root as a
3941 * snapshot, so we need the extra !root->fs_info->tree_root check to
3942 * make sure we don't drop it.
3943 */
3944 if (empty && btrfs_root_refs(&root->root_item) == 0 &&
3945 root != root->fs_info->tree_root) {
3596 synchronize_srcu(&root->fs_info->subvol_srcu); 3946 synchronize_srcu(&root->fs_info->subvol_srcu);
3597 spin_lock(&root->inode_lock); 3947 spin_lock(&root->inode_lock);
3598 empty = RB_EMPTY_ROOT(&root->inode_tree); 3948 empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -3646,7 +3996,7 @@ again:
3646 if (atomic_read(&inode->i_count) > 1) 3996 if (atomic_read(&inode->i_count) > 1)
3647 d_prune_aliases(inode); 3997 d_prune_aliases(inode);
3648 /* 3998 /*
3649 * btrfs_drop_inode will remove it from 3999 * btrfs_drop_inode will have it removed from
3650 * the inode cache when its usage count 4000 * the inode cache when its usage count
3651 * hits zero. 4001 * hits zero.
3652 */ 4002 */
@@ -3665,39 +4015,10 @@ again:
3665 return 0; 4015 return 0;
3666} 4016}
3667 4017
3668static noinline void init_btrfs_i(struct inode *inode)
3669{
3670 struct btrfs_inode *bi = BTRFS_I(inode);
3671
3672 bi->generation = 0;
3673 bi->sequence = 0;
3674 bi->last_trans = 0;
3675 bi->last_sub_trans = 0;
3676 bi->logged_trans = 0;
3677 bi->delalloc_bytes = 0;
3678 bi->reserved_bytes = 0;
3679 bi->disk_i_size = 0;
3680 bi->flags = 0;
3681 bi->index_cnt = (u64)-1;
3682 bi->last_unlink_trans = 0;
3683 bi->ordered_data_close = 0;
3684 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3685 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3686 inode->i_mapping, GFP_NOFS);
3687 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3688 inode->i_mapping, GFP_NOFS);
3689 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3690 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3691 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3692 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3693 mutex_init(&BTRFS_I(inode)->log_mutex);
3694}
3695
3696static int btrfs_init_locked_inode(struct inode *inode, void *p) 4018static int btrfs_init_locked_inode(struct inode *inode, void *p)
3697{ 4019{
3698 struct btrfs_iget_args *args = p; 4020 struct btrfs_iget_args *args = p;
3699 inode->i_ino = args->ino; 4021 inode->i_ino = args->ino;
3700 init_btrfs_i(inode);
3701 BTRFS_I(inode)->root = args->root; 4022 BTRFS_I(inode)->root = args->root;
3702 btrfs_set_inode_space_info(args->root, inode); 4023 btrfs_set_inode_space_info(args->root, inode);
3703 return 0; 4024 return 0;
@@ -3729,7 +4050,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
3729 * Returns in *is_new if the inode was read from disk 4050 * Returns in *is_new if the inode was read from disk
3730 */ 4051 */
3731struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 4052struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3732 struct btrfs_root *root) 4053 struct btrfs_root *root, int *new)
3733{ 4054{
3734 struct inode *inode; 4055 struct inode *inode;
3735 4056
@@ -3744,6 +4065,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
3744 4065
3745 inode_tree_add(inode); 4066 inode_tree_add(inode);
3746 unlock_new_inode(inode); 4067 unlock_new_inode(inode);
4068 if (new)
4069 *new = 1;
3747 } 4070 }
3748 4071
3749 return inode; 4072 return inode;
@@ -3758,8 +4081,6 @@ static struct inode *new_simple_dir(struct super_block *s,
3758 if (!inode) 4081 if (!inode)
3759 return ERR_PTR(-ENOMEM); 4082 return ERR_PTR(-ENOMEM);
3760 4083
3761 init_btrfs_i(inode);
3762
3763 BTRFS_I(inode)->root = root; 4084 BTRFS_I(inode)->root = root;
3764 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4085 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3765 BTRFS_I(inode)->dummy_inode = 1; 4086 BTRFS_I(inode)->dummy_inode = 1;
@@ -3782,8 +4103,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3782 int index; 4103 int index;
3783 int ret; 4104 int ret;
3784 4105
3785 dentry->d_op = &btrfs_dentry_operations;
3786
3787 if (dentry->d_name.len > BTRFS_NAME_LEN) 4106 if (dentry->d_name.len > BTRFS_NAME_LEN)
3788 return ERR_PTR(-ENAMETOOLONG); 4107 return ERR_PTR(-ENAMETOOLONG);
3789 4108
@@ -3796,7 +4115,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3796 return NULL; 4115 return NULL;
3797 4116
3798 if (location.type == BTRFS_INODE_ITEM_KEY) { 4117 if (location.type == BTRFS_INODE_ITEM_KEY) {
3799 inode = btrfs_iget(dir->i_sb, &location, root); 4118 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
3800 return inode; 4119 return inode;
3801 } 4120 }
3802 4121
@@ -3811,7 +4130,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3811 else 4130 else
3812 inode = new_simple_dir(dir->i_sb, &location, sub_root); 4131 inode = new_simple_dir(dir->i_sb, &location, sub_root);
3813 } else { 4132 } else {
3814 inode = btrfs_iget(dir->i_sb, &location, sub_root); 4133 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
3815 } 4134 }
3816 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4135 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
3817 4136
@@ -3825,7 +4144,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3825 return inode; 4144 return inode;
3826} 4145}
3827 4146
3828static int btrfs_dentry_delete(struct dentry *dentry) 4147static int btrfs_dentry_delete(const struct dentry *dentry)
3829{ 4148{
3830 struct btrfs_root *root; 4149 struct btrfs_root *root;
3831 4150
@@ -4010,19 +4329,29 @@ err:
4010 return ret; 4329 return ret;
4011} 4330}
4012 4331
4013int btrfs_write_inode(struct inode *inode, int wait) 4332int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4014{ 4333{
4015 struct btrfs_root *root = BTRFS_I(inode)->root; 4334 struct btrfs_root *root = BTRFS_I(inode)->root;
4016 struct btrfs_trans_handle *trans; 4335 struct btrfs_trans_handle *trans;
4017 int ret = 0; 4336 int ret = 0;
4337 bool nolock = false;
4018 4338
4019 if (root->fs_info->btree_inode == inode) 4339 if (BTRFS_I(inode)->dummy_inode)
4020 return 0; 4340 return 0;
4021 4341
4022 if (wait) { 4342 smp_mb();
4023 trans = btrfs_join_transaction(root, 1); 4343 nolock = (root->fs_info->closing && root == root->fs_info->tree_root);
4344
4345 if (wbc->sync_mode == WB_SYNC_ALL) {
4346 if (nolock)
4347 trans = btrfs_join_transaction_nolock(root, 1);
4348 else
4349 trans = btrfs_join_transaction(root, 1);
4024 btrfs_set_trans_block_group(trans, inode); 4350 btrfs_set_trans_block_group(trans, inode);
4025 ret = btrfs_commit_transaction(trans, root); 4351 if (nolock)
4352 ret = btrfs_end_transaction_nolock(trans, root);
4353 else
4354 ret = btrfs_commit_transaction(trans, root);
4026 } 4355 }
4027 return ret; 4356 return ret;
4028} 4357}
@@ -4037,10 +4366,38 @@ void btrfs_dirty_inode(struct inode *inode)
4037{ 4366{
4038 struct btrfs_root *root = BTRFS_I(inode)->root; 4367 struct btrfs_root *root = BTRFS_I(inode)->root;
4039 struct btrfs_trans_handle *trans; 4368 struct btrfs_trans_handle *trans;
4369 int ret;
4370
4371 if (BTRFS_I(inode)->dummy_inode)
4372 return;
4040 4373
4041 trans = btrfs_join_transaction(root, 1); 4374 trans = btrfs_join_transaction(root, 1);
4042 btrfs_set_trans_block_group(trans, inode); 4375 btrfs_set_trans_block_group(trans, inode);
4043 btrfs_update_inode(trans, root, inode); 4376
4377 ret = btrfs_update_inode(trans, root, inode);
4378 if (ret && ret == -ENOSPC) {
4379 /* whoops, lets try again with the full transaction */
4380 btrfs_end_transaction(trans, root);
4381 trans = btrfs_start_transaction(root, 1);
4382 if (IS_ERR(trans)) {
4383 if (printk_ratelimit()) {
4384 printk(KERN_ERR "btrfs: fail to "
4385 "dirty inode %lu error %ld\n",
4386 inode->i_ino, PTR_ERR(trans));
4387 }
4388 return;
4389 }
4390 btrfs_set_trans_block_group(trans, inode);
4391
4392 ret = btrfs_update_inode(trans, root, inode);
4393 if (ret) {
4394 if (printk_ratelimit()) {
4395 printk(KERN_ERR "btrfs: fail to "
4396 "dirty inode %lu error %d\n",
4397 inode->i_ino, ret);
4398 }
4399 }
4400 }
4044 btrfs_end_transaction(trans, root); 4401 btrfs_end_transaction(trans, root);
4045} 4402}
4046 4403
@@ -4158,10 +4515,10 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4158 * btrfs_get_inode_index_count has an explanation for the magic 4515 * btrfs_get_inode_index_count has an explanation for the magic
4159 * number 4516 * number
4160 */ 4517 */
4161 init_btrfs_i(inode);
4162 BTRFS_I(inode)->index_cnt = 2; 4518 BTRFS_I(inode)->index_cnt = 2;
4163 BTRFS_I(inode)->root = root; 4519 BTRFS_I(inode)->root = root;
4164 BTRFS_I(inode)->generation = trans->transid; 4520 BTRFS_I(inode)->generation = trans->transid;
4521 inode->i_generation = BTRFS_I(inode)->generation;
4165 btrfs_set_inode_space_info(root, inode); 4522 btrfs_set_inode_space_info(root, inode);
4166 4523
4167 if (mode & S_IFDIR) 4524 if (mode & S_IFDIR)
@@ -4187,16 +4544,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4187 if (ret != 0) 4544 if (ret != 0)
4188 goto fail; 4545 goto fail;
4189 4546
4190 inode->i_uid = current_fsuid(); 4547 inode_init_owner(inode, dir, mode);
4191
4192 if (dir && (dir->i_mode & S_ISGID)) {
4193 inode->i_gid = dir->i_gid;
4194 if (S_ISDIR(mode))
4195 mode |= S_ISGID;
4196 } else
4197 inode->i_gid = current_fsgid();
4198
4199 inode->i_mode = mode;
4200 inode->i_ino = objectid; 4548 inode->i_ino = objectid;
4201 inode_set_bytes(inode, 0); 4549 inode_set_bytes(inode, 0);
4202 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4550 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -4292,12 +4640,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4292} 4640}
4293 4641
4294static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4642static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4295 struct dentry *dentry, struct inode *inode, 4643 struct inode *dir, struct dentry *dentry,
4296 int backref, u64 index) 4644 struct inode *inode, int backref, u64 index)
4297{ 4645{
4298 int err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4646 int err = btrfs_add_link(trans, dir, inode,
4299 inode, dentry->d_name.name, 4647 dentry->d_name.name, dentry->d_name.len,
4300 dentry->d_name.len, backref, index); 4648 backref, index);
4301 if (!err) { 4649 if (!err) {
4302 d_instantiate(dentry, inode); 4650 d_instantiate(dentry, inode);
4303 return 0; 4651 return 0;
@@ -4322,29 +4670,23 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4322 if (!new_valid_dev(rdev)) 4670 if (!new_valid_dev(rdev))
4323 return -EINVAL; 4671 return -EINVAL;
4324 4672
4673 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4674 if (err)
4675 return err;
4676
4325 /* 4677 /*
4326 * 2 for inode item and ref 4678 * 2 for inode item and ref
4327 * 2 for dir items 4679 * 2 for dir items
4328 * 1 for xattr if selinux is on 4680 * 1 for xattr if selinux is on
4329 */ 4681 */
4330 err = btrfs_reserve_metadata_space(root, 5); 4682 trans = btrfs_start_transaction(root, 5);
4331 if (err) 4683 if (IS_ERR(trans))
4332 return err; 4684 return PTR_ERR(trans);
4333 4685
4334 trans = btrfs_start_transaction(root, 1);
4335 if (!trans)
4336 goto fail;
4337 btrfs_set_trans_block_group(trans, dir); 4686 btrfs_set_trans_block_group(trans, dir);
4338 4687
4339 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4340 if (err) {
4341 err = -ENOSPC;
4342 goto out_unlock;
4343 }
4344
4345 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4688 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4346 dentry->d_name.len, 4689 dentry->d_name.len, dir->i_ino, objectid,
4347 dentry->d_parent->d_inode->i_ino, objectid,
4348 BTRFS_I(dir)->block_group, mode, &index); 4690 BTRFS_I(dir)->block_group, mode, &index);
4349 err = PTR_ERR(inode); 4691 err = PTR_ERR(inode);
4350 if (IS_ERR(inode)) 4692 if (IS_ERR(inode))
@@ -4357,7 +4699,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4357 } 4699 }
4358 4700
4359 btrfs_set_trans_block_group(trans, inode); 4701 btrfs_set_trans_block_group(trans, inode);
4360 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4702 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4361 if (err) 4703 if (err)
4362 drop_inode = 1; 4704 drop_inode = 1;
4363 else { 4705 else {
@@ -4370,13 +4712,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4370out_unlock: 4712out_unlock:
4371 nr = trans->blocks_used; 4713 nr = trans->blocks_used;
4372 btrfs_end_transaction_throttle(trans, root); 4714 btrfs_end_transaction_throttle(trans, root);
4373fail: 4715 btrfs_btree_balance_dirty(root, nr);
4374 btrfs_unreserve_metadata_space(root, 5);
4375 if (drop_inode) { 4716 if (drop_inode) {
4376 inode_dec_link_count(inode); 4717 inode_dec_link_count(inode);
4377 iput(inode); 4718 iput(inode);
4378 } 4719 }
4379 btrfs_btree_balance_dirty(root, nr);
4380 return err; 4720 return err;
4381} 4721}
4382 4722
@@ -4386,37 +4726,29 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4386 struct btrfs_trans_handle *trans; 4726 struct btrfs_trans_handle *trans;
4387 struct btrfs_root *root = BTRFS_I(dir)->root; 4727 struct btrfs_root *root = BTRFS_I(dir)->root;
4388 struct inode *inode = NULL; 4728 struct inode *inode = NULL;
4389 int err;
4390 int drop_inode = 0; 4729 int drop_inode = 0;
4730 int err;
4391 unsigned long nr = 0; 4731 unsigned long nr = 0;
4392 u64 objectid; 4732 u64 objectid;
4393 u64 index = 0; 4733 u64 index = 0;
4394 4734
4735 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4736 if (err)
4737 return err;
4395 /* 4738 /*
4396 * 2 for inode item and ref 4739 * 2 for inode item and ref
4397 * 2 for dir items 4740 * 2 for dir items
4398 * 1 for xattr if selinux is on 4741 * 1 for xattr if selinux is on
4399 */ 4742 */
4400 err = btrfs_reserve_metadata_space(root, 5); 4743 trans = btrfs_start_transaction(root, 5);
4401 if (err) 4744 if (IS_ERR(trans))
4402 return err; 4745 return PTR_ERR(trans);
4403 4746
4404 trans = btrfs_start_transaction(root, 1);
4405 if (!trans)
4406 goto fail;
4407 btrfs_set_trans_block_group(trans, dir); 4747 btrfs_set_trans_block_group(trans, dir);
4408 4748
4409 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4410 if (err) {
4411 err = -ENOSPC;
4412 goto out_unlock;
4413 }
4414
4415 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4749 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4416 dentry->d_name.len, 4750 dentry->d_name.len, dir->i_ino, objectid,
4417 dentry->d_parent->d_inode->i_ino, 4751 BTRFS_I(dir)->block_group, mode, &index);
4418 objectid, BTRFS_I(dir)->block_group, mode,
4419 &index);
4420 err = PTR_ERR(inode); 4752 err = PTR_ERR(inode);
4421 if (IS_ERR(inode)) 4753 if (IS_ERR(inode))
4422 goto out_unlock; 4754 goto out_unlock;
@@ -4428,7 +4760,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4428 } 4760 }
4429 4761
4430 btrfs_set_trans_block_group(trans, inode); 4762 btrfs_set_trans_block_group(trans, inode);
4431 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 4763 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4432 if (err) 4764 if (err)
4433 drop_inode = 1; 4765 drop_inode = 1;
4434 else { 4766 else {
@@ -4443,8 +4775,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4443out_unlock: 4775out_unlock:
4444 nr = trans->blocks_used; 4776 nr = trans->blocks_used;
4445 btrfs_end_transaction_throttle(trans, root); 4777 btrfs_end_transaction_throttle(trans, root);
4446fail:
4447 btrfs_unreserve_metadata_space(root, 5);
4448 if (drop_inode) { 4778 if (drop_inode) {
4449 inode_dec_link_count(inode); 4779 inode_dec_link_count(inode);
4450 iput(inode); 4780 iput(inode);
@@ -4471,40 +4801,42 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4471 if (root->objectid != BTRFS_I(inode)->root->objectid) 4801 if (root->objectid != BTRFS_I(inode)->root->objectid)
4472 return -EPERM; 4802 return -EPERM;
4473 4803
4474 /*
4475 * 1 item for inode ref
4476 * 2 items for dir items
4477 */
4478 err = btrfs_reserve_metadata_space(root, 3);
4479 if (err)
4480 return err;
4481
4482 btrfs_inc_nlink(inode); 4804 btrfs_inc_nlink(inode);
4805 inode->i_ctime = CURRENT_TIME;
4483 4806
4484 err = btrfs_set_inode_index(dir, &index); 4807 err = btrfs_set_inode_index(dir, &index);
4485 if (err) 4808 if (err)
4486 goto fail; 4809 goto fail;
4487 4810
4488 trans = btrfs_start_transaction(root, 1); 4811 /*
4812 * 1 item for inode ref
4813 * 2 items for dir items
4814 */
4815 trans = btrfs_start_transaction(root, 3);
4816 if (IS_ERR(trans)) {
4817 err = PTR_ERR(trans);
4818 goto fail;
4819 }
4489 4820
4490 btrfs_set_trans_block_group(trans, dir); 4821 btrfs_set_trans_block_group(trans, dir);
4491 atomic_inc(&inode->i_count); 4822 ihold(inode);
4492 4823
4493 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4824 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
4494 4825
4495 if (err) { 4826 if (err) {
4496 drop_inode = 1; 4827 drop_inode = 1;
4497 } else { 4828 } else {
4829 struct dentry *parent = dget_parent(dentry);
4498 btrfs_update_inode_block_group(trans, dir); 4830 btrfs_update_inode_block_group(trans, dir);
4499 err = btrfs_update_inode(trans, root, inode); 4831 err = btrfs_update_inode(trans, root, inode);
4500 BUG_ON(err); 4832 BUG_ON(err);
4501 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); 4833 btrfs_log_new_name(trans, inode, NULL, parent);
4834 dput(parent);
4502 } 4835 }
4503 4836
4504 nr = trans->blocks_used; 4837 nr = trans->blocks_used;
4505 btrfs_end_transaction_throttle(trans, root); 4838 btrfs_end_transaction_throttle(trans, root);
4506fail: 4839fail:
4507 btrfs_unreserve_metadata_space(root, 3);
4508 if (drop_inode) { 4840 if (drop_inode) {
4509 inode_dec_link_count(inode); 4841 inode_dec_link_count(inode);
4510 iput(inode); 4842 iput(inode);
@@ -4524,31 +4856,22 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4524 u64 index = 0; 4856 u64 index = 0;
4525 unsigned long nr = 1; 4857 unsigned long nr = 1;
4526 4858
4859 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4860 if (err)
4861 return err;
4862
4527 /* 4863 /*
4528 * 2 items for inode and ref 4864 * 2 items for inode and ref
4529 * 2 items for dir items 4865 * 2 items for dir items
4530 * 1 for xattr if selinux is on 4866 * 1 for xattr if selinux is on
4531 */ 4867 */
4532 err = btrfs_reserve_metadata_space(root, 5); 4868 trans = btrfs_start_transaction(root, 5);
4533 if (err) 4869 if (IS_ERR(trans))
4534 return err; 4870 return PTR_ERR(trans);
4535
4536 trans = btrfs_start_transaction(root, 1);
4537 if (!trans) {
4538 err = -ENOMEM;
4539 goto out_unlock;
4540 }
4541 btrfs_set_trans_block_group(trans, dir); 4871 btrfs_set_trans_block_group(trans, dir);
4542 4872
4543 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4544 if (err) {
4545 err = -ENOSPC;
4546 goto out_unlock;
4547 }
4548
4549 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4873 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4550 dentry->d_name.len, 4874 dentry->d_name.len, dir->i_ino, objectid,
4551 dentry->d_parent->d_inode->i_ino, objectid,
4552 BTRFS_I(dir)->block_group, S_IFDIR | mode, 4875 BTRFS_I(dir)->block_group, S_IFDIR | mode,
4553 &index); 4876 &index);
4554 if (IS_ERR(inode)) { 4877 if (IS_ERR(inode)) {
@@ -4571,9 +4894,8 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4571 if (err) 4894 if (err)
4572 goto out_fail; 4895 goto out_fail;
4573 4896
4574 err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4897 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
4575 inode, dentry->d_name.name, 4898 dentry->d_name.len, 0, index);
4576 dentry->d_name.len, 0, index);
4577 if (err) 4899 if (err)
4578 goto out_fail; 4900 goto out_fail;
4579 4901
@@ -4585,9 +4907,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4585out_fail: 4907out_fail:
4586 nr = trans->blocks_used; 4908 nr = trans->blocks_used;
4587 btrfs_end_transaction_throttle(trans, root); 4909 btrfs_end_transaction_throttle(trans, root);
4588
4589out_unlock:
4590 btrfs_unreserve_metadata_space(root, 5);
4591 if (drop_on_err) 4910 if (drop_on_err)
4592 iput(inode); 4911 iput(inode);
4593 btrfs_btree_balance_dirty(root, nr); 4912 btrfs_btree_balance_dirty(root, nr);
@@ -4628,8 +4947,10 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4628 size_t max_size; 4947 size_t max_size;
4629 unsigned long inline_size; 4948 unsigned long inline_size;
4630 unsigned long ptr; 4949 unsigned long ptr;
4950 int compress_type;
4631 4951
4632 WARN_ON(pg_offset != 0); 4952 WARN_ON(pg_offset != 0);
4953 compress_type = btrfs_file_extent_compression(leaf, item);
4633 max_size = btrfs_file_extent_ram_bytes(leaf, item); 4954 max_size = btrfs_file_extent_ram_bytes(leaf, item);
4634 inline_size = btrfs_file_extent_inline_item_len(leaf, 4955 inline_size = btrfs_file_extent_inline_item_len(leaf,
4635 btrfs_item_nr(leaf, path->slots[0])); 4956 btrfs_item_nr(leaf, path->slots[0]));
@@ -4639,8 +4960,8 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4639 read_extent_buffer(leaf, tmp, ptr, inline_size); 4960 read_extent_buffer(leaf, tmp, ptr, inline_size);
4640 4961
4641 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 4962 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
4642 ret = btrfs_zlib_decompress(tmp, page, extent_offset, 4963 ret = btrfs_decompress(compress_type, tmp, page,
4643 inline_size, max_size); 4964 extent_offset, inline_size, max_size);
4644 if (ret) { 4965 if (ret) {
4645 char *kaddr = kmap_atomic(page, KM_USER0); 4966 char *kaddr = kmap_atomic(page, KM_USER0);
4646 unsigned long copy_size = min_t(u64, 4967 unsigned long copy_size = min_t(u64,
@@ -4682,7 +5003,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4682 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5003 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4683 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5004 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4684 struct btrfs_trans_handle *trans = NULL; 5005 struct btrfs_trans_handle *trans = NULL;
4685 int compressed; 5006 int compress_type;
4686 5007
4687again: 5008again:
4688 read_lock(&em_tree->lock); 5009 read_lock(&em_tree->lock);
@@ -4741,7 +5062,7 @@ again:
4741 5062
4742 found_type = btrfs_file_extent_type(leaf, item); 5063 found_type = btrfs_file_extent_type(leaf, item);
4743 extent_start = found_key.offset; 5064 extent_start = found_key.offset;
4744 compressed = btrfs_file_extent_compression(leaf, item); 5065 compress_type = btrfs_file_extent_compression(leaf, item);
4745 if (found_type == BTRFS_FILE_EXTENT_REG || 5066 if (found_type == BTRFS_FILE_EXTENT_REG ||
4746 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5067 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
4747 extent_end = extent_start + 5068 extent_end = extent_start +
@@ -4787,8 +5108,9 @@ again:
4787 em->block_start = EXTENT_MAP_HOLE; 5108 em->block_start = EXTENT_MAP_HOLE;
4788 goto insert; 5109 goto insert;
4789 } 5110 }
4790 if (compressed) { 5111 if (compress_type != BTRFS_COMPRESS_NONE) {
4791 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5112 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5113 em->compress_type = compress_type;
4792 em->block_start = bytenr; 5114 em->block_start = bytenr;
4793 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5115 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
4794 item); 5116 item);
@@ -4822,12 +5144,14 @@ again:
4822 em->len = (copy_size + root->sectorsize - 1) & 5144 em->len = (copy_size + root->sectorsize - 1) &
4823 ~((u64)root->sectorsize - 1); 5145 ~((u64)root->sectorsize - 1);
4824 em->orig_start = EXTENT_MAP_INLINE; 5146 em->orig_start = EXTENT_MAP_INLINE;
4825 if (compressed) 5147 if (compress_type) {
4826 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5148 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5149 em->compress_type = compress_type;
5150 }
4827 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5151 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
4828 if (create == 0 && !PageUptodate(page)) { 5152 if (create == 0 && !PageUptodate(page)) {
4829 if (btrfs_file_extent_compression(leaf, item) == 5153 if (btrfs_file_extent_compression(leaf, item) !=
4830 BTRFS_COMPRESS_ZLIB) { 5154 BTRFS_COMPRESS_NONE) {
4831 ret = uncompress_inline(path, inode, page, 5155 ret = uncompress_inline(path, inode, page,
4832 pg_offset, 5156 pg_offset,
4833 extent_offset, item); 5157 extent_offset, item);
@@ -4845,6 +5169,7 @@ again:
4845 } 5169 }
4846 flush_dcache_page(page); 5170 flush_dcache_page(page);
4847 } else if (create && PageUptodate(page)) { 5171 } else if (create && PageUptodate(page)) {
5172 WARN_ON(1);
4848 if (!trans) { 5173 if (!trans) {
4849 kunmap(page); 5174 kunmap(page);
4850 free_extent_map(em); 5175 free_extent_map(em);
@@ -4941,11 +5266,823 @@ out:
4941 return em; 5266 return em;
4942} 5267}
4943 5268
5269static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5270 u64 start, u64 len)
5271{
5272 struct btrfs_root *root = BTRFS_I(inode)->root;
5273 struct btrfs_trans_handle *trans;
5274 struct extent_map *em;
5275 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5276 struct btrfs_key ins;
5277 u64 alloc_hint;
5278 int ret;
5279
5280 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5281
5282 trans = btrfs_join_transaction(root, 0);
5283 if (!trans)
5284 return ERR_PTR(-ENOMEM);
5285
5286 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5287
5288 alloc_hint = get_extent_allocation_hint(inode, start, len);
5289 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5290 alloc_hint, (u64)-1, &ins, 1);
5291 if (ret) {
5292 em = ERR_PTR(ret);
5293 goto out;
5294 }
5295
5296 em = alloc_extent_map(GFP_NOFS);
5297 if (!em) {
5298 em = ERR_PTR(-ENOMEM);
5299 goto out;
5300 }
5301
5302 em->start = start;
5303 em->orig_start = em->start;
5304 em->len = ins.offset;
5305
5306 em->block_start = ins.objectid;
5307 em->block_len = ins.offset;
5308 em->bdev = root->fs_info->fs_devices->latest_bdev;
5309 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5310
5311 while (1) {
5312 write_lock(&em_tree->lock);
5313 ret = add_extent_mapping(em_tree, em);
5314 write_unlock(&em_tree->lock);
5315 if (ret != -EEXIST)
5316 break;
5317 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5318 }
5319
5320 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5321 ins.offset, ins.offset, 0);
5322 if (ret) {
5323 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5324 em = ERR_PTR(ret);
5325 }
5326out:
5327 btrfs_end_transaction(trans, root);
5328 return em;
5329}
5330
5331/*
5332 * returns 1 when the nocow is safe, < 1 on error, 0 if the
5333 * block must be cow'd
5334 */
5335static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5336 struct inode *inode, u64 offset, u64 len)
5337{
5338 struct btrfs_path *path;
5339 int ret;
5340 struct extent_buffer *leaf;
5341 struct btrfs_root *root = BTRFS_I(inode)->root;
5342 struct btrfs_file_extent_item *fi;
5343 struct btrfs_key key;
5344 u64 disk_bytenr;
5345 u64 backref_offset;
5346 u64 extent_end;
5347 u64 num_bytes;
5348 int slot;
5349 int found_type;
5350
5351 path = btrfs_alloc_path();
5352 if (!path)
5353 return -ENOMEM;
5354
5355 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
5356 offset, 0);
5357 if (ret < 0)
5358 goto out;
5359
5360 slot = path->slots[0];
5361 if (ret == 1) {
5362 if (slot == 0) {
5363 /* can't find the item, must cow */
5364 ret = 0;
5365 goto out;
5366 }
5367 slot--;
5368 }
5369 ret = 0;
5370 leaf = path->nodes[0];
5371 btrfs_item_key_to_cpu(leaf, &key, slot);
5372 if (key.objectid != inode->i_ino ||
5373 key.type != BTRFS_EXTENT_DATA_KEY) {
5374 /* not our file or wrong item type, must cow */
5375 goto out;
5376 }
5377
5378 if (key.offset > offset) {
5379 /* Wrong offset, must cow */
5380 goto out;
5381 }
5382
5383 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5384 found_type = btrfs_file_extent_type(leaf, fi);
5385 if (found_type != BTRFS_FILE_EXTENT_REG &&
5386 found_type != BTRFS_FILE_EXTENT_PREALLOC) {
5387 /* not a regular extent, must cow */
5388 goto out;
5389 }
5390 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5391 backref_offset = btrfs_file_extent_offset(leaf, fi);
5392
5393 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
5394 if (extent_end < offset + len) {
5395 /* extent doesn't include our full range, must cow */
5396 goto out;
5397 }
5398
5399 if (btrfs_extent_readonly(root, disk_bytenr))
5400 goto out;
5401
5402 /*
5403 * look for other files referencing this extent, if we
5404 * find any we must cow
5405 */
5406 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
5407 key.offset - backref_offset, disk_bytenr))
5408 goto out;
5409
5410 /*
5411 * adjust disk_bytenr and num_bytes to cover just the bytes
5412 * in this extent we are about to write. If there
5413 * are any csums in that range we have to cow in order
5414 * to keep the csums correct
5415 */
5416 disk_bytenr += backref_offset;
5417 disk_bytenr += offset - key.offset;
5418 num_bytes = min(offset + len, extent_end) - offset;
5419 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
5420 goto out;
5421 /*
5422 * all of the above have passed, it is safe to overwrite this extent
5423 * without cow
5424 */
5425 ret = 1;
5426out:
5427 btrfs_free_path(path);
5428 return ret;
5429}
5430
5431static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5432 struct buffer_head *bh_result, int create)
5433{
5434 struct extent_map *em;
5435 struct btrfs_root *root = BTRFS_I(inode)->root;
5436 u64 start = iblock << inode->i_blkbits;
5437 u64 len = bh_result->b_size;
5438 struct btrfs_trans_handle *trans;
5439
5440 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5441 if (IS_ERR(em))
5442 return PTR_ERR(em);
5443
5444 /*
5445 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
5446 * io. INLINE is special, and we could probably kludge it in here, but
5447 * it's still buffered so for safety lets just fall back to the generic
5448 * buffered path.
5449 *
5450 * For COMPRESSED we _have_ to read the entire extent in so we can
5451 * decompress it, so there will be buffering required no matter what we
5452 * do, so go ahead and fallback to buffered.
5453 *
5454 * We return -ENOTBLK because thats what makes DIO go ahead and go back
5455 * to buffered IO. Don't blame me, this is the price we pay for using
5456 * the generic code.
5457 */
5458 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5459 em->block_start == EXTENT_MAP_INLINE) {
5460 free_extent_map(em);
5461 return -ENOTBLK;
5462 }
5463
5464 /* Just a good old fashioned hole, return */
5465 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5466 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5467 free_extent_map(em);
5468 /* DIO will do one hole at a time, so just unlock a sector */
5469 unlock_extent(&BTRFS_I(inode)->io_tree, start,
5470 start + root->sectorsize - 1, GFP_NOFS);
5471 return 0;
5472 }
5473
5474 /*
5475 * We don't allocate a new extent in the following cases
5476 *
5477 * 1) The inode is marked as NODATACOW. In this case we'll just use the
5478 * existing extent.
5479 * 2) The extent is marked as PREALLOC. We're good to go here and can
5480 * just use the extent.
5481 *
5482 */
5483 if (!create) {
5484 len = em->len - (start - em->start);
5485 goto map;
5486 }
5487
5488 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
5489 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
5490 em->block_start != EXTENT_MAP_HOLE)) {
5491 int type;
5492 int ret;
5493 u64 block_start;
5494
5495 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5496 type = BTRFS_ORDERED_PREALLOC;
5497 else
5498 type = BTRFS_ORDERED_NOCOW;
5499 len = min(len, em->len - (start - em->start));
5500 block_start = em->block_start + (start - em->start);
5501
5502 /*
5503 * we're not going to log anything, but we do need
5504 * to make sure the current transaction stays open
5505 * while we look for nocow cross refs
5506 */
5507 trans = btrfs_join_transaction(root, 0);
5508 if (!trans)
5509 goto must_cow;
5510
5511 if (can_nocow_odirect(trans, inode, start, len) == 1) {
5512 ret = btrfs_add_ordered_extent_dio(inode, start,
5513 block_start, len, len, type);
5514 btrfs_end_transaction(trans, root);
5515 if (ret) {
5516 free_extent_map(em);
5517 return ret;
5518 }
5519 goto unlock;
5520 }
5521 btrfs_end_transaction(trans, root);
5522 }
5523must_cow:
5524 /*
5525 * this will cow the extent, reset the len in case we changed
5526 * it above
5527 */
5528 len = bh_result->b_size;
5529 free_extent_map(em);
5530 em = btrfs_new_extent_direct(inode, start, len);
5531 if (IS_ERR(em))
5532 return PTR_ERR(em);
5533 len = min(len, em->len - (start - em->start));
5534unlock:
5535 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5536 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
5537 0, NULL, GFP_NOFS);
5538map:
5539 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5540 inode->i_blkbits;
5541 bh_result->b_size = len;
5542 bh_result->b_bdev = em->bdev;
5543 set_buffer_mapped(bh_result);
5544 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5545 set_buffer_new(bh_result);
5546
5547 free_extent_map(em);
5548
5549 return 0;
5550}
5551
5552struct btrfs_dio_private {
5553 struct inode *inode;
5554 u64 logical_offset;
5555 u64 disk_bytenr;
5556 u64 bytes;
5557 u32 *csums;
5558 void *private;
5559
5560 /* number of bios pending for this dio */
5561 atomic_t pending_bios;
5562
5563 /* IO errors */
5564 int errors;
5565
5566 struct bio *orig_bio;
5567};
5568
5569static void btrfs_endio_direct_read(struct bio *bio, int err)
5570{
5571 struct btrfs_dio_private *dip = bio->bi_private;
5572 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5573 struct bio_vec *bvec = bio->bi_io_vec;
5574 struct inode *inode = dip->inode;
5575 struct btrfs_root *root = BTRFS_I(inode)->root;
5576 u64 start;
5577 u32 *private = dip->csums;
5578
5579 start = dip->logical_offset;
5580 do {
5581 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
5582 struct page *page = bvec->bv_page;
5583 char *kaddr;
5584 u32 csum = ~(u32)0;
5585 unsigned long flags;
5586
5587 local_irq_save(flags);
5588 kaddr = kmap_atomic(page, KM_IRQ0);
5589 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
5590 csum, bvec->bv_len);
5591 btrfs_csum_final(csum, (char *)&csum);
5592 kunmap_atomic(kaddr, KM_IRQ0);
5593 local_irq_restore(flags);
5594
5595 flush_dcache_page(bvec->bv_page);
5596 if (csum != *private) {
5597 printk(KERN_ERR "btrfs csum failed ino %lu off"
5598 " %llu csum %u private %u\n",
5599 inode->i_ino, (unsigned long long)start,
5600 csum, *private);
5601 err = -EIO;
5602 }
5603 }
5604
5605 start += bvec->bv_len;
5606 private++;
5607 bvec++;
5608 } while (bvec <= bvec_end);
5609
5610 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
5611 dip->logical_offset + dip->bytes - 1, GFP_NOFS);
5612 bio->bi_private = dip->private;
5613
5614 kfree(dip->csums);
5615 kfree(dip);
5616 dio_end_io(bio, err);
5617}
5618
5619static void btrfs_endio_direct_write(struct bio *bio, int err)
5620{
5621 struct btrfs_dio_private *dip = bio->bi_private;
5622 struct inode *inode = dip->inode;
5623 struct btrfs_root *root = BTRFS_I(inode)->root;
5624 struct btrfs_trans_handle *trans;
5625 struct btrfs_ordered_extent *ordered = NULL;
5626 struct extent_state *cached_state = NULL;
5627 u64 ordered_offset = dip->logical_offset;
5628 u64 ordered_bytes = dip->bytes;
5629 int ret;
5630
5631 if (err)
5632 goto out_done;
5633again:
5634 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
5635 &ordered_offset,
5636 ordered_bytes);
5637 if (!ret)
5638 goto out_test;
5639
5640 BUG_ON(!ordered);
5641
5642 trans = btrfs_join_transaction(root, 1);
5643 if (!trans) {
5644 err = -ENOMEM;
5645 goto out;
5646 }
5647 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5648
5649 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5650 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5651 if (!ret)
5652 ret = btrfs_update_inode(trans, root, inode);
5653 err = ret;
5654 goto out;
5655 }
5656
5657 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5658 ordered->file_offset + ordered->len - 1, 0,
5659 &cached_state, GFP_NOFS);
5660
5661 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5662 ret = btrfs_mark_extent_written(trans, inode,
5663 ordered->file_offset,
5664 ordered->file_offset +
5665 ordered->len);
5666 if (ret) {
5667 err = ret;
5668 goto out_unlock;
5669 }
5670 } else {
5671 ret = insert_reserved_file_extent(trans, inode,
5672 ordered->file_offset,
5673 ordered->start,
5674 ordered->disk_len,
5675 ordered->len,
5676 ordered->len,
5677 0, 0, 0,
5678 BTRFS_FILE_EXTENT_REG);
5679 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5680 ordered->file_offset, ordered->len);
5681 if (ret) {
5682 err = ret;
5683 WARN_ON(1);
5684 goto out_unlock;
5685 }
5686 }
5687
5688 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5689 btrfs_ordered_update_i_size(inode, 0, ordered);
5690 btrfs_update_inode(trans, root, inode);
5691out_unlock:
5692 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5693 ordered->file_offset + ordered->len - 1,
5694 &cached_state, GFP_NOFS);
5695out:
5696 btrfs_delalloc_release_metadata(inode, ordered->len);
5697 btrfs_end_transaction(trans, root);
5698 ordered_offset = ordered->file_offset + ordered->len;
5699 btrfs_put_ordered_extent(ordered);
5700 btrfs_put_ordered_extent(ordered);
5701
5702out_test:
5703 /*
5704 * our bio might span multiple ordered extents. If we haven't
5705 * completed the accounting for the whole dio, go back and try again
5706 */
5707 if (ordered_offset < dip->logical_offset + dip->bytes) {
5708 ordered_bytes = dip->logical_offset + dip->bytes -
5709 ordered_offset;
5710 goto again;
5711 }
5712out_done:
5713 bio->bi_private = dip->private;
5714
5715 kfree(dip->csums);
5716 kfree(dip);
5717 dio_end_io(bio, err);
5718}
5719
5720static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5721 struct bio *bio, int mirror_num,
5722 unsigned long bio_flags, u64 offset)
5723{
5724 int ret;
5725 struct btrfs_root *root = BTRFS_I(inode)->root;
5726 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
5727 BUG_ON(ret);
5728 return 0;
5729}
5730
5731static void btrfs_end_dio_bio(struct bio *bio, int err)
5732{
5733 struct btrfs_dio_private *dip = bio->bi_private;
5734
5735 if (err) {
5736 printk(KERN_ERR "btrfs direct IO failed ino %lu rw %lu "
5737 "sector %#Lx len %u err no %d\n",
5738 dip->inode->i_ino, bio->bi_rw,
5739 (unsigned long long)bio->bi_sector, bio->bi_size, err);
5740 dip->errors = 1;
5741
5742 /*
5743 * before atomic variable goto zero, we must make sure
5744 * dip->errors is perceived to be set.
5745 */
5746 smp_mb__before_atomic_dec();
5747 }
5748
5749 /* if there are more bios still pending for this dio, just exit */
5750 if (!atomic_dec_and_test(&dip->pending_bios))
5751 goto out;
5752
5753 if (dip->errors)
5754 bio_io_error(dip->orig_bio);
5755 else {
5756 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
5757 bio_endio(dip->orig_bio, 0);
5758 }
5759out:
5760 bio_put(bio);
5761}
5762
5763static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5764 u64 first_sector, gfp_t gfp_flags)
5765{
5766 int nr_vecs = bio_get_nr_vecs(bdev);
5767 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
5768}
5769
5770static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5771 int rw, u64 file_offset, int skip_sum,
5772 u32 *csums)
5773{
5774 int write = rw & REQ_WRITE;
5775 struct btrfs_root *root = BTRFS_I(inode)->root;
5776 int ret;
5777
5778 bio_get(bio);
5779 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5780 if (ret)
5781 goto err;
5782
5783 if (write && !skip_sum) {
5784 ret = btrfs_wq_submit_bio(root->fs_info,
5785 inode, rw, bio, 0, 0,
5786 file_offset,
5787 __btrfs_submit_bio_start_direct_io,
5788 __btrfs_submit_bio_done);
5789 goto err;
5790 } else if (!skip_sum)
5791 btrfs_lookup_bio_sums_dio(root, inode, bio,
5792 file_offset, csums);
5793
5794 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5795err:
5796 bio_put(bio);
5797 return ret;
5798}
5799
5800static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5801 int skip_sum)
5802{
5803 struct inode *inode = dip->inode;
5804 struct btrfs_root *root = BTRFS_I(inode)->root;
5805 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5806 struct bio *bio;
5807 struct bio *orig_bio = dip->orig_bio;
5808 struct bio_vec *bvec = orig_bio->bi_io_vec;
5809 u64 start_sector = orig_bio->bi_sector;
5810 u64 file_offset = dip->logical_offset;
5811 u64 submit_len = 0;
5812 u64 map_length;
5813 int nr_pages = 0;
5814 u32 *csums = dip->csums;
5815 int ret = 0;
5816
5817 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
5818 if (!bio)
5819 return -ENOMEM;
5820 bio->bi_private = dip;
5821 bio->bi_end_io = btrfs_end_dio_bio;
5822 atomic_inc(&dip->pending_bios);
5823
5824 map_length = orig_bio->bi_size;
5825 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5826 &map_length, NULL, 0);
5827 if (ret) {
5828 bio_put(bio);
5829 return -EIO;
5830 }
5831
5832 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
5833 if (unlikely(map_length < submit_len + bvec->bv_len ||
5834 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5835 bvec->bv_offset) < bvec->bv_len)) {
5836 /*
5837 * inc the count before we submit the bio so
5838 * we know the end IO handler won't happen before
5839 * we inc the count. Otherwise, the dip might get freed
5840 * before we're done setting it up
5841 */
5842 atomic_inc(&dip->pending_bios);
5843 ret = __btrfs_submit_dio_bio(bio, inode, rw,
5844 file_offset, skip_sum,
5845 csums);
5846 if (ret) {
5847 bio_put(bio);
5848 atomic_dec(&dip->pending_bios);
5849 goto out_err;
5850 }
5851
5852 if (!skip_sum)
5853 csums = csums + nr_pages;
5854 start_sector += submit_len >> 9;
5855 file_offset += submit_len;
5856
5857 submit_len = 0;
5858 nr_pages = 0;
5859
5860 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
5861 start_sector, GFP_NOFS);
5862 if (!bio)
5863 goto out_err;
5864 bio->bi_private = dip;
5865 bio->bi_end_io = btrfs_end_dio_bio;
5866
5867 map_length = orig_bio->bi_size;
5868 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5869 &map_length, NULL, 0);
5870 if (ret) {
5871 bio_put(bio);
5872 goto out_err;
5873 }
5874 } else {
5875 submit_len += bvec->bv_len;
5876 nr_pages ++;
5877 bvec++;
5878 }
5879 }
5880
5881 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
5882 csums);
5883 if (!ret)
5884 return 0;
5885
5886 bio_put(bio);
5887out_err:
5888 dip->errors = 1;
5889 /*
5890 * before atomic variable goto zero, we must
5891 * make sure dip->errors is perceived to be set.
5892 */
5893 smp_mb__before_atomic_dec();
5894 if (atomic_dec_and_test(&dip->pending_bios))
5895 bio_io_error(dip->orig_bio);
5896
5897 /* bio_end_io() will handle error, so we needn't return it */
5898 return 0;
5899}
5900
5901static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5902 loff_t file_offset)
5903{
5904 struct btrfs_root *root = BTRFS_I(inode)->root;
5905 struct btrfs_dio_private *dip;
5906 struct bio_vec *bvec = bio->bi_io_vec;
5907 int skip_sum;
5908 int write = rw & REQ_WRITE;
5909 int ret = 0;
5910
5911 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
5912
5913 dip = kmalloc(sizeof(*dip), GFP_NOFS);
5914 if (!dip) {
5915 ret = -ENOMEM;
5916 goto free_ordered;
5917 }
5918 dip->csums = NULL;
5919
5920 if (!skip_sum) {
5921 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5922 if (!dip->csums) {
5923 ret = -ENOMEM;
5924 goto free_ordered;
5925 }
5926 }
5927
5928 dip->private = bio->bi_private;
5929 dip->inode = inode;
5930 dip->logical_offset = file_offset;
5931
5932 dip->bytes = 0;
5933 do {
5934 dip->bytes += bvec->bv_len;
5935 bvec++;
5936 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
5937
5938 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5939 bio->bi_private = dip;
5940 dip->errors = 0;
5941 dip->orig_bio = bio;
5942 atomic_set(&dip->pending_bios, 0);
5943
5944 if (write)
5945 bio->bi_end_io = btrfs_endio_direct_write;
5946 else
5947 bio->bi_end_io = btrfs_endio_direct_read;
5948
5949 ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
5950 if (!ret)
5951 return;
5952free_ordered:
5953 /*
5954 * If this is a write, we need to clean up the reserved space and kill
5955 * the ordered extent.
5956 */
5957 if (write) {
5958 struct btrfs_ordered_extent *ordered;
5959 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
5960 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5961 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5962 btrfs_free_reserved_extent(root, ordered->start,
5963 ordered->disk_len);
5964 btrfs_put_ordered_extent(ordered);
5965 btrfs_put_ordered_extent(ordered);
5966 }
5967 bio_endio(bio, ret);
5968}
5969
5970static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
5971 const struct iovec *iov, loff_t offset,
5972 unsigned long nr_segs)
5973{
5974 int seg;
5975 size_t size;
5976 unsigned long addr;
5977 unsigned blocksize_mask = root->sectorsize - 1;
5978 ssize_t retval = -EINVAL;
5979 loff_t end = offset;
5980
5981 if (offset & blocksize_mask)
5982 goto out;
5983
5984 /* Check the memory alignment. Blocks cannot straddle pages */
5985 for (seg = 0; seg < nr_segs; seg++) {
5986 addr = (unsigned long)iov[seg].iov_base;
5987 size = iov[seg].iov_len;
5988 end += size;
5989 if ((addr & blocksize_mask) || (size & blocksize_mask))
5990 goto out;
5991 }
5992 retval = 0;
5993out:
5994 return retval;
5995}
4944static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 5996static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4945 const struct iovec *iov, loff_t offset, 5997 const struct iovec *iov, loff_t offset,
4946 unsigned long nr_segs) 5998 unsigned long nr_segs)
4947{ 5999{
4948 return -EINVAL; 6000 struct file *file = iocb->ki_filp;
6001 struct inode *inode = file->f_mapping->host;
6002 struct btrfs_ordered_extent *ordered;
6003 struct extent_state *cached_state = NULL;
6004 u64 lockstart, lockend;
6005 ssize_t ret;
6006 int writing = rw & WRITE;
6007 int write_bits = 0;
6008 size_t count = iov_length(iov, nr_segs);
6009
6010 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
6011 offset, nr_segs)) {
6012 return 0;
6013 }
6014
6015 lockstart = offset;
6016 lockend = offset + count - 1;
6017
6018 if (writing) {
6019 ret = btrfs_delalloc_reserve_space(inode, count);
6020 if (ret)
6021 goto out;
6022 }
6023
6024 while (1) {
6025 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6026 0, &cached_state, GFP_NOFS);
6027 /*
6028 * We're concerned with the entire range that we're going to be
6029 * doing DIO to, so we need to make sure theres no ordered
6030 * extents in this range.
6031 */
6032 ordered = btrfs_lookup_ordered_range(inode, lockstart,
6033 lockend - lockstart + 1);
6034 if (!ordered)
6035 break;
6036 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6037 &cached_state, GFP_NOFS);
6038 btrfs_start_ordered_extent(inode, ordered, 1);
6039 btrfs_put_ordered_extent(ordered);
6040 cond_resched();
6041 }
6042
6043 /*
6044 * we don't use btrfs_set_extent_delalloc because we don't want
6045 * the dirty or uptodate bits
6046 */
6047 if (writing) {
6048 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
6049 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6050 EXTENT_DELALLOC, 0, NULL, &cached_state,
6051 GFP_NOFS);
6052 if (ret) {
6053 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6054 lockend, EXTENT_LOCKED | write_bits,
6055 1, 0, &cached_state, GFP_NOFS);
6056 goto out;
6057 }
6058 }
6059
6060 free_extent_state(cached_state);
6061 cached_state = NULL;
6062
6063 ret = __blockdev_direct_IO(rw, iocb, inode,
6064 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
6065 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
6066 btrfs_submit_direct, 0);
6067
6068 if (ret < 0 && ret != -EIOCBQUEUED) {
6069 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
6070 offset + iov_length(iov, nr_segs) - 1,
6071 EXTENT_LOCKED | write_bits, 1, 0,
6072 &cached_state, GFP_NOFS);
6073 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
6074 /*
6075 * We're falling back to buffered, unlock the section we didn't
6076 * do IO on.
6077 */
6078 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
6079 offset + iov_length(iov, nr_segs) - 1,
6080 EXTENT_LOCKED | write_bits, 1, 0,
6081 &cached_state, GFP_NOFS);
6082 }
6083out:
6084 free_extent_state(cached_state);
6085 return ret;
4949} 6086}
4950 6087
4951static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6088static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5021,6 +6158,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5021{ 6158{
5022 struct extent_io_tree *tree; 6159 struct extent_io_tree *tree;
5023 struct btrfs_ordered_extent *ordered; 6160 struct btrfs_ordered_extent *ordered;
6161 struct extent_state *cached_state = NULL;
5024 u64 page_start = page_offset(page); 6162 u64 page_start = page_offset(page);
5025 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 6163 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
5026 6164
@@ -5039,7 +6177,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5039 btrfs_releasepage(page, GFP_NOFS); 6177 btrfs_releasepage(page, GFP_NOFS);
5040 return; 6178 return;
5041 } 6179 }
5042 lock_extent(tree, page_start, page_end, GFP_NOFS); 6180 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
6181 GFP_NOFS);
5043 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 6182 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
5044 page_offset(page)); 6183 page_offset(page));
5045 if (ordered) { 6184 if (ordered) {
@@ -5050,7 +6189,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5050 clear_extent_bit(tree, page_start, page_end, 6189 clear_extent_bit(tree, page_start, page_end,
5051 EXTENT_DIRTY | EXTENT_DELALLOC | 6190 EXTENT_DIRTY | EXTENT_DELALLOC |
5052 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 6191 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
5053 NULL, GFP_NOFS); 6192 &cached_state, GFP_NOFS);
5054 /* 6193 /*
5055 * whoever cleared the private bit is responsible 6194 * whoever cleared the private bit is responsible
5056 * for the finish_ordered_io 6195 * for the finish_ordered_io
@@ -5060,11 +6199,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5060 page_start, page_end); 6199 page_start, page_end);
5061 } 6200 }
5062 btrfs_put_ordered_extent(ordered); 6201 btrfs_put_ordered_extent(ordered);
5063 lock_extent(tree, page_start, page_end, GFP_NOFS); 6202 cached_state = NULL;
6203 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
6204 GFP_NOFS);
5064 } 6205 }
5065 clear_extent_bit(tree, page_start, page_end, 6206 clear_extent_bit(tree, page_start, page_end,
5066 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 6207 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
5067 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); 6208 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
5068 __btrfs_releasepage(page, GFP_NOFS); 6209 __btrfs_releasepage(page, GFP_NOFS);
5069 6210
5070 ClearPageChecked(page); 6211 ClearPageChecked(page);
@@ -5097,6 +6238,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5097 struct btrfs_root *root = BTRFS_I(inode)->root; 6238 struct btrfs_root *root = BTRFS_I(inode)->root;
5098 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 6239 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5099 struct btrfs_ordered_extent *ordered; 6240 struct btrfs_ordered_extent *ordered;
6241 struct extent_state *cached_state = NULL;
5100 char *kaddr; 6242 char *kaddr;
5101 unsigned long zero_start; 6243 unsigned long zero_start;
5102 loff_t size; 6244 loff_t size;
@@ -5104,7 +6246,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5104 u64 page_start; 6246 u64 page_start;
5105 u64 page_end; 6247 u64 page_end;
5106 6248
5107 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 6249 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
5108 if (ret) { 6250 if (ret) {
5109 if (ret == -ENOMEM) 6251 if (ret == -ENOMEM)
5110 ret = VM_FAULT_OOM; 6252 ret = VM_FAULT_OOM;
@@ -5113,13 +6255,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5113 goto out; 6255 goto out;
5114 } 6256 }
5115 6257
5116 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
5117 if (ret) {
5118 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5119 ret = VM_FAULT_SIGBUS;
5120 goto out;
5121 }
5122
5123 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6258 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
5124again: 6259again:
5125 lock_page(page); 6260 lock_page(page);
@@ -5129,13 +6264,13 @@ again:
5129 6264
5130 if ((page->mapping != inode->i_mapping) || 6265 if ((page->mapping != inode->i_mapping) ||
5131 (page_start >= size)) { 6266 (page_start >= size)) {
5132 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5133 /* page got truncated out from underneath us */ 6267 /* page got truncated out from underneath us */
5134 goto out_unlock; 6268 goto out_unlock;
5135 } 6269 }
5136 wait_on_page_writeback(page); 6270 wait_on_page_writeback(page);
5137 6271
5138 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 6272 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
6273 GFP_NOFS);
5139 set_page_extent_mapped(page); 6274 set_page_extent_mapped(page);
5140 6275
5141 /* 6276 /*
@@ -5144,7 +6279,8 @@ again:
5144 */ 6279 */
5145 ordered = btrfs_lookup_ordered_extent(inode, page_start); 6280 ordered = btrfs_lookup_ordered_extent(inode, page_start);
5146 if (ordered) { 6281 if (ordered) {
5147 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 6282 unlock_extent_cached(io_tree, page_start, page_end,
6283 &cached_state, GFP_NOFS);
5148 unlock_page(page); 6284 unlock_page(page);
5149 btrfs_start_ordered_extent(inode, ordered, 1); 6285 btrfs_start_ordered_extent(inode, ordered, 1);
5150 btrfs_put_ordered_extent(ordered); 6286 btrfs_put_ordered_extent(ordered);
@@ -5158,15 +6294,16 @@ again:
5158 * is probably a better way to do this, but for now keep consistent with 6294 * is probably a better way to do this, but for now keep consistent with
5159 * prepare_pages in the normal write path. 6295 * prepare_pages in the normal write path.
5160 */ 6296 */
5161 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 6297 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
5162 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 6298 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
5163 GFP_NOFS); 6299 0, 0, &cached_state, GFP_NOFS);
5164 6300
5165 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 6301 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
6302 &cached_state);
5166 if (ret) { 6303 if (ret) {
5167 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 6304 unlock_extent_cached(io_tree, page_start, page_end,
6305 &cached_state, GFP_NOFS);
5168 ret = VM_FAULT_SIGBUS; 6306 ret = VM_FAULT_SIGBUS;
5169 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5170 goto out_unlock; 6307 goto out_unlock;
5171 } 6308 }
5172 ret = 0; 6309 ret = 0;
@@ -5190,13 +6327,13 @@ again:
5190 BTRFS_I(inode)->last_trans = root->fs_info->generation; 6327 BTRFS_I(inode)->last_trans = root->fs_info->generation;
5191 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 6328 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
5192 6329
5193 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 6330 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5194 6331
5195out_unlock: 6332out_unlock:
5196 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
5197 if (!ret) 6333 if (!ret)
5198 return VM_FAULT_LOCKED; 6334 return VM_FAULT_LOCKED;
5199 unlock_page(page); 6335 unlock_page(page);
6336 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
5200out: 6337out:
5201 return ret; 6338 return ret;
5202} 6339}
@@ -5221,8 +6358,10 @@ static void btrfs_truncate(struct inode *inode)
5221 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6358 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
5222 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6359 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
5223 6360
5224 trans = btrfs_start_transaction(root, 1); 6361 trans = btrfs_start_transaction(root, 0);
6362 BUG_ON(IS_ERR(trans));
5225 btrfs_set_trans_block_group(trans, inode); 6363 btrfs_set_trans_block_group(trans, inode);
6364 trans->block_rsv = root->orphan_block_rsv;
5226 6365
5227 /* 6366 /*
5228 * setattr is responsible for setting the ordered_data_close flag, 6367 * setattr is responsible for setting the ordered_data_close flag,
@@ -5245,6 +6384,23 @@ static void btrfs_truncate(struct inode *inode)
5245 btrfs_add_ordered_operation(trans, root, inode); 6384 btrfs_add_ordered_operation(trans, root, inode);
5246 6385
5247 while (1) { 6386 while (1) {
6387 if (!trans) {
6388 trans = btrfs_start_transaction(root, 0);
6389 BUG_ON(IS_ERR(trans));
6390 btrfs_set_trans_block_group(trans, inode);
6391 trans->block_rsv = root->orphan_block_rsv;
6392 }
6393
6394 ret = btrfs_block_rsv_check(trans, root,
6395 root->orphan_block_rsv, 0, 5);
6396 if (ret) {
6397 BUG_ON(ret != -EAGAIN);
6398 ret = btrfs_commit_transaction(trans, root);
6399 BUG_ON(ret);
6400 trans = NULL;
6401 continue;
6402 }
6403
5248 ret = btrfs_truncate_inode_items(trans, root, inode, 6404 ret = btrfs_truncate_inode_items(trans, root, inode,
5249 inode->i_size, 6405 inode->i_size,
5250 BTRFS_EXTENT_DATA_KEY); 6406 BTRFS_EXTENT_DATA_KEY);
@@ -5256,10 +6412,8 @@ static void btrfs_truncate(struct inode *inode)
5256 6412
5257 nr = trans->blocks_used; 6413 nr = trans->blocks_used;
5258 btrfs_end_transaction(trans, root); 6414 btrfs_end_transaction(trans, root);
6415 trans = NULL;
5259 btrfs_btree_balance_dirty(root, nr); 6416 btrfs_btree_balance_dirty(root, nr);
5260
5261 trans = btrfs_start_transaction(root, 1);
5262 btrfs_set_trans_block_group(trans, inode);
5263 } 6417 }
5264 6418
5265 if (ret == 0 && inode->i_nlink > 0) { 6419 if (ret == 0 && inode->i_nlink > 0) {
@@ -5320,21 +6474,54 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
5320struct inode *btrfs_alloc_inode(struct super_block *sb) 6474struct inode *btrfs_alloc_inode(struct super_block *sb)
5321{ 6475{
5322 struct btrfs_inode *ei; 6476 struct btrfs_inode *ei;
6477 struct inode *inode;
5323 6478
5324 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 6479 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
5325 if (!ei) 6480 if (!ei)
5326 return NULL; 6481 return NULL;
6482
6483 ei->root = NULL;
6484 ei->space_info = NULL;
6485 ei->generation = 0;
6486 ei->sequence = 0;
5327 ei->last_trans = 0; 6487 ei->last_trans = 0;
5328 ei->last_sub_trans = 0; 6488 ei->last_sub_trans = 0;
5329 ei->logged_trans = 0; 6489 ei->logged_trans = 0;
5330 ei->outstanding_extents = 0; 6490 ei->delalloc_bytes = 0;
5331 ei->reserved_extents = 0; 6491 ei->reserved_bytes = 0;
5332 ei->root = NULL; 6492 ei->disk_i_size = 0;
6493 ei->flags = 0;
6494 ei->index_cnt = (u64)-1;
6495 ei->last_unlink_trans = 0;
6496
5333 spin_lock_init(&ei->accounting_lock); 6497 spin_lock_init(&ei->accounting_lock);
6498 atomic_set(&ei->outstanding_extents, 0);
6499 ei->reserved_extents = 0;
6500
6501 ei->ordered_data_close = 0;
6502 ei->orphan_meta_reserved = 0;
6503 ei->dummy_inode = 0;
6504 ei->force_compress = BTRFS_COMPRESS_NONE;
6505
6506 inode = &ei->vfs_inode;
6507 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
6508 extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
6509 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
6510 mutex_init(&ei->log_mutex);
5334 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6511 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
5335 INIT_LIST_HEAD(&ei->i_orphan); 6512 INIT_LIST_HEAD(&ei->i_orphan);
6513 INIT_LIST_HEAD(&ei->delalloc_inodes);
5336 INIT_LIST_HEAD(&ei->ordered_operations); 6514 INIT_LIST_HEAD(&ei->ordered_operations);
5337 return &ei->vfs_inode; 6515 RB_CLEAR_NODE(&ei->rb_node);
6516
6517 return inode;
6518}
6519
6520static void btrfs_i_callback(struct rcu_head *head)
6521{
6522 struct inode *inode = container_of(head, struct inode, i_rcu);
6523 INIT_LIST_HEAD(&inode->i_dentry);
6524 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
5338} 6525}
5339 6526
5340void btrfs_destroy_inode(struct inode *inode) 6527void btrfs_destroy_inode(struct inode *inode)
@@ -5344,6 +6531,8 @@ void btrfs_destroy_inode(struct inode *inode)
5344 6531
5345 WARN_ON(!list_empty(&inode->i_dentry)); 6532 WARN_ON(!list_empty(&inode->i_dentry));
5346 WARN_ON(inode->i_data.nrpages); 6533 WARN_ON(inode->i_data.nrpages);
6534 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6535 WARN_ON(BTRFS_I(inode)->reserved_extents);
5347 6536
5348 /* 6537 /*
5349 * This can happen where we create an inode, but somebody else also 6538 * This can happen where we create an inode, but somebody else also
@@ -5364,13 +6553,28 @@ void btrfs_destroy_inode(struct inode *inode)
5364 spin_unlock(&root->fs_info->ordered_extent_lock); 6553 spin_unlock(&root->fs_info->ordered_extent_lock);
5365 } 6554 }
5366 6555
5367 spin_lock(&root->list_lock); 6556 if (root == root->fs_info->tree_root) {
6557 struct btrfs_block_group_cache *block_group;
6558
6559 block_group = btrfs_lookup_block_group(root->fs_info,
6560 BTRFS_I(inode)->block_group);
6561 if (block_group && block_group->inode == inode) {
6562 spin_lock(&block_group->lock);
6563 block_group->inode = NULL;
6564 spin_unlock(&block_group->lock);
6565 btrfs_put_block_group(block_group);
6566 } else if (block_group) {
6567 btrfs_put_block_group(block_group);
6568 }
6569 }
6570
6571 spin_lock(&root->orphan_lock);
5368 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6572 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
5369 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6573 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
5370 inode->i_ino); 6574 inode->i_ino);
5371 list_del_init(&BTRFS_I(inode)->i_orphan); 6575 list_del_init(&BTRFS_I(inode)->i_orphan);
5372 } 6576 }
5373 spin_unlock(&root->list_lock); 6577 spin_unlock(&root->orphan_lock);
5374 6578
5375 while (1) { 6579 while (1) {
5376 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6580 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5389,17 +6593,18 @@ void btrfs_destroy_inode(struct inode *inode)
5389 inode_tree_del(inode); 6593 inode_tree_del(inode);
5390 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 6594 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
5391free: 6595free:
5392 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6596 call_rcu(&inode->i_rcu, btrfs_i_callback);
5393} 6597}
5394 6598
5395void btrfs_drop_inode(struct inode *inode) 6599int btrfs_drop_inode(struct inode *inode)
5396{ 6600{
5397 struct btrfs_root *root = BTRFS_I(inode)->root; 6601 struct btrfs_root *root = BTRFS_I(inode)->root;
5398 6602
5399 if (inode->i_nlink > 0 && btrfs_root_refs(&root->root_item) == 0) 6603 if (btrfs_root_refs(&root->root_item) == 0 &&
5400 generic_delete_inode(inode); 6604 root != root->fs_info->tree_root)
6605 return 1;
5401 else 6606 else
5402 generic_drop_inode(inode); 6607 return generic_drop_inode(inode);
5403} 6608}
5404 6609
5405static void init_once(void *foo) 6610static void init_once(void *foo)
@@ -5492,19 +6697,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5492 if (S_ISDIR(old_inode->i_mode) && new_inode && 6697 if (S_ISDIR(old_inode->i_mode) && new_inode &&
5493 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 6698 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5494 return -ENOTEMPTY; 6699 return -ENOTEMPTY;
5495
5496 /*
5497 * We want to reserve the absolute worst case amount of items. So if
5498 * both inodes are subvols and we need to unlink them then that would
5499 * require 4 item modifications, but if they are both normal inodes it
5500 * would require 5 item modifications, so we'll assume their normal
5501 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
5502 * should cover the worst case number of items we'll modify.
5503 */
5504 ret = btrfs_reserve_metadata_space(root, 11);
5505 if (ret)
5506 return ret;
5507
5508 /* 6700 /*
5509 * we're using rename to replace one file with another. 6701 * we're using rename to replace one file with another.
5510 * and the replacement file is large. Start IO on it now so 6702 * and the replacement file is large. Start IO on it now so
@@ -5517,8 +6709,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5517 /* close the racy window with snapshot create/destroy ioctl */ 6709 /* close the racy window with snapshot create/destroy ioctl */
5518 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6710 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5519 down_read(&root->fs_info->subvol_sem); 6711 down_read(&root->fs_info->subvol_sem);
6712 /*
6713 * We want to reserve the absolute worst case amount of items. So if
6714 * both inodes are subvols and we need to unlink them then that would
6715 * require 4 item modifications, but if they are both normal inodes it
6716 * would require 5 item modifications, so we'll assume their normal
6717 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
6718 * should cover the worst case number of items we'll modify.
6719 */
6720 trans = btrfs_start_transaction(root, 20);
6721 if (IS_ERR(trans))
6722 return PTR_ERR(trans);
5520 6723
5521 trans = btrfs_start_transaction(root, 1);
5522 btrfs_set_trans_block_group(trans, new_dir); 6724 btrfs_set_trans_block_group(trans, new_dir);
5523 6725
5524 if (dest != root) 6726 if (dest != root)
@@ -5607,8 +6809,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5607 BUG_ON(ret); 6809 BUG_ON(ret);
5608 6810
5609 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 6811 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
5610 btrfs_log_new_name(trans, old_inode, old_dir, 6812 struct dentry *parent = dget_parent(new_dentry);
5611 new_dentry->d_parent); 6813 btrfs_log_new_name(trans, old_inode, old_dir, parent);
6814 dput(parent);
5612 btrfs_end_log_trans(root); 6815 btrfs_end_log_trans(root);
5613 } 6816 }
5614out_fail: 6817out_fail:
@@ -5617,7 +6820,6 @@ out_fail:
5617 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6820 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5618 up_read(&root->fs_info->subvol_sem); 6821 up_read(&root->fs_info->subvol_sem);
5619 6822
5620 btrfs_unreserve_metadata_space(root, 11);
5621 return ret; 6823 return ret;
5622} 6824}
5623 6825
@@ -5669,6 +6871,58 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
5669 return 0; 6871 return 0;
5670} 6872}
5671 6873
6874int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput,
6875 int sync)
6876{
6877 struct btrfs_inode *binode;
6878 struct inode *inode = NULL;
6879
6880 spin_lock(&root->fs_info->delalloc_lock);
6881 while (!list_empty(&root->fs_info->delalloc_inodes)) {
6882 binode = list_entry(root->fs_info->delalloc_inodes.next,
6883 struct btrfs_inode, delalloc_inodes);
6884 inode = igrab(&binode->vfs_inode);
6885 if (inode) {
6886 list_move_tail(&binode->delalloc_inodes,
6887 &root->fs_info->delalloc_inodes);
6888 break;
6889 }
6890
6891 list_del_init(&binode->delalloc_inodes);
6892 cond_resched_lock(&root->fs_info->delalloc_lock);
6893 }
6894 spin_unlock(&root->fs_info->delalloc_lock);
6895
6896 if (inode) {
6897 if (sync) {
6898 filemap_write_and_wait(inode->i_mapping);
6899 /*
6900 * We have to do this because compression doesn't
6901 * actually set PG_writeback until it submits the pages
6902 * for IO, which happens in an async thread, so we could
6903 * race and not actually wait for any writeback pages
6904 * because they've not been submitted yet. Technically
6905 * this could still be the case for the ordered stuff
6906 * since the async thread may not have started to do its
6907 * work yet. If this becomes the case then we need to
6908 * figure out a way to make sure that in writepage we
6909 * wait for any async pages to be submitted before
6910 * returning so that fdatawait does what its supposed to
6911 * do.
6912 */
6913 btrfs_wait_ordered_range(inode, 0, (u64)-1);
6914 } else {
6915 filemap_flush(inode->i_mapping);
6916 }
6917 if (delay_iput)
6918 btrfs_add_delayed_iput(inode);
6919 else
6920 iput(inode);
6921 return 1;
6922 }
6923 return 0;
6924}
6925
5672static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 6926static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5673 const char *symname) 6927 const char *symname)
5674{ 6928{
@@ -5692,29 +6946,22 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5692 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 6946 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5693 return -ENAMETOOLONG; 6947 return -ENAMETOOLONG;
5694 6948
6949 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
6950 if (err)
6951 return err;
5695 /* 6952 /*
5696 * 2 items for inode item and ref 6953 * 2 items for inode item and ref
5697 * 2 items for dir items 6954 * 2 items for dir items
5698 * 1 item for xattr if selinux is on 6955 * 1 item for xattr if selinux is on
5699 */ 6956 */
5700 err = btrfs_reserve_metadata_space(root, 5); 6957 trans = btrfs_start_transaction(root, 5);
5701 if (err) 6958 if (IS_ERR(trans))
5702 return err; 6959 return PTR_ERR(trans);
5703 6960
5704 trans = btrfs_start_transaction(root, 1);
5705 if (!trans)
5706 goto out_fail;
5707 btrfs_set_trans_block_group(trans, dir); 6961 btrfs_set_trans_block_group(trans, dir);
5708 6962
5709 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
5710 if (err) {
5711 err = -ENOSPC;
5712 goto out_unlock;
5713 }
5714
5715 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6963 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5716 dentry->d_name.len, 6964 dentry->d_name.len, dir->i_ino, objectid,
5717 dentry->d_parent->d_inode->i_ino, objectid,
5718 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, 6965 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO,
5719 &index); 6966 &index);
5720 err = PTR_ERR(inode); 6967 err = PTR_ERR(inode);
@@ -5728,7 +6975,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5728 } 6975 }
5729 6976
5730 btrfs_set_trans_block_group(trans, inode); 6977 btrfs_set_trans_block_group(trans, inode);
5731 err = btrfs_add_nondir(trans, dentry, inode, 0, index); 6978 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5732 if (err) 6979 if (err)
5733 drop_inode = 1; 6980 drop_inode = 1;
5734 else { 6981 else {
@@ -5783,8 +7030,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5783out_unlock: 7030out_unlock:
5784 nr = trans->blocks_used; 7031 nr = trans->blocks_used;
5785 btrfs_end_transaction_throttle(trans, root); 7032 btrfs_end_transaction_throttle(trans, root);
5786out_fail:
5787 btrfs_unreserve_metadata_space(root, 5);
5788 if (drop_inode) { 7033 if (drop_inode) {
5789 inode_dec_link_count(inode); 7034 inode_dec_link_count(inode);
5790 iput(inode); 7035 iput(inode);
@@ -5793,36 +7038,35 @@ out_fail:
5793 return err; 7038 return err;
5794} 7039}
5795 7040
5796static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 7041static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
5797 u64 alloc_hint, int mode, loff_t actual_len) 7042 u64 start, u64 num_bytes, u64 min_size,
7043 loff_t actual_len, u64 *alloc_hint,
7044 struct btrfs_trans_handle *trans)
5798{ 7045{
5799 struct btrfs_trans_handle *trans;
5800 struct btrfs_root *root = BTRFS_I(inode)->root; 7046 struct btrfs_root *root = BTRFS_I(inode)->root;
5801 struct btrfs_key ins; 7047 struct btrfs_key ins;
5802 u64 alloc_size;
5803 u64 cur_offset = start; 7048 u64 cur_offset = start;
5804 u64 num_bytes = end - start;
5805 int ret = 0;
5806 u64 i_size; 7049 u64 i_size;
7050 int ret = 0;
7051 bool own_trans = true;
5807 7052
7053 if (trans)
7054 own_trans = false;
5808 while (num_bytes > 0) { 7055 while (num_bytes > 0) {
5809 alloc_size = min(num_bytes, root->fs_info->max_extent); 7056 if (own_trans) {
5810 7057 trans = btrfs_start_transaction(root, 3);
5811 trans = btrfs_start_transaction(root, 1); 7058 if (IS_ERR(trans)) {
5812 7059 ret = PTR_ERR(trans);
5813 ret = btrfs_reserve_extent(trans, root, alloc_size, 7060 break;
5814 root->sectorsize, 0, alloc_hint, 7061 }
5815 (u64)-1, &ins, 1);
5816 if (ret) {
5817 WARN_ON(1);
5818 goto stop_trans;
5819 } 7062 }
5820 7063
5821 ret = btrfs_reserve_metadata_space(root, 3); 7064 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
7065 0, *alloc_hint, (u64)-1, &ins, 1);
5822 if (ret) { 7066 if (ret) {
5823 btrfs_free_reserved_extent(root, ins.objectid, 7067 if (own_trans)
5824 ins.offset); 7068 btrfs_end_transaction(trans, root);
5825 goto stop_trans; 7069 break;
5826 } 7070 }
5827 7071
5828 ret = insert_reserved_file_extent(trans, inode, 7072 ret = insert_reserved_file_extent(trans, inode,
@@ -5836,14 +7080,15 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5836 7080
5837 num_bytes -= ins.offset; 7081 num_bytes -= ins.offset;
5838 cur_offset += ins.offset; 7082 cur_offset += ins.offset;
5839 alloc_hint = ins.objectid + ins.offset; 7083 *alloc_hint = ins.objectid + ins.offset;
5840 7084
5841 inode->i_ctime = CURRENT_TIME; 7085 inode->i_ctime = CURRENT_TIME;
5842 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 7086 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5843 if (!(mode & FALLOC_FL_KEEP_SIZE) && 7087 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5844 cur_offset > inode->i_size) { 7088 (actual_len > inode->i_size) &&
7089 (cur_offset > inode->i_size)) {
5845 if (cur_offset > actual_len) 7090 if (cur_offset > actual_len)
5846 i_size = actual_len; 7091 i_size = actual_len;
5847 else 7092 else
5848 i_size = cur_offset; 7093 i_size = cur_offset;
5849 i_size_write(inode, i_size); 7094 i_size_write(inode, i_size);
@@ -5853,117 +7098,28 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5853 ret = btrfs_update_inode(trans, root, inode); 7098 ret = btrfs_update_inode(trans, root, inode);
5854 BUG_ON(ret); 7099 BUG_ON(ret);
5855 7100
5856 btrfs_end_transaction(trans, root); 7101 if (own_trans)
5857 btrfs_unreserve_metadata_space(root, 3); 7102 btrfs_end_transaction(trans, root);
5858 } 7103 }
5859 return ret; 7104 return ret;
5860
5861stop_trans:
5862 btrfs_end_transaction(trans, root);
5863 return ret;
5864
5865} 7105}
5866 7106
5867static long btrfs_fallocate(struct inode *inode, int mode, 7107int btrfs_prealloc_file_range(struct inode *inode, int mode,
5868 loff_t offset, loff_t len) 7108 u64 start, u64 num_bytes, u64 min_size,
7109 loff_t actual_len, u64 *alloc_hint)
5869{ 7110{
5870 u64 cur_offset; 7111 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
5871 u64 last_byte; 7112 min_size, actual_len, alloc_hint,
5872 u64 alloc_start; 7113 NULL);
5873 u64 alloc_end; 7114}
5874 u64 alloc_hint = 0;
5875 u64 locked_end;
5876 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
5877 struct extent_map *em;
5878 int ret;
5879
5880 alloc_start = offset & ~mask;
5881 alloc_end = (offset + len + mask) & ~mask;
5882
5883 /*
5884 * wait for ordered IO before we have any locks. We'll loop again
5885 * below with the locks held.
5886 */
5887 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
5888
5889 mutex_lock(&inode->i_mutex);
5890 if (alloc_start > inode->i_size) {
5891 ret = btrfs_cont_expand(inode, alloc_start);
5892 if (ret)
5893 goto out;
5894 }
5895
5896 ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
5897 alloc_end - alloc_start);
5898 if (ret)
5899 goto out;
5900
5901 locked_end = alloc_end - 1;
5902 while (1) {
5903 struct btrfs_ordered_extent *ordered;
5904
5905 /* the extent lock is ordered inside the running
5906 * transaction
5907 */
5908 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5909 GFP_NOFS);
5910 ordered = btrfs_lookup_first_ordered_extent(inode,
5911 alloc_end - 1);
5912 if (ordered &&
5913 ordered->file_offset + ordered->len > alloc_start &&
5914 ordered->file_offset < alloc_end) {
5915 btrfs_put_ordered_extent(ordered);
5916 unlock_extent(&BTRFS_I(inode)->io_tree,
5917 alloc_start, locked_end, GFP_NOFS);
5918 /*
5919 * we can't wait on the range with the transaction
5920 * running or with the extent lock held
5921 */
5922 btrfs_wait_ordered_range(inode, alloc_start,
5923 alloc_end - alloc_start);
5924 } else {
5925 if (ordered)
5926 btrfs_put_ordered_extent(ordered);
5927 break;
5928 }
5929 }
5930
5931 cur_offset = alloc_start;
5932 while (1) {
5933 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
5934 alloc_end - cur_offset, 0);
5935 BUG_ON(IS_ERR(em) || !em);
5936 last_byte = min(extent_map_end(em), alloc_end);
5937 last_byte = (last_byte + mask) & ~mask;
5938 if (em->block_start == EXTENT_MAP_HOLE ||
5939 (cur_offset >= inode->i_size &&
5940 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5941 ret = prealloc_file_range(inode,
5942 cur_offset, last_byte,
5943 alloc_hint, mode, offset+len);
5944 if (ret < 0) {
5945 free_extent_map(em);
5946 break;
5947 }
5948 }
5949 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
5950 alloc_hint = em->block_start;
5951 free_extent_map(em);
5952
5953 cur_offset = last_byte;
5954 if (cur_offset >= alloc_end) {
5955 ret = 0;
5956 break;
5957 }
5958 }
5959 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5960 GFP_NOFS);
5961 7115
5962 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 7116int btrfs_prealloc_file_range_trans(struct inode *inode,
5963 alloc_end - alloc_start); 7117 struct btrfs_trans_handle *trans, int mode,
5964out: 7118 u64 start, u64 num_bytes, u64 min_size,
5965 mutex_unlock(&inode->i_mutex); 7119 loff_t actual_len, u64 *alloc_hint)
5966 return ret; 7120{
7121 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
7122 min_size, actual_len, alloc_hint, trans);
5967} 7123}
5968 7124
5969static int btrfs_set_page_dirty(struct page *page) 7125static int btrfs_set_page_dirty(struct page *page)
@@ -5971,11 +7127,15 @@ static int btrfs_set_page_dirty(struct page *page)
5971 return __set_page_dirty_nobuffers(page); 7127 return __set_page_dirty_nobuffers(page);
5972} 7128}
5973 7129
5974static int btrfs_permission(struct inode *inode, int mask) 7130static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
5975{ 7131{
7132 struct btrfs_root *root = BTRFS_I(inode)->root;
7133
7134 if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
7135 return -EROFS;
5976 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7136 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
5977 return -EACCES; 7137 return -EACCES;
5978 return generic_permission(inode, mask, btrfs_check_acl); 7138 return generic_permission(inode, mask, flags, btrfs_check_acl);
5979} 7139}
5980 7140
5981static const struct inode_operations btrfs_dir_inode_operations = { 7141static const struct inode_operations btrfs_dir_inode_operations = {
@@ -6068,7 +7228,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
6068 .listxattr = btrfs_listxattr, 7228 .listxattr = btrfs_listxattr,
6069 .removexattr = btrfs_removexattr, 7229 .removexattr = btrfs_removexattr,
6070 .permission = btrfs_permission, 7230 .permission = btrfs_permission,
6071 .fallocate = btrfs_fallocate,
6072 .fiemap = btrfs_fiemap, 7231 .fiemap = btrfs_fiemap,
6073}; 7232};
6074static const struct inode_operations btrfs_special_inode_operations = { 7233static const struct inode_operations btrfs_special_inode_operations = {
@@ -6084,6 +7243,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
6084 .readlink = generic_readlink, 7243 .readlink = generic_readlink,
6085 .follow_link = page_follow_link_light, 7244 .follow_link = page_follow_link_light,
6086 .put_link = page_put_link, 7245 .put_link = page_put_link,
7246 .getattr = btrfs_getattr,
6087 .permission = btrfs_permission, 7247 .permission = btrfs_permission,
6088 .setxattr = btrfs_setxattr, 7248 .setxattr = btrfs_setxattr,
6089 .getxattr = btrfs_getxattr, 7249 .getxattr = btrfs_getxattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 645a17927a8f..a506a22b522a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -39,6 +39,7 @@
39#include <linux/security.h> 39#include <linux/security.h>
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h>
42#include "compat.h" 43#include "compat.h"
43#include "ctree.h" 44#include "ctree.h"
44#include "disk-io.h" 45#include "disk-io.h"
@@ -146,6 +147,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
146 unsigned int flags, oldflags; 147 unsigned int flags, oldflags;
147 int ret; 148 int ret;
148 149
150 if (btrfs_root_readonly(root))
151 return -EROFS;
152
149 if (copy_from_user(&flags, arg, sizeof(flags))) 153 if (copy_from_user(&flags, arg, sizeof(flags)))
150 return -EFAULT; 154 return -EFAULT;
151 155
@@ -223,7 +227,8 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
223 227
224static noinline int create_subvol(struct btrfs_root *root, 228static noinline int create_subvol(struct btrfs_root *root,
225 struct dentry *dentry, 229 struct dentry *dentry,
226 char *name, int namelen) 230 char *name, int namelen,
231 u64 *async_transid)
227{ 232{
228 struct btrfs_trans_handle *trans; 233 struct btrfs_trans_handle *trans;
229 struct btrfs_key key; 234 struct btrfs_key key;
@@ -231,30 +236,34 @@ static noinline int create_subvol(struct btrfs_root *root,
231 struct btrfs_inode_item *inode_item; 236 struct btrfs_inode_item *inode_item;
232 struct extent_buffer *leaf; 237 struct extent_buffer *leaf;
233 struct btrfs_root *new_root; 238 struct btrfs_root *new_root;
234 struct inode *dir = dentry->d_parent->d_inode; 239 struct dentry *parent = dget_parent(dentry);
240 struct inode *dir;
235 int ret; 241 int ret;
236 int err; 242 int err;
237 u64 objectid; 243 u64 objectid;
238 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 244 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
239 u64 index = 0; 245 u64 index = 0;
240 246
247 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
248 0, &objectid);
249 if (ret) {
250 dput(parent);
251 return ret;
252 }
253
254 dir = parent->d_inode;
255
241 /* 256 /*
242 * 1 - inode item 257 * 1 - inode item
243 * 2 - refs 258 * 2 - refs
244 * 1 - root item 259 * 1 - root item
245 * 2 - dir items 260 * 2 - dir items
246 */ 261 */
247 ret = btrfs_reserve_metadata_space(root, 6); 262 trans = btrfs_start_transaction(root, 6);
248 if (ret) 263 if (IS_ERR(trans)) {
249 return ret; 264 dput(parent);
250 265 return PTR_ERR(trans);
251 trans = btrfs_start_transaction(root, 1); 266 }
252 BUG_ON(!trans);
253
254 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
255 0, &objectid);
256 if (ret)
257 goto fail;
258 267
259 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 268 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
260 0, objectid, NULL, 0, 0, 0); 269 0, objectid, NULL, 0, 0, 0);
@@ -341,18 +350,24 @@ static noinline int create_subvol(struct btrfs_root *root,
341 350
342 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 351 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
343fail: 352fail:
344 err = btrfs_commit_transaction(trans, root); 353 dput(parent);
354 if (async_transid) {
355 *async_transid = trans->transid;
356 err = btrfs_commit_transaction_async(trans, root, 1);
357 } else {
358 err = btrfs_commit_transaction(trans, root);
359 }
345 if (err && !ret) 360 if (err && !ret)
346 ret = err; 361 ret = err;
347
348 btrfs_unreserve_metadata_space(root, 6);
349 return ret; 362 return ret;
350} 363}
351 364
352static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 365static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
353 char *name, int namelen) 366 char *name, int namelen, u64 *async_transid,
367 bool readonly)
354{ 368{
355 struct inode *inode; 369 struct inode *inode;
370 struct dentry *parent;
356 struct btrfs_pending_snapshot *pending_snapshot; 371 struct btrfs_pending_snapshot *pending_snapshot;
357 struct btrfs_trans_handle *trans; 372 struct btrfs_trans_handle *trans;
358 int ret; 373 int ret;
@@ -360,42 +375,45 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
360 if (!root->ref_cows) 375 if (!root->ref_cows)
361 return -EINVAL; 376 return -EINVAL;
362 377
363 /*
364 * 1 - inode item
365 * 2 - refs
366 * 1 - root item
367 * 2 - dir items
368 */
369 ret = btrfs_reserve_metadata_space(root, 6);
370 if (ret)
371 goto fail;
372
373 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 378 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
374 if (!pending_snapshot) { 379 if (!pending_snapshot)
375 ret = -ENOMEM; 380 return -ENOMEM;
376 btrfs_unreserve_metadata_space(root, 6); 381
377 goto fail; 382 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
378 }
379 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
380 if (!pending_snapshot->name) {
381 ret = -ENOMEM;
382 kfree(pending_snapshot);
383 btrfs_unreserve_metadata_space(root, 6);
384 goto fail;
385 }
386 memcpy(pending_snapshot->name, name, namelen);
387 pending_snapshot->name[namelen] = '\0';
388 pending_snapshot->dentry = dentry; 383 pending_snapshot->dentry = dentry;
389 trans = btrfs_start_transaction(root, 1);
390 BUG_ON(!trans);
391 pending_snapshot->root = root; 384 pending_snapshot->root = root;
385 pending_snapshot->readonly = readonly;
386
387 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
388 if (IS_ERR(trans)) {
389 ret = PTR_ERR(trans);
390 goto fail;
391 }
392
393 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
394 BUG_ON(ret);
395
392 list_add(&pending_snapshot->list, 396 list_add(&pending_snapshot->list,
393 &trans->transaction->pending_snapshots); 397 &trans->transaction->pending_snapshots);
394 ret = btrfs_commit_transaction(trans, root); 398 if (async_transid) {
399 *async_transid = trans->transid;
400 ret = btrfs_commit_transaction_async(trans,
401 root->fs_info->extent_root, 1);
402 } else {
403 ret = btrfs_commit_transaction(trans,
404 root->fs_info->extent_root);
405 }
395 BUG_ON(ret); 406 BUG_ON(ret);
396 btrfs_unreserve_metadata_space(root, 6);
397 407
398 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 408 ret = pending_snapshot->error;
409 if (ret)
410 goto fail;
411
412 btrfs_orphan_cleanup(pending_snapshot->snap);
413
414 parent = dget_parent(dentry);
415 inode = btrfs_lookup_dentry(parent->d_inode, dentry);
416 dput(parent);
399 if (IS_ERR(inode)) { 417 if (IS_ERR(inode)) {
400 ret = PTR_ERR(inode); 418 ret = PTR_ERR(inode);
401 goto fail; 419 goto fail;
@@ -404,9 +422,80 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
404 d_instantiate(dentry, inode); 422 d_instantiate(dentry, inode);
405 ret = 0; 423 ret = 0;
406fail: 424fail:
425 kfree(pending_snapshot);
407 return ret; 426 return ret;
408} 427}
409 428
429/* copy of check_sticky in fs/namei.c()
430* It's inline, so penalty for filesystems that don't use sticky bit is
431* minimal.
432*/
433static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
434{
435 uid_t fsuid = current_fsuid();
436
437 if (!(dir->i_mode & S_ISVTX))
438 return 0;
439 if (inode->i_uid == fsuid)
440 return 0;
441 if (dir->i_uid == fsuid)
442 return 0;
443 return !capable(CAP_FOWNER);
444}
445
446/* copy of may_delete in fs/namei.c()
447 * Check whether we can remove a link victim from directory dir, check
448 * whether the type of victim is right.
449 * 1. We can't do it if dir is read-only (done in permission())
450 * 2. We should have write and exec permissions on dir
451 * 3. We can't remove anything from append-only dir
452 * 4. We can't do anything with immutable dir (done in permission())
453 * 5. If the sticky bit on dir is set we should either
454 * a. be owner of dir, or
455 * b. be owner of victim, or
456 * c. have CAP_FOWNER capability
457 * 6. If the victim is append-only or immutable we can't do antyhing with
458 * links pointing to it.
459 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
460 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
461 * 9. We can't remove a root or mountpoint.
462 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
463 * nfs_async_unlink().
464 */
465
466static int btrfs_may_delete(struct inode *dir,struct dentry *victim,int isdir)
467{
468 int error;
469
470 if (!victim->d_inode)
471 return -ENOENT;
472
473 BUG_ON(victim->d_parent->d_inode != dir);
474 audit_inode_child(victim, dir);
475
476 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
477 if (error)
478 return error;
479 if (IS_APPEND(dir))
480 return -EPERM;
481 if (btrfs_check_sticky(dir, victim->d_inode)||
482 IS_APPEND(victim->d_inode)||
483 IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
484 return -EPERM;
485 if (isdir) {
486 if (!S_ISDIR(victim->d_inode->i_mode))
487 return -ENOTDIR;
488 if (IS_ROOT(victim))
489 return -EBUSY;
490 } else if (S_ISDIR(victim->d_inode->i_mode))
491 return -EISDIR;
492 if (IS_DEADDIR(dir))
493 return -ENOENT;
494 if (victim->d_flags & DCACHE_NFSFS_RENAMED)
495 return -EBUSY;
496 return 0;
497}
498
410/* copy of may_create in fs/namei.c() */ 499/* copy of may_create in fs/namei.c() */
411static inline int btrfs_may_create(struct inode *dir, struct dentry *child) 500static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
412{ 501{
@@ -424,7 +513,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
424 */ 513 */
425static noinline int btrfs_mksubvol(struct path *parent, 514static noinline int btrfs_mksubvol(struct path *parent,
426 char *name, int namelen, 515 char *name, int namelen,
427 struct btrfs_root *snap_src) 516 struct btrfs_root *snap_src,
517 u64 *async_transid, bool readonly)
428{ 518{
429 struct inode *dir = parent->dentry->d_inode; 519 struct inode *dir = parent->dentry->d_inode;
430 struct dentry *dentry; 520 struct dentry *dentry;
@@ -456,10 +546,10 @@ static noinline int btrfs_mksubvol(struct path *parent,
456 546
457 if (snap_src) { 547 if (snap_src) {
458 error = create_snapshot(snap_src, dentry, 548 error = create_snapshot(snap_src, dentry,
459 name, namelen); 549 name, namelen, async_transid, readonly);
460 } else { 550 } else {
461 error = create_subvol(BTRFS_I(dir)->root, dentry, 551 error = create_subvol(BTRFS_I(dir)->root, dentry,
462 name, namelen); 552 name, namelen, async_transid);
463 } 553 }
464 if (!error) 554 if (!error)
465 fsnotify_mkdir(dir, dentry); 555 fsnotify_mkdir(dir, dentry);
@@ -474,49 +564,182 @@ out_unlock:
474 return error; 564 return error;
475} 565}
476 566
477static int btrfs_defrag_file(struct file *file) 567static int should_defrag_range(struct inode *inode, u64 start, u64 len,
568 int thresh, u64 *last_len, u64 *skip,
569 u64 *defrag_end)
570{
571 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
572 struct extent_map *em = NULL;
573 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
574 int ret = 1;
575
576
577 if (thresh == 0)
578 thresh = 256 * 1024;
579
580 /*
581 * make sure that once we start defragging and extent, we keep on
582 * defragging it
583 */
584 if (start < *defrag_end)
585 return 1;
586
587 *skip = 0;
588
589 /*
590 * hopefully we have this extent in the tree already, try without
591 * the full extent lock
592 */
593 read_lock(&em_tree->lock);
594 em = lookup_extent_mapping(em_tree, start, len);
595 read_unlock(&em_tree->lock);
596
597 if (!em) {
598 /* get the big lock and read metadata off disk */
599 lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
600 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
601 unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
602
603 if (IS_ERR(em))
604 return 0;
605 }
606
607 /* this will cover holes, and inline extents */
608 if (em->block_start >= EXTENT_MAP_LAST_BYTE)
609 ret = 0;
610
611 /*
612 * we hit a real extent, if it is big don't bother defragging it again
613 */
614 if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
615 ret = 0;
616
617 /*
618 * last_len ends up being a counter of how many bytes we've defragged.
619 * every time we choose not to defrag an extent, we reset *last_len
620 * so that the next tiny extent will force a defrag.
621 *
622 * The end result of this is that tiny extents before a single big
623 * extent will force at least part of that big extent to be defragged.
624 */
625 if (ret) {
626 *last_len += len;
627 *defrag_end = extent_map_end(em);
628 } else {
629 *last_len = 0;
630 *skip = extent_map_end(em);
631 *defrag_end = 0;
632 }
633
634 free_extent_map(em);
635 return ret;
636}
637
638static int btrfs_defrag_file(struct file *file,
639 struct btrfs_ioctl_defrag_range_args *range)
478{ 640{
479 struct inode *inode = fdentry(file)->d_inode; 641 struct inode *inode = fdentry(file)->d_inode;
480 struct btrfs_root *root = BTRFS_I(inode)->root; 642 struct btrfs_root *root = BTRFS_I(inode)->root;
481 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 643 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
482 struct btrfs_ordered_extent *ordered; 644 struct btrfs_ordered_extent *ordered;
483 struct page *page; 645 struct page *page;
646 struct btrfs_super_block *disk_super;
484 unsigned long last_index; 647 unsigned long last_index;
485 unsigned long ra_pages = root->fs_info->bdi.ra_pages; 648 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
486 unsigned long total_read = 0; 649 unsigned long total_read = 0;
650 u64 features;
487 u64 page_start; 651 u64 page_start;
488 u64 page_end; 652 u64 page_end;
653 u64 last_len = 0;
654 u64 skip = 0;
655 u64 defrag_end = 0;
489 unsigned long i; 656 unsigned long i;
490 int ret; 657 int ret;
658 int compress_type = BTRFS_COMPRESS_ZLIB;
491 659
492 ret = btrfs_check_data_free_space(root, inode, inode->i_size); 660 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
493 if (ret) 661 if (range->compress_type > BTRFS_COMPRESS_TYPES)
494 return -ENOSPC; 662 return -EINVAL;
663 if (range->compress_type)
664 compress_type = range->compress_type;
665 }
666
667 if (inode->i_size == 0)
668 return 0;
669
670 if (range->start + range->len > range->start) {
671 last_index = min_t(u64, inode->i_size - 1,
672 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
673 } else {
674 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
675 }
676
677 i = range->start >> PAGE_CACHE_SHIFT;
678 while (i <= last_index) {
679 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
680 PAGE_CACHE_SIZE,
681 range->extent_thresh,
682 &last_len, &skip,
683 &defrag_end)) {
684 unsigned long next;
685 /*
686 * the should_defrag function tells us how much to skip
687 * bump our counter by the suggested amount
688 */
689 next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
690 i = max(i + 1, next);
691 continue;
692 }
495 693
496 mutex_lock(&inode->i_mutex);
497 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
498 for (i = 0; i <= last_index; i++) {
499 if (total_read % ra_pages == 0) { 694 if (total_read % ra_pages == 0) {
500 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, 695 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
501 min(last_index, i + ra_pages - 1)); 696 min(last_index, i + ra_pages - 1));
502 } 697 }
503 total_read++; 698 total_read++;
699 mutex_lock(&inode->i_mutex);
700 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
701 BTRFS_I(inode)->force_compress = compress_type;
702
703 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
704 if (ret)
705 goto err_unlock;
504again: 706again:
707 if (inode->i_size == 0 ||
708 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
709 ret = 0;
710 goto err_reservations;
711 }
712
505 page = grab_cache_page(inode->i_mapping, i); 713 page = grab_cache_page(inode->i_mapping, i);
506 if (!page) 714 if (!page) {
507 goto out_unlock; 715 ret = -ENOMEM;
716 goto err_reservations;
717 }
718
508 if (!PageUptodate(page)) { 719 if (!PageUptodate(page)) {
509 btrfs_readpage(NULL, page); 720 btrfs_readpage(NULL, page);
510 lock_page(page); 721 lock_page(page);
511 if (!PageUptodate(page)) { 722 if (!PageUptodate(page)) {
512 unlock_page(page); 723 unlock_page(page);
513 page_cache_release(page); 724 page_cache_release(page);
514 goto out_unlock; 725 ret = -EIO;
726 goto err_reservations;
515 } 727 }
516 } 728 }
517 729
730 if (page->mapping != inode->i_mapping) {
731 unlock_page(page);
732 page_cache_release(page);
733 goto again;
734 }
735
518 wait_on_page_writeback(page); 736 wait_on_page_writeback(page);
519 737
738 if (PageDirty(page)) {
739 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
740 goto loop_unlock;
741 }
742
520 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 743 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
521 page_end = page_start + PAGE_CACHE_SIZE - 1; 744 page_end = page_start + PAGE_CACHE_SIZE - 1;
522 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 745 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -537,18 +760,60 @@ again:
537 * page if it is dirtied again later 760 * page if it is dirtied again later
538 */ 761 */
539 clear_page_dirty_for_io(page); 762 clear_page_dirty_for_io(page);
763 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
764 page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
765 EXTENT_DO_ACCOUNTING, GFP_NOFS);
540 766
541 btrfs_set_extent_delalloc(inode, page_start, page_end); 767 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
768 ClearPageChecked(page);
542 set_page_dirty(page); 769 set_page_dirty(page);
543 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 770 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
771
772loop_unlock:
544 unlock_page(page); 773 unlock_page(page);
545 page_cache_release(page); 774 page_cache_release(page);
775 mutex_unlock(&inode->i_mutex);
776
546 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 777 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
778 i++;
779 }
780
781 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
782 filemap_flush(inode->i_mapping);
783
784 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
785 /* the filemap_flush will queue IO into the worker threads, but
786 * we have to make sure the IO is actually started and that
787 * ordered extents get created before we return
788 */
789 atomic_inc(&root->fs_info->async_submit_draining);
790 while (atomic_read(&root->fs_info->nr_async_submits) ||
791 atomic_read(&root->fs_info->async_delalloc_pages)) {
792 wait_event(root->fs_info->async_submit_wait,
793 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
794 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
795 }
796 atomic_dec(&root->fs_info->async_submit_draining);
797
798 mutex_lock(&inode->i_mutex);
799 BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
800 mutex_unlock(&inode->i_mutex);
801 }
802
803 disk_super = &root->fs_info->super_copy;
804 features = btrfs_super_incompat_flags(disk_super);
805 if (range->compress_type == BTRFS_COMPRESS_LZO) {
806 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
807 btrfs_set_super_incompat_flags(disk_super, features);
547 } 808 }
548 809
549out_unlock:
550 mutex_unlock(&inode->i_mutex);
551 return 0; 810 return 0;
811
812err_reservations:
813 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
814err_unlock:
815 mutex_unlock(&inode->i_mutex);
816 return ret;
552} 817}
553 818
554static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 819static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
@@ -563,7 +828,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
563 char *sizestr; 828 char *sizestr;
564 char *devstr = NULL; 829 char *devstr = NULL;
565 int ret = 0; 830 int ret = 0;
566 int namelen;
567 int mod = 0; 831 int mod = 0;
568 832
569 if (root->fs_info->sb->s_flags & MS_RDONLY) 833 if (root->fs_info->sb->s_flags & MS_RDONLY)
@@ -577,7 +841,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
577 return PTR_ERR(vol_args); 841 return PTR_ERR(vol_args);
578 842
579 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 843 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
580 namelen = strlen(vol_args->name);
581 844
582 mutex_lock(&root->fs_info->volume_mutex); 845 mutex_lock(&root->fs_info->volume_mutex);
583 sizestr = vol_args->name; 846 sizestr = vol_args->name;
@@ -608,7 +871,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
608 mod = 1; 871 mod = 1;
609 sizestr++; 872 sizestr++;
610 } 873 }
611 new_size = btrfs_parse_size(sizestr); 874 new_size = memparse(sizestr, NULL);
612 if (new_size == 0) { 875 if (new_size == 0) {
613 ret = -EINVAL; 876 ret = -EINVAL;
614 goto out_unlock; 877 goto out_unlock;
@@ -643,7 +906,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
643 device->name, (unsigned long long)new_size); 906 device->name, (unsigned long long)new_size);
644 907
645 if (new_size > old_size) { 908 if (new_size > old_size) {
646 trans = btrfs_start_transaction(root, 1); 909 trans = btrfs_start_transaction(root, 0);
647 ret = btrfs_grow_device(trans, device, new_size); 910 ret = btrfs_grow_device(trans, device, new_size);
648 btrfs_commit_transaction(trans, root); 911 btrfs_commit_transaction(trans, root);
649 } else { 912 } else {
@@ -656,11 +919,14 @@ out_unlock:
656 return ret; 919 return ret;
657} 920}
658 921
659static noinline int btrfs_ioctl_snap_create(struct file *file, 922static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
660 void __user *arg, int subvol) 923 char *name,
924 unsigned long fd,
925 int subvol,
926 u64 *transid,
927 bool readonly)
661{ 928{
662 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 929 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
663 struct btrfs_ioctl_vol_args *vol_args;
664 struct file *src_file; 930 struct file *src_file;
665 int namelen; 931 int namelen;
666 int ret = 0; 932 int ret = 0;
@@ -668,23 +934,18 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
668 if (root->fs_info->sb->s_flags & MS_RDONLY) 934 if (root->fs_info->sb->s_flags & MS_RDONLY)
669 return -EROFS; 935 return -EROFS;
670 936
671 vol_args = memdup_user(arg, sizeof(*vol_args)); 937 namelen = strlen(name);
672 if (IS_ERR(vol_args)) 938 if (strchr(name, '/')) {
673 return PTR_ERR(vol_args);
674
675 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
676 namelen = strlen(vol_args->name);
677 if (strchr(vol_args->name, '/')) {
678 ret = -EINVAL; 939 ret = -EINVAL;
679 goto out; 940 goto out;
680 } 941 }
681 942
682 if (subvol) { 943 if (subvol) {
683 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, 944 ret = btrfs_mksubvol(&file->f_path, name, namelen,
684 NULL); 945 NULL, transid, readonly);
685 } else { 946 } else {
686 struct inode *src_inode; 947 struct inode *src_inode;
687 src_file = fget(vol_args->fd); 948 src_file = fget(fd);
688 if (!src_file) { 949 if (!src_file) {
689 ret = -EINVAL; 950 ret = -EINVAL;
690 goto out; 951 goto out;
@@ -698,15 +959,152 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
698 fput(src_file); 959 fput(src_file);
699 goto out; 960 goto out;
700 } 961 }
701 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, 962 ret = btrfs_mksubvol(&file->f_path, name, namelen,
702 BTRFS_I(src_inode)->root); 963 BTRFS_I(src_inode)->root,
964 transid, readonly);
703 fput(src_file); 965 fput(src_file);
704 } 966 }
705out: 967out:
968 return ret;
969}
970
971static noinline int btrfs_ioctl_snap_create(struct file *file,
972 void __user *arg, int subvol)
973{
974 struct btrfs_ioctl_vol_args *vol_args;
975 int ret;
976
977 vol_args = memdup_user(arg, sizeof(*vol_args));
978 if (IS_ERR(vol_args))
979 return PTR_ERR(vol_args);
980 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
981
982 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
983 vol_args->fd, subvol,
984 NULL, false);
985
706 kfree(vol_args); 986 kfree(vol_args);
707 return ret; 987 return ret;
708} 988}
709 989
990static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
991 void __user *arg, int subvol)
992{
993 struct btrfs_ioctl_vol_args_v2 *vol_args;
994 int ret;
995 u64 transid = 0;
996 u64 *ptr = NULL;
997 bool readonly = false;
998
999 vol_args = memdup_user(arg, sizeof(*vol_args));
1000 if (IS_ERR(vol_args))
1001 return PTR_ERR(vol_args);
1002 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
1003
1004 if (vol_args->flags &
1005 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
1006 ret = -EOPNOTSUPP;
1007 goto out;
1008 }
1009
1010 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
1011 ptr = &transid;
1012 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1013 readonly = true;
1014
1015 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1016 vol_args->fd, subvol,
1017 ptr, readonly);
1018
1019 if (ret == 0 && ptr &&
1020 copy_to_user(arg +
1021 offsetof(struct btrfs_ioctl_vol_args_v2,
1022 transid), ptr, sizeof(*ptr)))
1023 ret = -EFAULT;
1024out:
1025 kfree(vol_args);
1026 return ret;
1027}
1028
1029static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1030 void __user *arg)
1031{
1032 struct inode *inode = fdentry(file)->d_inode;
1033 struct btrfs_root *root = BTRFS_I(inode)->root;
1034 int ret = 0;
1035 u64 flags = 0;
1036
1037 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1038 return -EINVAL;
1039
1040 down_read(&root->fs_info->subvol_sem);
1041 if (btrfs_root_readonly(root))
1042 flags |= BTRFS_SUBVOL_RDONLY;
1043 up_read(&root->fs_info->subvol_sem);
1044
1045 if (copy_to_user(arg, &flags, sizeof(flags)))
1046 ret = -EFAULT;
1047
1048 return ret;
1049}
1050
1051static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1052 void __user *arg)
1053{
1054 struct inode *inode = fdentry(file)->d_inode;
1055 struct btrfs_root *root = BTRFS_I(inode)->root;
1056 struct btrfs_trans_handle *trans;
1057 u64 root_flags;
1058 u64 flags;
1059 int ret = 0;
1060
1061 if (root->fs_info->sb->s_flags & MS_RDONLY)
1062 return -EROFS;
1063
1064 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID)
1065 return -EINVAL;
1066
1067 if (copy_from_user(&flags, arg, sizeof(flags)))
1068 return -EFAULT;
1069
1070 if (flags & ~BTRFS_SUBVOL_CREATE_ASYNC)
1071 return -EINVAL;
1072
1073 if (flags & ~BTRFS_SUBVOL_RDONLY)
1074 return -EOPNOTSUPP;
1075
1076 down_write(&root->fs_info->subvol_sem);
1077
1078 /* nothing to do */
1079 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1080 goto out;
1081
1082 root_flags = btrfs_root_flags(&root->root_item);
1083 if (flags & BTRFS_SUBVOL_RDONLY)
1084 btrfs_set_root_flags(&root->root_item,
1085 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1086 else
1087 btrfs_set_root_flags(&root->root_item,
1088 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1089
1090 trans = btrfs_start_transaction(root, 1);
1091 if (IS_ERR(trans)) {
1092 ret = PTR_ERR(trans);
1093 goto out_reset;
1094 }
1095
1096 ret = btrfs_update_root(trans, root,
1097 &root->root_key, &root->root_item);
1098
1099 btrfs_commit_transaction(trans, root);
1100out_reset:
1101 if (ret)
1102 btrfs_set_root_flags(&root->root_item, root_flags);
1103out:
1104 up_write(&root->fs_info->subvol_sem);
1105 return ret;
1106}
1107
710/* 1108/*
711 * helper to check if the subvolume references other subvolumes 1109 * helper to check if the subvolume references other subvolumes
712 */ 1110 */
@@ -743,6 +1141,322 @@ out:
743 return ret; 1141 return ret;
744} 1142}
745 1143
1144static noinline int key_in_sk(struct btrfs_key *key,
1145 struct btrfs_ioctl_search_key *sk)
1146{
1147 struct btrfs_key test;
1148 int ret;
1149
1150 test.objectid = sk->min_objectid;
1151 test.type = sk->min_type;
1152 test.offset = sk->min_offset;
1153
1154 ret = btrfs_comp_cpu_keys(key, &test);
1155 if (ret < 0)
1156 return 0;
1157
1158 test.objectid = sk->max_objectid;
1159 test.type = sk->max_type;
1160 test.offset = sk->max_offset;
1161
1162 ret = btrfs_comp_cpu_keys(key, &test);
1163 if (ret > 0)
1164 return 0;
1165 return 1;
1166}
1167
1168static noinline int copy_to_sk(struct btrfs_root *root,
1169 struct btrfs_path *path,
1170 struct btrfs_key *key,
1171 struct btrfs_ioctl_search_key *sk,
1172 char *buf,
1173 unsigned long *sk_offset,
1174 int *num_found)
1175{
1176 u64 found_transid;
1177 struct extent_buffer *leaf;
1178 struct btrfs_ioctl_search_header sh;
1179 unsigned long item_off;
1180 unsigned long item_len;
1181 int nritems;
1182 int i;
1183 int slot;
1184 int found = 0;
1185 int ret = 0;
1186
1187 leaf = path->nodes[0];
1188 slot = path->slots[0];
1189 nritems = btrfs_header_nritems(leaf);
1190
1191 if (btrfs_header_generation(leaf) > sk->max_transid) {
1192 i = nritems;
1193 goto advance_key;
1194 }
1195 found_transid = btrfs_header_generation(leaf);
1196
1197 for (i = slot; i < nritems; i++) {
1198 item_off = btrfs_item_ptr_offset(leaf, i);
1199 item_len = btrfs_item_size_nr(leaf, i);
1200
1201 if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
1202 item_len = 0;
1203
1204 if (sizeof(sh) + item_len + *sk_offset >
1205 BTRFS_SEARCH_ARGS_BUFSIZE) {
1206 ret = 1;
1207 goto overflow;
1208 }
1209
1210 btrfs_item_key_to_cpu(leaf, key, i);
1211 if (!key_in_sk(key, sk))
1212 continue;
1213
1214 sh.objectid = key->objectid;
1215 sh.offset = key->offset;
1216 sh.type = key->type;
1217 sh.len = item_len;
1218 sh.transid = found_transid;
1219
1220 /* copy search result header */
1221 memcpy(buf + *sk_offset, &sh, sizeof(sh));
1222 *sk_offset += sizeof(sh);
1223
1224 if (item_len) {
1225 char *p = buf + *sk_offset;
1226 /* copy the item */
1227 read_extent_buffer(leaf, p,
1228 item_off, item_len);
1229 *sk_offset += item_len;
1230 }
1231 found++;
1232
1233 if (*num_found >= sk->nr_items)
1234 break;
1235 }
1236advance_key:
1237 ret = 0;
1238 if (key->offset < (u64)-1 && key->offset < sk->max_offset)
1239 key->offset++;
1240 else if (key->type < (u8)-1 && key->type < sk->max_type) {
1241 key->offset = 0;
1242 key->type++;
1243 } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
1244 key->offset = 0;
1245 key->type = 0;
1246 key->objectid++;
1247 } else
1248 ret = 1;
1249overflow:
1250 *num_found += found;
1251 return ret;
1252}
1253
1254static noinline int search_ioctl(struct inode *inode,
1255 struct btrfs_ioctl_search_args *args)
1256{
1257 struct btrfs_root *root;
1258 struct btrfs_key key;
1259 struct btrfs_key max_key;
1260 struct btrfs_path *path;
1261 struct btrfs_ioctl_search_key *sk = &args->key;
1262 struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
1263 int ret;
1264 int num_found = 0;
1265 unsigned long sk_offset = 0;
1266
1267 path = btrfs_alloc_path();
1268 if (!path)
1269 return -ENOMEM;
1270
1271 if (sk->tree_id == 0) {
1272 /* search the root of the inode that was passed */
1273 root = BTRFS_I(inode)->root;
1274 } else {
1275 key.objectid = sk->tree_id;
1276 key.type = BTRFS_ROOT_ITEM_KEY;
1277 key.offset = (u64)-1;
1278 root = btrfs_read_fs_root_no_name(info, &key);
1279 if (IS_ERR(root)) {
1280 printk(KERN_ERR "could not find root %llu\n",
1281 sk->tree_id);
1282 btrfs_free_path(path);
1283 return -ENOENT;
1284 }
1285 }
1286
1287 key.objectid = sk->min_objectid;
1288 key.type = sk->min_type;
1289 key.offset = sk->min_offset;
1290
1291 max_key.objectid = sk->max_objectid;
1292 max_key.type = sk->max_type;
1293 max_key.offset = sk->max_offset;
1294
1295 path->keep_locks = 1;
1296
1297 while(1) {
1298 ret = btrfs_search_forward(root, &key, &max_key, path, 0,
1299 sk->min_transid);
1300 if (ret != 0) {
1301 if (ret > 0)
1302 ret = 0;
1303 goto err;
1304 }
1305 ret = copy_to_sk(root, path, &key, sk, args->buf,
1306 &sk_offset, &num_found);
1307 btrfs_release_path(root, path);
1308 if (ret || num_found >= sk->nr_items)
1309 break;
1310
1311 }
1312 ret = 0;
1313err:
1314 sk->nr_items = num_found;
1315 btrfs_free_path(path);
1316 return ret;
1317}
1318
1319static noinline int btrfs_ioctl_tree_search(struct file *file,
1320 void __user *argp)
1321{
1322 struct btrfs_ioctl_search_args *args;
1323 struct inode *inode;
1324 int ret;
1325
1326 if (!capable(CAP_SYS_ADMIN))
1327 return -EPERM;
1328
1329 args = memdup_user(argp, sizeof(*args));
1330 if (IS_ERR(args))
1331 return PTR_ERR(args);
1332
1333 inode = fdentry(file)->d_inode;
1334 ret = search_ioctl(inode, args);
1335 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1336 ret = -EFAULT;
1337 kfree(args);
1338 return ret;
1339}
1340
1341/*
1342 * Search INODE_REFs to identify path name of 'dirid' directory
1343 * in a 'tree_id' tree. and sets path name to 'name'.
1344 */
1345static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1346 u64 tree_id, u64 dirid, char *name)
1347{
1348 struct btrfs_root *root;
1349 struct btrfs_key key;
1350 char *ptr;
1351 int ret = -1;
1352 int slot;
1353 int len;
1354 int total_len = 0;
1355 struct btrfs_inode_ref *iref;
1356 struct extent_buffer *l;
1357 struct btrfs_path *path;
1358
1359 if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
1360 name[0]='\0';
1361 return 0;
1362 }
1363
1364 path = btrfs_alloc_path();
1365 if (!path)
1366 return -ENOMEM;
1367
1368 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
1369
1370 key.objectid = tree_id;
1371 key.type = BTRFS_ROOT_ITEM_KEY;
1372 key.offset = (u64)-1;
1373 root = btrfs_read_fs_root_no_name(info, &key);
1374 if (IS_ERR(root)) {
1375 printk(KERN_ERR "could not find root %llu\n", tree_id);
1376 ret = -ENOENT;
1377 goto out;
1378 }
1379
1380 key.objectid = dirid;
1381 key.type = BTRFS_INODE_REF_KEY;
1382 key.offset = (u64)-1;
1383
1384 while(1) {
1385 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1386 if (ret < 0)
1387 goto out;
1388
1389 l = path->nodes[0];
1390 slot = path->slots[0];
1391 if (ret > 0 && slot > 0)
1392 slot--;
1393 btrfs_item_key_to_cpu(l, &key, slot);
1394
1395 if (ret > 0 && (key.objectid != dirid ||
1396 key.type != BTRFS_INODE_REF_KEY)) {
1397 ret = -ENOENT;
1398 goto out;
1399 }
1400
1401 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
1402 len = btrfs_inode_ref_name_len(l, iref);
1403 ptr -= len + 1;
1404 total_len += len + 1;
1405 if (ptr < name)
1406 goto out;
1407
1408 *(ptr + len) = '/';
1409 read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
1410
1411 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1412 break;
1413
1414 btrfs_release_path(root, path);
1415 key.objectid = key.offset;
1416 key.offset = (u64)-1;
1417 dirid = key.objectid;
1418
1419 }
1420 if (ptr < name)
1421 goto out;
1422 memcpy(name, ptr, total_len);
1423 name[total_len]='\0';
1424 ret = 0;
1425out:
1426 btrfs_free_path(path);
1427 return ret;
1428}
1429
1430static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1431 void __user *argp)
1432{
1433 struct btrfs_ioctl_ino_lookup_args *args;
1434 struct inode *inode;
1435 int ret;
1436
1437 if (!capable(CAP_SYS_ADMIN))
1438 return -EPERM;
1439
1440 args = memdup_user(argp, sizeof(*args));
1441 if (IS_ERR(args))
1442 return PTR_ERR(args);
1443
1444 inode = fdentry(file)->d_inode;
1445
1446 if (args->treeid == 0)
1447 args->treeid = BTRFS_I(inode)->root->root_key.objectid;
1448
1449 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
1450 args->treeid, args->objectid,
1451 args->name);
1452
1453 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1454 ret = -EFAULT;
1455
1456 kfree(args);
1457 return ret;
1458}
1459
746static noinline int btrfs_ioctl_snap_destroy(struct file *file, 1460static noinline int btrfs_ioctl_snap_destroy(struct file *file,
747 void __user *arg) 1461 void __user *arg)
748{ 1462{
@@ -758,9 +1472,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
758 int ret; 1472 int ret;
759 int err = 0; 1473 int err = 0;
760 1474
761 if (!capable(CAP_SYS_ADMIN))
762 return -EPERM;
763
764 vol_args = memdup_user(arg, sizeof(*vol_args)); 1475 vol_args = memdup_user(arg, sizeof(*vol_args));
765 if (IS_ERR(vol_args)) 1476 if (IS_ERR(vol_args))
766 return PTR_ERR(vol_args); 1477 return PTR_ERR(vol_args);
@@ -790,13 +1501,51 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
790 } 1501 }
791 1502
792 inode = dentry->d_inode; 1503 inode = dentry->d_inode;
1504 dest = BTRFS_I(inode)->root;
1505 if (!capable(CAP_SYS_ADMIN)){
1506 /*
1507 * Regular user. Only allow this with a special mount
1508 * option, when the user has write+exec access to the
1509 * subvol root, and when rmdir(2) would have been
1510 * allowed.
1511 *
1512 * Note that this is _not_ check that the subvol is
1513 * empty or doesn't contain data that we wouldn't
1514 * otherwise be able to delete.
1515 *
1516 * Users who want to delete empty subvols should try
1517 * rmdir(2).
1518 */
1519 err = -EPERM;
1520 if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
1521 goto out_dput;
1522
1523 /*
1524 * Do not allow deletion if the parent dir is the same
1525 * as the dir to be deleted. That means the ioctl
1526 * must be called on the dentry referencing the root
1527 * of the subvol, not a random directory contained
1528 * within it.
1529 */
1530 err = -EINVAL;
1531 if (root == dest)
1532 goto out_dput;
1533
1534 err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
1535 if (err)
1536 goto out_dput;
1537
1538 /* check if subvolume may be deleted by a non-root user */
1539 err = btrfs_may_delete(dir, dentry, 1);
1540 if (err)
1541 goto out_dput;
1542 }
1543
793 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 1544 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) {
794 err = -EINVAL; 1545 err = -EINVAL;
795 goto out_dput; 1546 goto out_dput;
796 } 1547 }
797 1548
798 dest = BTRFS_I(inode)->root;
799
800 mutex_lock(&inode->i_mutex); 1549 mutex_lock(&inode->i_mutex);
801 err = d_invalidate(dentry); 1550 err = d_invalidate(dentry);
802 if (err) 1551 if (err)
@@ -808,7 +1557,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
808 if (err) 1557 if (err)
809 goto out_up_write; 1558 goto out_up_write;
810 1559
811 trans = btrfs_start_transaction(root, 1); 1560 trans = btrfs_start_transaction(root, 0);
1561 if (IS_ERR(trans)) {
1562 err = PTR_ERR(trans);
1563 goto out_up_write;
1564 }
1565 trans->block_rsv = &root->fs_info->global_block_rsv;
1566
812 ret = btrfs_unlink_subvol(trans, root, dir, 1567 ret = btrfs_unlink_subvol(trans, root, dir,
813 dest->root_key.objectid, 1568 dest->root_key.objectid,
814 dentry->d_name.name, 1569 dentry->d_name.name,
@@ -822,12 +1577,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
822 dest->root_item.drop_level = 0; 1577 dest->root_item.drop_level = 0;
823 btrfs_set_root_refs(&dest->root_item, 0); 1578 btrfs_set_root_refs(&dest->root_item, 0);
824 1579
825 ret = btrfs_insert_orphan_item(trans, 1580 if (!xchg(&dest->orphan_item_inserted, 1)) {
826 root->fs_info->tree_root, 1581 ret = btrfs_insert_orphan_item(trans,
827 dest->root_key.objectid); 1582 root->fs_info->tree_root,
828 BUG_ON(ret); 1583 dest->root_key.objectid);
1584 BUG_ON(ret);
1585 }
829 1586
830 ret = btrfs_commit_transaction(trans, root); 1587 ret = btrfs_end_transaction(trans, root);
831 BUG_ON(ret); 1588 BUG_ON(ret);
832 inode->i_flags |= S_DEAD; 1589 inode->i_flags |= S_DEAD;
833out_up_write: 1590out_up_write:
@@ -849,12 +1606,16 @@ out:
849 return err; 1606 return err;
850} 1607}
851 1608
852static int btrfs_ioctl_defrag(struct file *file) 1609static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
853{ 1610{
854 struct inode *inode = fdentry(file)->d_inode; 1611 struct inode *inode = fdentry(file)->d_inode;
855 struct btrfs_root *root = BTRFS_I(inode)->root; 1612 struct btrfs_root *root = BTRFS_I(inode)->root;
1613 struct btrfs_ioctl_defrag_range_args *range;
856 int ret; 1614 int ret;
857 1615
1616 if (btrfs_root_readonly(root))
1617 return -EROFS;
1618
858 ret = mnt_want_write(file->f_path.mnt); 1619 ret = mnt_want_write(file->f_path.mnt);
859 if (ret) 1620 if (ret)
860 return ret; 1621 return ret;
@@ -865,16 +1626,44 @@ static int btrfs_ioctl_defrag(struct file *file)
865 ret = -EPERM; 1626 ret = -EPERM;
866 goto out; 1627 goto out;
867 } 1628 }
868 btrfs_defrag_root(root, 0); 1629 ret = btrfs_defrag_root(root, 0);
869 btrfs_defrag_root(root->fs_info->extent_root, 0); 1630 if (ret)
1631 goto out;
1632 ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
870 break; 1633 break;
871 case S_IFREG: 1634 case S_IFREG:
872 if (!(file->f_mode & FMODE_WRITE)) { 1635 if (!(file->f_mode & FMODE_WRITE)) {
873 ret = -EINVAL; 1636 ret = -EINVAL;
874 goto out; 1637 goto out;
875 } 1638 }
876 btrfs_defrag_file(file); 1639
1640 range = kzalloc(sizeof(*range), GFP_KERNEL);
1641 if (!range) {
1642 ret = -ENOMEM;
1643 goto out;
1644 }
1645
1646 if (argp) {
1647 if (copy_from_user(range, argp,
1648 sizeof(*range))) {
1649 ret = -EFAULT;
1650 kfree(range);
1651 goto out;
1652 }
1653 /* compression requires us to start the IO */
1654 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1655 range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
1656 range->extent_thresh = (u32)-1;
1657 }
1658 } else {
1659 /* the rest are all set to zero by kzalloc */
1660 range->len = (u64)-1;
1661 }
1662 ret = btrfs_defrag_file(file, range);
1663 kfree(range);
877 break; 1664 break;
1665 default:
1666 ret = -EINVAL;
878 } 1667 }
879out: 1668out:
880 mnt_drop_write(file->f_path.mnt); 1669 mnt_drop_write(file->f_path.mnt);
@@ -952,9 +1741,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
952 */ 1741 */
953 1742
954 /* the destination must be opened for writing */ 1743 /* the destination must be opened for writing */
955 if (!(file->f_mode & FMODE_WRITE)) 1744 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
956 return -EINVAL; 1745 return -EINVAL;
957 1746
1747 if (btrfs_root_readonly(root))
1748 return -EROFS;
1749
958 ret = mnt_want_write(file->f_path.mnt); 1750 ret = mnt_want_write(file->f_path.mnt);
959 if (ret) 1751 if (ret)
960 return ret; 1752 return ret;
@@ -964,12 +1756,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
964 ret = -EBADF; 1756 ret = -EBADF;
965 goto out_drop_write; 1757 goto out_drop_write;
966 } 1758 }
1759
967 src = src_file->f_dentry->d_inode; 1760 src = src_file->f_dentry->d_inode;
968 1761
969 ret = -EINVAL; 1762 ret = -EINVAL;
970 if (src == inode) 1763 if (src == inode)
971 goto out_fput; 1764 goto out_fput;
972 1765
1766 /* the src must be open for reading */
1767 if (!(src_file->f_mode & FMODE_READ))
1768 goto out_fput;
1769
973 ret = -EISDIR; 1770 ret = -EISDIR;
974 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) 1771 if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
975 goto out_fput; 1772 goto out_fput;
@@ -991,27 +1788,26 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
991 path->reada = 2; 1788 path->reada = 2;
992 1789
993 if (inode < src) { 1790 if (inode < src) {
994 mutex_lock(&inode->i_mutex); 1791 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
995 mutex_lock(&src->i_mutex); 1792 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
996 } else { 1793 } else {
997 mutex_lock(&src->i_mutex); 1794 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
998 mutex_lock(&inode->i_mutex); 1795 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
999 } 1796 }
1000 1797
1001 /* determine range to clone */ 1798 /* determine range to clone */
1002 ret = -EINVAL; 1799 ret = -EINVAL;
1003 if (off >= src->i_size || off + len > src->i_size) 1800 if (off + len > src->i_size || off + len < off)
1004 goto out_unlock; 1801 goto out_unlock;
1005 if (len == 0) 1802 if (len == 0)
1006 olen = len = src->i_size - off; 1803 olen = len = src->i_size - off;
1007 /* if we extend to eof, continue to block boundary */ 1804 /* if we extend to eof, continue to block boundary */
1008 if (off + len == src->i_size) 1805 if (off + len == src->i_size)
1009 len = ((src->i_size + bs-1) & ~(bs-1)) 1806 len = ALIGN(src->i_size, bs) - off;
1010 - off;
1011 1807
1012 /* verify the end result is block aligned */ 1808 /* verify the end result is block aligned */
1013 if ((off & (bs-1)) || 1809 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
1014 ((off + len) & (bs-1))) 1810 !IS_ALIGNED(destoff, bs))
1015 goto out_unlock; 1811 goto out_unlock;
1016 1812
1017 /* do any pending delalloc/csum calc on src, one way or 1813 /* do any pending delalloc/csum calc on src, one way or
@@ -1019,21 +1815,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1019 while (1) { 1815 while (1) {
1020 struct btrfs_ordered_extent *ordered; 1816 struct btrfs_ordered_extent *ordered;
1021 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1817 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1022 ordered = btrfs_lookup_first_ordered_extent(inode, off+len); 1818 ordered = btrfs_lookup_first_ordered_extent(src, off+len);
1023 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) 1819 if (!ordered &&
1820 !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
1821 EXTENT_DELALLOC, 0, NULL))
1024 break; 1822 break;
1025 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1823 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1026 if (ordered) 1824 if (ordered)
1027 btrfs_put_ordered_extent(ordered); 1825 btrfs_put_ordered_extent(ordered);
1028 btrfs_wait_ordered_range(src, off, off+len); 1826 btrfs_wait_ordered_range(src, off, len);
1029 } 1827 }
1030 1828
1031 trans = btrfs_start_transaction(root, 1);
1032 BUG_ON(!trans);
1033
1034 /* punch hole in destination first */
1035 btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
1036
1037 /* clone data */ 1829 /* clone data */
1038 key.objectid = src->i_ino; 1830 key.objectid = src->i_ino;
1039 key.type = BTRFS_EXTENT_DATA_KEY; 1831 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1044,7 +1836,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1044 * note the key will change type as we walk through the 1836 * note the key will change type as we walk through the
1045 * tree. 1837 * tree.
1046 */ 1838 */
1047 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 1839 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1048 if (ret < 0) 1840 if (ret < 0)
1049 goto out; 1841 goto out;
1050 1842
@@ -1073,6 +1865,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1073 u64 disko = 0, diskl = 0; 1865 u64 disko = 0, diskl = 0;
1074 u64 datao = 0, datal = 0; 1866 u64 datao = 0, datal = 0;
1075 u8 comp; 1867 u8 comp;
1868 u64 endoff;
1076 1869
1077 size = btrfs_item_size_nr(leaf, slot); 1870 size = btrfs_item_size_nr(leaf, slot);
1078 read_extent_buffer(leaf, buf, 1871 read_extent_buffer(leaf, buf,
@@ -1099,7 +1892,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1099 } 1892 }
1100 btrfs_release_path(root, path); 1893 btrfs_release_path(root, path);
1101 1894
1102 if (key.offset + datal < off || 1895 if (key.offset + datal <= off ||
1103 key.offset >= off+len) 1896 key.offset >= off+len)
1104 goto next; 1897 goto next;
1105 1898
@@ -1107,12 +1900,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1107 new_key.objectid = inode->i_ino; 1900 new_key.objectid = inode->i_ino;
1108 new_key.offset = key.offset + destoff - off; 1901 new_key.offset = key.offset + destoff - off;
1109 1902
1903 trans = btrfs_start_transaction(root, 1);
1904 if (IS_ERR(trans)) {
1905 ret = PTR_ERR(trans);
1906 goto out;
1907 }
1908
1110 if (type == BTRFS_FILE_EXTENT_REG || 1909 if (type == BTRFS_FILE_EXTENT_REG ||
1111 type == BTRFS_FILE_EXTENT_PREALLOC) { 1910 type == BTRFS_FILE_EXTENT_PREALLOC) {
1911 if (off > key.offset) {
1912 datao += off - key.offset;
1913 datal -= off - key.offset;
1914 }
1915
1916 if (key.offset + datal > off + len)
1917 datal = off + len - key.offset;
1918
1919 ret = btrfs_drop_extents(trans, inode,
1920 new_key.offset,
1921 new_key.offset + datal,
1922 &hint_byte, 1);
1923 BUG_ON(ret);
1924
1112 ret = btrfs_insert_empty_item(trans, root, path, 1925 ret = btrfs_insert_empty_item(trans, root, path,
1113 &new_key, size); 1926 &new_key, size);
1114 if (ret) 1927 BUG_ON(ret);
1115 goto out;
1116 1928
1117 leaf = path->nodes[0]; 1929 leaf = path->nodes[0];
1118 slot = path->slots[0]; 1930 slot = path->slots[0];
@@ -1123,14 +1935,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1123 extent = btrfs_item_ptr(leaf, slot, 1935 extent = btrfs_item_ptr(leaf, slot,
1124 struct btrfs_file_extent_item); 1936 struct btrfs_file_extent_item);
1125 1937
1126 if (off > key.offset) {
1127 datao += off - key.offset;
1128 datal -= off - key.offset;
1129 }
1130
1131 if (key.offset + datal > off + len)
1132 datal = off + len - key.offset;
1133
1134 /* disko == 0 means it's a hole */ 1938 /* disko == 0 means it's a hole */
1135 if (!disko) 1939 if (!disko)
1136 datao = 0; 1940 datao = 0;
@@ -1161,14 +1965,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1161 1965
1162 if (comp && (skip || trim)) { 1966 if (comp && (skip || trim)) {
1163 ret = -EINVAL; 1967 ret = -EINVAL;
1968 btrfs_end_transaction(trans, root);
1164 goto out; 1969 goto out;
1165 } 1970 }
1166 size -= skip + trim; 1971 size -= skip + trim;
1167 datal -= skip + trim; 1972 datal -= skip + trim;
1973
1974 ret = btrfs_drop_extents(trans, inode,
1975 new_key.offset,
1976 new_key.offset + datal,
1977 &hint_byte, 1);
1978 BUG_ON(ret);
1979
1168 ret = btrfs_insert_empty_item(trans, root, path, 1980 ret = btrfs_insert_empty_item(trans, root, path,
1169 &new_key, size); 1981 &new_key, size);
1170 if (ret) 1982 BUG_ON(ret);
1171 goto out;
1172 1983
1173 if (skip) { 1984 if (skip) {
1174 u32 start = 1985 u32 start =
@@ -1186,8 +1997,26 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1186 } 1997 }
1187 1998
1188 btrfs_mark_buffer_dirty(leaf); 1999 btrfs_mark_buffer_dirty(leaf);
1189 } 2000 btrfs_release_path(root, path);
1190 2001
2002 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
2003
2004 /*
2005 * we round up to the block size at eof when
2006 * determining which extents to clone above,
2007 * but shouldn't round up the file size
2008 */
2009 endoff = new_key.offset + datal;
2010 if (endoff > destoff+olen)
2011 endoff = destoff+olen;
2012 if (endoff > inode->i_size)
2013 btrfs_i_size_write(inode, endoff);
2014
2015 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
2016 ret = btrfs_update_inode(trans, root, inode);
2017 BUG_ON(ret);
2018 btrfs_end_transaction(trans, root);
2019 }
1191next: 2020next:
1192 btrfs_release_path(root, path); 2021 btrfs_release_path(root, path);
1193 key.offset++; 2022 key.offset++;
@@ -1195,17 +2024,7 @@ next:
1195 ret = 0; 2024 ret = 0;
1196out: 2025out:
1197 btrfs_release_path(root, path); 2026 btrfs_release_path(root, path);
1198 if (ret == 0) {
1199 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1200 if (destoff + olen > inode->i_size)
1201 btrfs_i_size_write(inode, destoff + olen);
1202 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1203 ret = btrfs_update_inode(trans, root, inode);
1204 }
1205 btrfs_end_transaction(trans, root);
1206 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 2027 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1207 if (ret)
1208 vmtruncate(inode, 0);
1209out_unlock: 2028out_unlock:
1210 mutex_unlock(&src->i_mutex); 2029 mutex_unlock(&src->i_mutex);
1211 mutex_unlock(&inode->i_mutex); 2030 mutex_unlock(&inode->i_mutex);
@@ -1249,6 +2068,10 @@ static long btrfs_ioctl_trans_start(struct file *file)
1249 if (file->private_data) 2068 if (file->private_data)
1250 goto out; 2069 goto out;
1251 2070
2071 ret = -EROFS;
2072 if (btrfs_root_readonly(root))
2073 goto out;
2074
1252 ret = mnt_want_write(file->f_path.mnt); 2075 ret = mnt_want_write(file->f_path.mnt);
1253 if (ret) 2076 if (ret)
1254 goto out; 2077 goto out;
@@ -1274,6 +2097,209 @@ out:
1274 return ret; 2097 return ret;
1275} 2098}
1276 2099
2100static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2101{
2102 struct inode *inode = fdentry(file)->d_inode;
2103 struct btrfs_root *root = BTRFS_I(inode)->root;
2104 struct btrfs_root *new_root;
2105 struct btrfs_dir_item *di;
2106 struct btrfs_trans_handle *trans;
2107 struct btrfs_path *path;
2108 struct btrfs_key location;
2109 struct btrfs_disk_key disk_key;
2110 struct btrfs_super_block *disk_super;
2111 u64 features;
2112 u64 objectid = 0;
2113 u64 dir_id;
2114
2115 if (!capable(CAP_SYS_ADMIN))
2116 return -EPERM;
2117
2118 if (copy_from_user(&objectid, argp, sizeof(objectid)))
2119 return -EFAULT;
2120
2121 if (!objectid)
2122 objectid = root->root_key.objectid;
2123
2124 location.objectid = objectid;
2125 location.type = BTRFS_ROOT_ITEM_KEY;
2126 location.offset = (u64)-1;
2127
2128 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
2129 if (IS_ERR(new_root))
2130 return PTR_ERR(new_root);
2131
2132 if (btrfs_root_refs(&new_root->root_item) == 0)
2133 return -ENOENT;
2134
2135 path = btrfs_alloc_path();
2136 if (!path)
2137 return -ENOMEM;
2138 path->leave_spinning = 1;
2139
2140 trans = btrfs_start_transaction(root, 1);
2141 if (!trans) {
2142 btrfs_free_path(path);
2143 return -ENOMEM;
2144 }
2145
2146 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
2147 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2148 dir_id, "default", 7, 1);
2149 if (IS_ERR_OR_NULL(di)) {
2150 btrfs_free_path(path);
2151 btrfs_end_transaction(trans, root);
2152 printk(KERN_ERR "Umm, you don't have the default dir item, "
2153 "this isn't going to work\n");
2154 return -ENOENT;
2155 }
2156
2157 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
2158 btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
2159 btrfs_mark_buffer_dirty(path->nodes[0]);
2160 btrfs_free_path(path);
2161
2162 disk_super = &root->fs_info->super_copy;
2163 features = btrfs_super_incompat_flags(disk_super);
2164 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2165 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
2166 btrfs_set_super_incompat_flags(disk_super, features);
2167 }
2168 btrfs_end_transaction(trans, root);
2169
2170 return 0;
2171}
2172
2173static void get_block_group_info(struct list_head *groups_list,
2174 struct btrfs_ioctl_space_info *space)
2175{
2176 struct btrfs_block_group_cache *block_group;
2177
2178 space->total_bytes = 0;
2179 space->used_bytes = 0;
2180 space->flags = 0;
2181 list_for_each_entry(block_group, groups_list, list) {
2182 space->flags = block_group->flags;
2183 space->total_bytes += block_group->key.offset;
2184 space->used_bytes +=
2185 btrfs_block_group_used(&block_group->item);
2186 }
2187}
2188
2189long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
2190{
2191 struct btrfs_ioctl_space_args space_args;
2192 struct btrfs_ioctl_space_info space;
2193 struct btrfs_ioctl_space_info *dest;
2194 struct btrfs_ioctl_space_info *dest_orig;
2195 struct btrfs_ioctl_space_info *user_dest;
2196 struct btrfs_space_info *info;
2197 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2198 BTRFS_BLOCK_GROUP_SYSTEM,
2199 BTRFS_BLOCK_GROUP_METADATA,
2200 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
2201 int num_types = 4;
2202 int alloc_size;
2203 int ret = 0;
2204 int slot_count = 0;
2205 int i, c;
2206
2207 if (copy_from_user(&space_args,
2208 (struct btrfs_ioctl_space_args __user *)arg,
2209 sizeof(space_args)))
2210 return -EFAULT;
2211
2212 for (i = 0; i < num_types; i++) {
2213 struct btrfs_space_info *tmp;
2214
2215 info = NULL;
2216 rcu_read_lock();
2217 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
2218 list) {
2219 if (tmp->flags == types[i]) {
2220 info = tmp;
2221 break;
2222 }
2223 }
2224 rcu_read_unlock();
2225
2226 if (!info)
2227 continue;
2228
2229 down_read(&info->groups_sem);
2230 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2231 if (!list_empty(&info->block_groups[c]))
2232 slot_count++;
2233 }
2234 up_read(&info->groups_sem);
2235 }
2236
2237 /* space_slots == 0 means they are asking for a count */
2238 if (space_args.space_slots == 0) {
2239 space_args.total_spaces = slot_count;
2240 goto out;
2241 }
2242
2243 slot_count = min_t(int, space_args.space_slots, slot_count);
2244
2245 alloc_size = sizeof(*dest) * slot_count;
2246
2247 /* we generally have at most 6 or so space infos, one for each raid
2248 * level. So, a whole page should be more than enough for everyone
2249 */
2250 if (alloc_size > PAGE_CACHE_SIZE)
2251 return -ENOMEM;
2252
2253 space_args.total_spaces = 0;
2254 dest = kmalloc(alloc_size, GFP_NOFS);
2255 if (!dest)
2256 return -ENOMEM;
2257 dest_orig = dest;
2258
2259 /* now we have a buffer to copy into */
2260 for (i = 0; i < num_types; i++) {
2261 struct btrfs_space_info *tmp;
2262
2263 info = NULL;
2264 rcu_read_lock();
2265 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
2266 list) {
2267 if (tmp->flags == types[i]) {
2268 info = tmp;
2269 break;
2270 }
2271 }
2272 rcu_read_unlock();
2273
2274 if (!info)
2275 continue;
2276 down_read(&info->groups_sem);
2277 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2278 if (!list_empty(&info->block_groups[c])) {
2279 get_block_group_info(&info->block_groups[c],
2280 &space);
2281 memcpy(dest, &space, sizeof(space));
2282 dest++;
2283 space_args.total_spaces++;
2284 }
2285 }
2286 up_read(&info->groups_sem);
2287 }
2288
2289 user_dest = (struct btrfs_ioctl_space_info *)
2290 (arg + sizeof(struct btrfs_ioctl_space_args));
2291
2292 if (copy_to_user(user_dest, dest_orig, alloc_size))
2293 ret = -EFAULT;
2294
2295 kfree(dest_orig);
2296out:
2297 if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
2298 ret = -EFAULT;
2299
2300 return ret;
2301}
2302
1277/* 2303/*
1278 * there are many ways the trans_start and trans_end ioctls can lead 2304 * there are many ways the trans_start and trans_end ioctls can lead
1279 * to deadlocks. They should only be used by applications that 2305 * to deadlocks. They should only be used by applications that
@@ -1301,6 +2327,36 @@ long btrfs_ioctl_trans_end(struct file *file)
1301 return 0; 2327 return 0;
1302} 2328}
1303 2329
2330static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
2331{
2332 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2333 struct btrfs_trans_handle *trans;
2334 u64 transid;
2335
2336 trans = btrfs_start_transaction(root, 0);
2337 transid = trans->transid;
2338 btrfs_commit_transaction_async(trans, root, 0);
2339
2340 if (argp)
2341 if (copy_to_user(argp, &transid, sizeof(transid)))
2342 return -EFAULT;
2343 return 0;
2344}
2345
2346static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
2347{
2348 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2349 u64 transid;
2350
2351 if (argp) {
2352 if (copy_from_user(&transid, argp, sizeof(transid)))
2353 return -EFAULT;
2354 } else {
2355 transid = 0; /* current trans */
2356 }
2357 return btrfs_wait_for_commit(root, transid);
2358}
2359
1304long btrfs_ioctl(struct file *file, unsigned int 2360long btrfs_ioctl(struct file *file, unsigned int
1305 cmd, unsigned long arg) 2361 cmd, unsigned long arg)
1306{ 2362{
@@ -1316,12 +2372,22 @@ long btrfs_ioctl(struct file *file, unsigned int
1316 return btrfs_ioctl_getversion(file, argp); 2372 return btrfs_ioctl_getversion(file, argp);
1317 case BTRFS_IOC_SNAP_CREATE: 2373 case BTRFS_IOC_SNAP_CREATE:
1318 return btrfs_ioctl_snap_create(file, argp, 0); 2374 return btrfs_ioctl_snap_create(file, argp, 0);
2375 case BTRFS_IOC_SNAP_CREATE_V2:
2376 return btrfs_ioctl_snap_create_v2(file, argp, 0);
1319 case BTRFS_IOC_SUBVOL_CREATE: 2377 case BTRFS_IOC_SUBVOL_CREATE:
1320 return btrfs_ioctl_snap_create(file, argp, 1); 2378 return btrfs_ioctl_snap_create(file, argp, 1);
1321 case BTRFS_IOC_SNAP_DESTROY: 2379 case BTRFS_IOC_SNAP_DESTROY:
1322 return btrfs_ioctl_snap_destroy(file, argp); 2380 return btrfs_ioctl_snap_destroy(file, argp);
2381 case BTRFS_IOC_SUBVOL_GETFLAGS:
2382 return btrfs_ioctl_subvol_getflags(file, argp);
2383 case BTRFS_IOC_SUBVOL_SETFLAGS:
2384 return btrfs_ioctl_subvol_setflags(file, argp);
2385 case BTRFS_IOC_DEFAULT_SUBVOL:
2386 return btrfs_ioctl_default_subvol(file, argp);
1323 case BTRFS_IOC_DEFRAG: 2387 case BTRFS_IOC_DEFRAG:
1324 return btrfs_ioctl_defrag(file); 2388 return btrfs_ioctl_defrag(file, NULL);
2389 case BTRFS_IOC_DEFRAG_RANGE:
2390 return btrfs_ioctl_defrag(file, argp);
1325 case BTRFS_IOC_RESIZE: 2391 case BTRFS_IOC_RESIZE:
1326 return btrfs_ioctl_resize(root, argp); 2392 return btrfs_ioctl_resize(root, argp);
1327 case BTRFS_IOC_ADD_DEV: 2393 case BTRFS_IOC_ADD_DEV:
@@ -1338,9 +2404,19 @@ long btrfs_ioctl(struct file *file, unsigned int
1338 return btrfs_ioctl_trans_start(file); 2404 return btrfs_ioctl_trans_start(file);
1339 case BTRFS_IOC_TRANS_END: 2405 case BTRFS_IOC_TRANS_END:
1340 return btrfs_ioctl_trans_end(file); 2406 return btrfs_ioctl_trans_end(file);
2407 case BTRFS_IOC_TREE_SEARCH:
2408 return btrfs_ioctl_tree_search(file, argp);
2409 case BTRFS_IOC_INO_LOOKUP:
2410 return btrfs_ioctl_ino_lookup(file, argp);
2411 case BTRFS_IOC_SPACE_INFO:
2412 return btrfs_ioctl_space_info(root, argp);
1341 case BTRFS_IOC_SYNC: 2413 case BTRFS_IOC_SYNC:
1342 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2414 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1343 return 0; 2415 return 0;
2416 case BTRFS_IOC_START_SYNC:
2417 return btrfs_ioctl_start_sync(file, argp);
2418 case BTRFS_IOC_WAIT_SYNC:
2419 return btrfs_ioctl_wait_sync(file, argp);
1344 } 2420 }
1345 2421
1346 return -ENOTTY; 2422 return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914475eb..8fb382167b13 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,20 +22,141 @@
22 22
23#define BTRFS_IOCTL_MAGIC 0x94 23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255 24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 4087
26 25
27/* this should be 4k */ 26/* this should be 4k */
27#define BTRFS_PATH_NAME_MAX 4087
28struct btrfs_ioctl_vol_args { 28struct btrfs_ioctl_vol_args {
29 __s64 fd; 29 __s64 fd;
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35
36#define BTRFS_SUBVOL_NAME_MAX 4039
37struct btrfs_ioctl_vol_args_v2 {
38 __s64 fd;
39 __u64 transid;
40 __u64 flags;
41 __u64 unused[4];
42 char name[BTRFS_SUBVOL_NAME_MAX + 1];
43};
44
45#define BTRFS_INO_LOOKUP_PATH_MAX 4080
46struct btrfs_ioctl_ino_lookup_args {
47 __u64 treeid;
48 __u64 objectid;
49 char name[BTRFS_INO_LOOKUP_PATH_MAX];
50};
51
52struct btrfs_ioctl_search_key {
53 /* which root are we searching. 0 is the tree of tree roots */
54 __u64 tree_id;
55
56 /* keys returned will be >= min and <= max */
57 __u64 min_objectid;
58 __u64 max_objectid;
59
60 /* keys returned will be >= min and <= max */
61 __u64 min_offset;
62 __u64 max_offset;
63
64 /* max and min transids to search for */
65 __u64 min_transid;
66 __u64 max_transid;
67
68 /* keys returned will be >= min and <= max */
69 __u32 min_type;
70 __u32 max_type;
71
72 /*
73 * how many items did userland ask for, and how many are we
74 * returning
75 */
76 __u32 nr_items;
77
78 /* align to 64 bits */
79 __u32 unused;
80
81 /* some extra for later */
82 __u64 unused1;
83 __u64 unused2;
84 __u64 unused3;
85 __u64 unused4;
86};
87
88struct btrfs_ioctl_search_header {
89 __u64 transid;
90 __u64 objectid;
91 __u64 offset;
92 __u32 type;
93 __u32 len;
94};
95
96#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
97/*
98 * the buf is an array of search headers where
99 * each header is followed by the actual item
100 * the type field is expanded to 32 bits for alignment
101 */
102struct btrfs_ioctl_search_args {
103 struct btrfs_ioctl_search_key key;
104 char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
105};
106
33struct btrfs_ioctl_clone_range_args { 107struct btrfs_ioctl_clone_range_args {
34 __s64 src_fd; 108 __s64 src_fd;
35 __u64 src_offset, src_length; 109 __u64 src_offset, src_length;
36 __u64 dest_offset; 110 __u64 dest_offset;
37}; 111};
38 112
113/* flags for the defrag range ioctl */
114#define BTRFS_DEFRAG_RANGE_COMPRESS 1
115#define BTRFS_DEFRAG_RANGE_START_IO 2
116
117struct btrfs_ioctl_defrag_range_args {
118 /* start of the defrag operation */
119 __u64 start;
120
121 /* number of bytes to defrag, use (u64)-1 to say all */
122 __u64 len;
123
124 /*
125 * flags for the operation, which can include turning
126 * on compression for this one defrag
127 */
128 __u64 flags;
129
130 /*
131 * any extent bigger than this will be considered
132 * already defragged. Use 0 to take the kernel default
133 * Use 1 to say every single extent must be rewritten
134 */
135 __u32 extent_thresh;
136
137 /*
138 * which compression method to use if turning on compression
139 * for this defrag operation. If unspecified, zlib will
140 * be used
141 */
142 __u32 compress_type;
143
144 /* spare for later */
145 __u32 unused[4];
146};
147
148struct btrfs_ioctl_space_info {
149 __u64 flags;
150 __u64 total_bytes;
151 __u64 used_bytes;
152};
153
/*
 * Argument block for BTRFS_IOC_SPACE_INFO.  Userspace sets space_slots
 * to the number of trailing btrfs_ioctl_space_info slots it allocated
 * after this struct (0 to just ask how many are needed); the kernel
 * sets total_spaces and fills in at most space_slots entries.
 */
struct btrfs_ioctl_space_args {
	__u64 space_slots;
	__u64 total_spaces;
	struct btrfs_ioctl_space_info spaces[0];
};
159
39#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 160#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
40 struct btrfs_ioctl_vol_args) 161 struct btrfs_ioctl_vol_args)
41#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 162#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -67,4 +188,19 @@ struct btrfs_ioctl_clone_range_args {
67 struct btrfs_ioctl_vol_args) 188 struct btrfs_ioctl_vol_args)
68#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ 189#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
69 struct btrfs_ioctl_vol_args) 190 struct btrfs_ioctl_vol_args)
191#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
192 struct btrfs_ioctl_defrag_range_args)
193#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
194 struct btrfs_ioctl_search_args)
195#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
196 struct btrfs_ioctl_ino_lookup_args)
197#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
198#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
199 struct btrfs_ioctl_space_args)
200#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
201#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
202#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
203 struct btrfs_ioctl_vol_args_v2)
204#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
205#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
70#endif 206#endif
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 1c36e5cd8f55..6151f2ea38bb 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/gfp.h>
20#include <linux/pagemap.h> 19#include <linux/pagemap.h>
21#include <linux/spinlock.h> 20#include <linux/spinlock.h>
22#include <linux/page-flags.h> 21#include <linux/page-flags.h>
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000000..cc9b450399df
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,420 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/slab.h>
21#include <linux/vmalloc.h>
22#include <linux/init.h>
23#include <linux/err.h>
24#include <linux/sched.h>
25#include <linux/pagemap.h>
26#include <linux/bio.h>
27#include <linux/lzo.h>
28#include "compression.h"
29
30#define LZO_LEN 4
31
32struct workspace {
33 void *mem;
34 void *buf; /* where compressed data goes */
35 void *cbuf; /* where decompressed data goes */
36 struct list_head list;
37};
38
39static void lzo_free_workspace(struct list_head *ws)
40{
41 struct workspace *workspace = list_entry(ws, struct workspace, list);
42
43 vfree(workspace->buf);
44 vfree(workspace->cbuf);
45 vfree(workspace->mem);
46 kfree(workspace);
47}
48
49static struct list_head *lzo_alloc_workspace(void)
50{
51 struct workspace *workspace;
52
53 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
54 if (!workspace)
55 return ERR_PTR(-ENOMEM);
56
57 workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
58 workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
59 workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
60 if (!workspace->mem || !workspace->buf || !workspace->cbuf)
61 goto fail;
62
63 INIT_LIST_HEAD(&workspace->list);
64
65 return &workspace->list;
66fail:
67 lzo_free_workspace(&workspace->list);
68 return ERR_PTR(-ENOMEM);
69}
70
/*
 * Store a chunk length as a 4-byte little-endian value at @buf.
 * memcpy is used because @buf may not be 4-byte aligned.
 */
static inline void write_compress_length(char *buf, size_t len)
{
	__le32 dlen;

	dlen = cpu_to_le32(len);
	memcpy(buf, &dlen, LZO_LEN);
}
78
/*
 * Read back a 4-byte little-endian chunk length written by
 * write_compress_length().  memcpy avoids unaligned loads.
 */
static inline size_t read_compress_length(char *buf)
{
	__le32 dlen;

	memcpy(&dlen, buf, LZO_LEN);
	return le32_to_cpu(dlen);
}
86
87static int lzo_compress_pages(struct list_head *ws,
88 struct address_space *mapping,
89 u64 start, unsigned long len,
90 struct page **pages,
91 unsigned long nr_dest_pages,
92 unsigned long *out_pages,
93 unsigned long *total_in,
94 unsigned long *total_out,
95 unsigned long max_out)
96{
97 struct workspace *workspace = list_entry(ws, struct workspace, list);
98 int ret = 0;
99 char *data_in;
100 char *cpage_out;
101 int nr_pages = 0;
102 struct page *in_page = NULL;
103 struct page *out_page = NULL;
104 unsigned long bytes_left;
105
106 size_t in_len;
107 size_t out_len;
108 char *buf;
109 unsigned long tot_in = 0;
110 unsigned long tot_out = 0;
111 unsigned long pg_bytes_left;
112 unsigned long out_offset;
113 unsigned long bytes;
114
115 *out_pages = 0;
116 *total_out = 0;
117 *total_in = 0;
118
119 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
120 data_in = kmap(in_page);
121
122 /*
123 * store the size of all chunks of compressed data in
124 * the first 4 bytes
125 */
126 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
127 if (out_page == NULL) {
128 ret = -ENOMEM;
129 goto out;
130 }
131 cpage_out = kmap(out_page);
132 out_offset = LZO_LEN;
133 tot_out = LZO_LEN;
134 pages[0] = out_page;
135 nr_pages = 1;
136 pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
137
138 /* compress at most one page of data each time */
139 in_len = min(len, PAGE_CACHE_SIZE);
140 while (tot_in < len) {
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) {
144 printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
145 ret);
146 ret = -1;
147 goto out;
148 }
149
150 /* store the size of this chunk of compressed data */
151 write_compress_length(cpage_out + out_offset, out_len);
152 tot_out += LZO_LEN;
153 out_offset += LZO_LEN;
154 pg_bytes_left -= LZO_LEN;
155
156 tot_in += in_len;
157 tot_out += out_len;
158
159 /* copy bytes from the working buffer into the pages */
160 buf = workspace->cbuf;
161 while (out_len) {
162 bytes = min_t(unsigned long, pg_bytes_left, out_len);
163
164 memcpy(cpage_out + out_offset, buf, bytes);
165
166 out_len -= bytes;
167 pg_bytes_left -= bytes;
168 buf += bytes;
169 out_offset += bytes;
170
171 /*
172 * we need another page for writing out.
173 *
174 * Note if there's less than 4 bytes left, we just
175 * skip to a new page.
176 */
177 if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
178 pg_bytes_left == 0) {
179 if (pg_bytes_left) {
180 memset(cpage_out + out_offset, 0,
181 pg_bytes_left);
182 tot_out += pg_bytes_left;
183 }
184
185 /* we're done, don't allocate new page */
186 if (out_len == 0 && tot_in >= len)
187 break;
188
189 kunmap(out_page);
190 if (nr_pages == nr_dest_pages) {
191 out_page = NULL;
192 ret = -1;
193 goto out;
194 }
195
196 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
197 if (out_page == NULL) {
198 ret = -ENOMEM;
199 goto out;
200 }
201 cpage_out = kmap(out_page);
202 pages[nr_pages++] = out_page;
203
204 pg_bytes_left = PAGE_CACHE_SIZE;
205 out_offset = 0;
206 }
207 }
208
209 /* we're making it bigger, give up */
210 if (tot_in > 8192 && tot_in < tot_out)
211 goto out;
212
213 /* we're all done */
214 if (tot_in >= len)
215 break;
216
217 if (tot_out > max_out)
218 break;
219
220 bytes_left = len - tot_in;
221 kunmap(in_page);
222 page_cache_release(in_page);
223
224 start += PAGE_CACHE_SIZE;
225 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
226 data_in = kmap(in_page);
227 in_len = min(bytes_left, PAGE_CACHE_SIZE);
228 }
229
230 if (tot_out > tot_in)
231 goto out;
232
233 /* store the size of all chunks of compressed data */
234 cpage_out = kmap(pages[0]);
235 write_compress_length(cpage_out, tot_out);
236
237 kunmap(pages[0]);
238
239 ret = 0;
240 *total_out = tot_out;
241 *total_in = tot_in;
242out:
243 *out_pages = nr_pages;
244 if (out_page)
245 kunmap(out_page);
246
247 if (in_page) {
248 kunmap(in_page);
249 page_cache_release(in_page);
250 }
251
252 return ret;
253}
254
/*
 * Decompress a whole lzo compressed extent into the pages of @bvec.
 *
 * The extent layout matches what lzo_compress_pages() wrote: a 4-byte
 * total length at the front of the first page, then a stream of
 * (4-byte length, compressed payload) chunks, where a length header is
 * never split across a page boundary.  Each chunk inflates to at most
 * one page and is pushed into the bio pages via
 * btrfs_decompress_buf2page().
 *
 * Returns 0 on success, -1 on a truncated stream or lzo error.
 */
static int lzo_decompress_biovec(struct list_head *ws,
				 struct page **pages_in,
				 u64 disk_start,
				 struct bio_vec *bvec,
				 int vcnt,
				 size_t srclen)
{
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	int ret = 0, ret2;
	char *data_in;
	unsigned long page_in_index = 0;
	unsigned long page_out_index = 0;
	unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
					PAGE_CACHE_SIZE;
	unsigned long buf_start;
	unsigned long buf_offset = 0;
	unsigned long bytes;
	unsigned long working_bytes;
	unsigned long pg_offset;

	size_t in_len;
	size_t out_len;
	unsigned long in_offset;
	unsigned long in_page_bytes_left;
	unsigned long tot_in;
	unsigned long tot_out;
	unsigned long tot_len;
	char *buf;

	data_in = kmap(pages_in[0]);
	tot_len = read_compress_length(data_in);

	tot_in = LZO_LEN;
	in_offset = LZO_LEN;
	/* don't trust the on-disk total beyond what we actually read */
	tot_len = min_t(size_t, srclen, tot_len);
	in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;

	tot_out = 0;
	pg_offset = 0;

	while (tot_in < tot_len) {
		in_len = read_compress_length(data_in + in_offset);
		in_page_bytes_left -= LZO_LEN;
		in_offset += LZO_LEN;
		tot_in += LZO_LEN;

		tot_in += in_len;
		working_bytes = in_len;

		/* fast path: avoid using the working buffer */
		if (in_page_bytes_left >= in_len) {
			buf = data_in + in_offset;
			bytes = in_len;
			/*
			 * jump into the copy loop with buf/bytes aimed
			 * straight at the mapped page; the loop body then
			 * only advances the input cursors
			 */
			goto cont;
		}

		/* copy bytes from the pages into the working buffer */
		buf = workspace->cbuf;
		buf_offset = 0;
		while (working_bytes) {
			bytes = min(working_bytes, in_page_bytes_left);

			memcpy(buf + buf_offset, data_in + in_offset, bytes);
			buf_offset += bytes;
cont:
			working_bytes -= bytes;
			in_page_bytes_left -= bytes;
			in_offset += bytes;

			/* check if we need to pick another page */
			if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
			    || in_page_bytes_left == 0) {
				/* skip the zero padding before the next header */
				tot_in += in_page_bytes_left;

				if (working_bytes == 0 && tot_in >= tot_len)
					break;

				kunmap(pages_in[page_in_index]);
				page_in_index++;
				if (page_in_index >= total_pages_in) {
					/* stream truncated mid-chunk */
					ret = -1;
					data_in = NULL;
					goto done;
				}
				data_in = kmap(pages_in[page_in_index]);

				in_page_bytes_left = PAGE_CACHE_SIZE;
				in_offset = 0;
			}
		}

		out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
		ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
					    &out_len);
		if (ret != LZO_E_OK) {
			printk(KERN_WARNING "btrfs decompress failed\n");
			ret = -1;
			break;
		}

		buf_start = tot_out;
		tot_out += out_len;

		/* ret2 == 0 means every bio page has been filled */
		ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
						 tot_out, disk_start,
						 bvec, vcnt,
						 &page_out_index, &pg_offset);
		if (ret2 == 0)
			break;
	}
done:
	if (data_in)
		kunmap(pages_in[page_in_index]);
	return ret;
}
370
/*
 * Decompress the first chunk of an lzo extent into @dest_page, for the
 * single-page read path.  @data_in points at the start of the extent:
 * 4 bytes of total length, then the first chunk's 4-byte length and
 * payload.  Only that first chunk is inflated; @bytes of output are
 * copied starting @start_byte bytes into the inflated data.
 *
 * Returns 0 on success, -1 when decompression fails or the chunk does
 * not reach @start_byte.
 *
 * NOTE(review): tot_len is read but never used, in_len is not validated
 * against srclen, and when fewer than @destlen bytes are produced the
 * tail of @dest_page is left untouched — presumably the callers cope,
 * but worth confirming.
 */
static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
			  struct page *dest_page,
			  unsigned long start_byte,
			  size_t srclen, size_t destlen)
{
	struct workspace *workspace = list_entry(ws, struct workspace, list);
	size_t in_len;
	size_t out_len;
	size_t tot_len;
	int ret = 0;
	char *kaddr;
	unsigned long bytes;

	BUG_ON(srclen < LZO_LEN);

	tot_len = read_compress_length(data_in);
	data_in += LZO_LEN;

	in_len = read_compress_length(data_in);
	data_in += LZO_LEN;

	out_len = PAGE_CACHE_SIZE;
	ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
	if (ret != LZO_E_OK) {
		printk(KERN_WARNING "btrfs decompress failed!\n");
		ret = -1;
		goto out;
	}

	/* the requested offset lies beyond what this chunk produced */
	if (out_len < start_byte) {
		ret = -1;
		goto out;
	}

	bytes = min_t(unsigned long, destlen, out_len - start_byte);

	kaddr = kmap_atomic(dest_page, KM_USER0);
	memcpy(kaddr, workspace->buf + start_byte, bytes);
	kunmap_atomic(kaddr, KM_USER0);
out:
	return ret;
}
413
/*
 * Hook table handed to the generic btrfs compression layer (see
 * compression.h) for the lzo compression type.
 */
struct btrfs_compress_op btrfs_lzo_compress = {
	.alloc_workspace = lzo_alloc_workspace,
	.free_workspace = lzo_free_workspace,
	.compress_pages = lzo_compress_pages,
	.decompress_biovec = lzo_decompress_biovec,
	.decompress = lzo_decompress,
};
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c2a9e78a949..2b61e1ddcd99 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -16,7 +16,6 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/gfp.h>
20#include <linux/slab.h> 19#include <linux/slab.h>
21#include <linux/blkdev.h> 20#include <linux/blkdev.h>
22#include <linux/writeback.h> 21#include <linux/writeback.h>
@@ -125,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
125 return 1; 124 return 1;
126} 125}
127 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
128/* 136/*
129 * look find the first ordered struct that has this offset, otherwise 137 * look find the first ordered struct that has this offset, otherwise
130 * the first one less than this offset 138 * the first one less than this offset
@@ -162,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
162 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
163 * inserted. 171 * inserted.
164 */ 172 */
165int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
166 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio, int compress_type)
167{ 176{
168 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
169 struct rb_node *node; 178 struct rb_node *node;
@@ -174,36 +183,65 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 if (!entry) 183 if (!entry)
175 return -ENOMEM; 184 return -ENOMEM;
176 185
177 mutex_lock(&tree->mutex);
178 entry->file_offset = file_offset; 186 entry->file_offset = file_offset;
179 entry->start = start; 187 entry->start = start;
180 entry->len = len; 188 entry->len = len;
181 entry->disk_len = disk_len; 189 entry->disk_len = disk_len;
182 entry->bytes_left = len; 190 entry->bytes_left = len;
183 entry->inode = inode; 191 entry->inode = inode;
192 entry->compress_type = compress_type;
184 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 193 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
185 set_bit(type, &entry->flags); 194 set_bit(type, &entry->flags);
186 195
196 if (dio)
197 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
198
187 /* one ref for the tree */ 199 /* one ref for the tree */
188 atomic_set(&entry->refs, 1); 200 atomic_set(&entry->refs, 1);
189 init_waitqueue_head(&entry->wait); 201 init_waitqueue_head(&entry->wait);
190 INIT_LIST_HEAD(&entry->list); 202 INIT_LIST_HEAD(&entry->list);
191 INIT_LIST_HEAD(&entry->root_extent_list); 203 INIT_LIST_HEAD(&entry->root_extent_list);
192 204
205 spin_lock(&tree->lock);
193 node = tree_insert(&tree->tree, file_offset, 206 node = tree_insert(&tree->tree, file_offset,
194 &entry->rb_node); 207 &entry->rb_node);
195 BUG_ON(node); 208 BUG_ON(node);
209 spin_unlock(&tree->lock);
196 210
197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 211 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
198 list_add_tail(&entry->root_extent_list, 212 list_add_tail(&entry->root_extent_list,
199 &BTRFS_I(inode)->root->fs_info->ordered_extents); 213 &BTRFS_I(inode)->root->fs_info->ordered_extents);
200 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 214 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
201 215
202 mutex_unlock(&tree->mutex);
203 BUG_ON(node); 216 BUG_ON(node);
204 return 0; 217 return 0;
205} 218}
206 219
/*
 * Record a new ordered extent for ordinary buffered writeback:
 * uncompressed data, not direct IO.  See __btrfs_add_ordered_extent()
 * for the details; returns 0 or -ENOMEM.
 */
int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
			     u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  BTRFS_COMPRESS_NONE);
}
227
/*
 * Record a new ordered extent for an O_DIRECT write: identical to
 * btrfs_add_ordered_extent() except that BTRFS_ORDERED_DIRECT is set
 * on the entry's flags.
 */
int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
				 u64 start, u64 len, u64 disk_len, int type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 1,
					  BTRFS_COMPRESS_NONE);
}
235
/*
 * Record a new ordered extent whose data was written compressed;
 * @compress_type is stored on the entry (entry->compress_type) for the
 * completion path.  Not used for direct IO.
 */
int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
				      u64 start, u64 len, u64 disk_len,
				      int type, int compress_type)
{
	return __btrfs_add_ordered_extent(inode, file_offset, start, len,
					  disk_len, type, 0,
					  compress_type);
}
244
207/* 245/*
208 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 246 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
209 * when an ordered extent is finished. If the list covers more than one 247 * when an ordered extent is finished. If the list covers more than one
@@ -216,14 +254,81 @@ int btrfs_add_ordered_sum(struct inode *inode,
216 struct btrfs_ordered_inode_tree *tree; 254 struct btrfs_ordered_inode_tree *tree;
217 255
218 tree = &BTRFS_I(inode)->ordered_tree; 256 tree = &BTRFS_I(inode)->ordered_tree;
219 mutex_lock(&tree->mutex); 257 spin_lock(&tree->lock);
220 list_add_tail(&sum->list, &entry->list); 258 list_add_tail(&sum->list, &entry->list);
221 mutex_unlock(&tree->mutex); 259 spin_unlock(&tree->lock);
222 return 0; 260 return 0;
223} 261}
224 262
225/* 263/*
226 * this is used to account for finished IO across a given range 264 * this is used to account for finished IO across a given range
265 * of the file. The IO may span ordered extents. If
266 * a given ordered_extent is completely done, 1 is returned, otherwise
267 * 0.
268 *
269 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
270 * to make sure this function only returns 1 once for a given ordered extent.
271 *
272 * file_offset is updated to one byte past the range that is recorded as
273 * complete. This allows you to walk forward in the file.
274 */
275int btrfs_dec_test_first_ordered_pending(struct inode *inode,
276 struct btrfs_ordered_extent **cached,
277 u64 *file_offset, u64 io_size)
278{
279 struct btrfs_ordered_inode_tree *tree;
280 struct rb_node *node;
281 struct btrfs_ordered_extent *entry = NULL;
282 int ret;
283 u64 dec_end;
284 u64 dec_start;
285 u64 to_dec;
286
287 tree = &BTRFS_I(inode)->ordered_tree;
288 spin_lock(&tree->lock);
289 node = tree_search(tree, *file_offset);
290 if (!node) {
291 ret = 1;
292 goto out;
293 }
294
295 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
296 if (!offset_in_entry(entry, *file_offset)) {
297 ret = 1;
298 goto out;
299 }
300
301 dec_start = max(*file_offset, entry->file_offset);
302 dec_end = min(*file_offset + io_size, entry->file_offset +
303 entry->len);
304 *file_offset = dec_end;
305 if (dec_start > dec_end) {
306 printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
307 (unsigned long long)dec_start,
308 (unsigned long long)dec_end);
309 }
310 to_dec = dec_end - dec_start;
311 if (to_dec > entry->bytes_left) {
312 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
313 (unsigned long long)entry->bytes_left,
314 (unsigned long long)to_dec);
315 }
316 entry->bytes_left -= to_dec;
317 if (entry->bytes_left == 0)
318 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
319 else
320 ret = 1;
321out:
322 if (!ret && cached && entry) {
323 *cached = entry;
324 atomic_inc(&entry->refs);
325 }
326 spin_unlock(&tree->lock);
327 return ret == 0;
328}
329
330/*
331 * this is used to account for finished IO across a given range
227 * of the file. The IO should not span ordered extents. If 332 * of the file. The IO should not span ordered extents. If
228 * a given ordered_extent is completely done, 1 is returned, otherwise 333 * a given ordered_extent is completely done, 1 is returned, otherwise
229 * 0. 334 * 0.
@@ -232,15 +337,16 @@ int btrfs_add_ordered_sum(struct inode *inode,
232 * to make sure this function only returns 1 once for a given ordered extent. 337 * to make sure this function only returns 1 once for a given ordered extent.
233 */ 338 */
234int btrfs_dec_test_ordered_pending(struct inode *inode, 339int btrfs_dec_test_ordered_pending(struct inode *inode,
340 struct btrfs_ordered_extent **cached,
235 u64 file_offset, u64 io_size) 341 u64 file_offset, u64 io_size)
236{ 342{
237 struct btrfs_ordered_inode_tree *tree; 343 struct btrfs_ordered_inode_tree *tree;
238 struct rb_node *node; 344 struct rb_node *node;
239 struct btrfs_ordered_extent *entry; 345 struct btrfs_ordered_extent *entry = NULL;
240 int ret; 346 int ret;
241 347
242 tree = &BTRFS_I(inode)->ordered_tree; 348 tree = &BTRFS_I(inode)->ordered_tree;
243 mutex_lock(&tree->mutex); 349 spin_lock(&tree->lock);
244 node = tree_search(tree, file_offset); 350 node = tree_search(tree, file_offset);
245 if (!node) { 351 if (!node) {
246 ret = 1; 352 ret = 1;
@@ -264,7 +370,11 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
264 else 370 else
265 ret = 1; 371 ret = 1;
266out: 372out:
267 mutex_unlock(&tree->mutex); 373 if (!ret && cached && entry) {
374 *cached = entry;
375 atomic_inc(&entry->refs);
376 }
377 spin_unlock(&tree->lock);
268 return ret == 0; 378 return ret == 0;
269} 379}
270 380
@@ -291,13 +401,14 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
291 401
292/* 402/*
293 * remove an ordered extent from the tree. No references are dropped 403 * remove an ordered extent from the tree. No references are dropped
294 * and you must wake_up entry->wait. You must hold the tree mutex 404 * and you must wake_up entry->wait. You must hold the tree lock
295 * while you call this function. 405 * while you call this function.
296 */ 406 */
297static int __btrfs_remove_ordered_extent(struct inode *inode, 407static int __btrfs_remove_ordered_extent(struct inode *inode,
298 struct btrfs_ordered_extent *entry) 408 struct btrfs_ordered_extent *entry)
299{ 409{
300 struct btrfs_ordered_inode_tree *tree; 410 struct btrfs_ordered_inode_tree *tree;
411 struct btrfs_root *root = BTRFS_I(inode)->root;
301 struct rb_node *node; 412 struct rb_node *node;
302 413
303 tree = &BTRFS_I(inode)->ordered_tree; 414 tree = &BTRFS_I(inode)->ordered_tree;
@@ -306,13 +417,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
306 tree->last = NULL; 417 tree->last = NULL;
307 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 418 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
308 419
309 spin_lock(&BTRFS_I(inode)->accounting_lock); 420 spin_lock(&root->fs_info->ordered_extent_lock);
310 BTRFS_I(inode)->outstanding_extents--;
311 spin_unlock(&BTRFS_I(inode)->accounting_lock);
312 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
313 inode, 1);
314
315 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
316 list_del_init(&entry->root_extent_list); 421 list_del_init(&entry->root_extent_list);
317 422
318 /* 423 /*
@@ -324,7 +429,7 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
324 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) { 429 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
325 list_del_init(&BTRFS_I(inode)->ordered_operations); 430 list_del_init(&BTRFS_I(inode)->ordered_operations);
326 } 431 }
327 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 432 spin_unlock(&root->fs_info->ordered_extent_lock);
328 433
329 return 0; 434 return 0;
330} 435}
@@ -340,9 +445,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
340 int ret; 445 int ret;
341 446
342 tree = &BTRFS_I(inode)->ordered_tree; 447 tree = &BTRFS_I(inode)->ordered_tree;
343 mutex_lock(&tree->mutex); 448 spin_lock(&tree->lock);
344 ret = __btrfs_remove_ordered_extent(inode, entry); 449 ret = __btrfs_remove_ordered_extent(inode, entry);
345 mutex_unlock(&tree->mutex); 450 spin_unlock(&tree->lock);
346 wake_up(&entry->wait); 451 wake_up(&entry->wait);
347 452
348 return ret; 453 return ret;
@@ -485,7 +590,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
485 * start IO on any dirty ones so the wait doesn't stall waiting 590 * start IO on any dirty ones so the wait doesn't stall waiting
486 * for pdflush to find them 591 * for pdflush to find them
487 */ 592 */
488 filemap_fdatawrite_range(inode->i_mapping, start, end); 593 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
594 filemap_fdatawrite_range(inode->i_mapping, start, end);
489 if (wait) { 595 if (wait) {
490 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 596 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
491 &entry->flags)); 597 &entry->flags));
@@ -499,7 +605,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
499{ 605{
500 u64 end; 606 u64 end;
501 u64 orig_end; 607 u64 orig_end;
502 u64 wait_end;
503 struct btrfs_ordered_extent *ordered; 608 struct btrfs_ordered_extent *ordered;
504 int found; 609 int found;
505 610
@@ -510,7 +615,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
510 if (orig_end > INT_LIMIT(loff_t)) 615 if (orig_end > INT_LIMIT(loff_t))
511 orig_end = INT_LIMIT(loff_t); 616 orig_end = INT_LIMIT(loff_t);
512 } 617 }
513 wait_end = orig_end;
514again: 618again:
515 /* start IO across the range first to instantiate any delalloc 619 /* start IO across the range first to instantiate any delalloc
516 * extents 620 * extents
@@ -567,7 +671,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
567 struct btrfs_ordered_extent *entry = NULL; 671 struct btrfs_ordered_extent *entry = NULL;
568 672
569 tree = &BTRFS_I(inode)->ordered_tree; 673 tree = &BTRFS_I(inode)->ordered_tree;
570 mutex_lock(&tree->mutex); 674 spin_lock(&tree->lock);
571 node = tree_search(tree, file_offset); 675 node = tree_search(tree, file_offset);
572 if (!node) 676 if (!node)
573 goto out; 677 goto out;
@@ -578,7 +682,48 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
578 if (entry) 682 if (entry)
579 atomic_inc(&entry->refs); 683 atomic_inc(&entry->refs);
580out: 684out:
581 mutex_unlock(&tree->mutex); 685 spin_unlock(&tree->lock);
686 return entry;
687}
688
689/* Since the DIO code tries to lock a wide area we need to look for any ordered
690 * extents that exist in the range, rather than just the start of the range.
691 */
692struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
693 u64 file_offset,
694 u64 len)
695{
696 struct btrfs_ordered_inode_tree *tree;
697 struct rb_node *node;
698 struct btrfs_ordered_extent *entry = NULL;
699
700 tree = &BTRFS_I(inode)->ordered_tree;
701 spin_lock(&tree->lock);
702 node = tree_search(tree, file_offset);
703 if (!node) {
704 node = tree_search(tree, file_offset + len);
705 if (!node)
706 goto out;
707 }
708
709 while (1) {
710 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
711 if (range_overlaps(entry, file_offset, len))
712 break;
713
714 if (entry->file_offset >= file_offset + len) {
715 entry = NULL;
716 break;
717 }
718 entry = NULL;
719 node = rb_next(node);
720 if (!node)
721 break;
722 }
723out:
724 if (entry)
725 atomic_inc(&entry->refs);
726 spin_unlock(&tree->lock);
582 return entry; 727 return entry;
583} 728}
584 729
@@ -594,7 +739,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
594 struct btrfs_ordered_extent *entry = NULL; 739 struct btrfs_ordered_extent *entry = NULL;
595 740
596 tree = &BTRFS_I(inode)->ordered_tree; 741 tree = &BTRFS_I(inode)->ordered_tree;
597 mutex_lock(&tree->mutex); 742 spin_lock(&tree->lock);
598 node = tree_search(tree, file_offset); 743 node = tree_search(tree, file_offset);
599 if (!node) 744 if (!node)
600 goto out; 745 goto out;
@@ -602,7 +747,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
602 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 747 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
603 atomic_inc(&entry->refs); 748 atomic_inc(&entry->refs);
604out: 749out:
605 mutex_unlock(&tree->mutex); 750 spin_unlock(&tree->lock);
606 return entry; 751 return entry;
607} 752}
608 753
@@ -629,7 +774,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
629 else 774 else
630 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); 775 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
631 776
632 mutex_lock(&tree->mutex); 777 spin_lock(&tree->lock);
633 disk_i_size = BTRFS_I(inode)->disk_i_size; 778 disk_i_size = BTRFS_I(inode)->disk_i_size;
634 779
635 /* truncate file */ 780 /* truncate file */
@@ -735,7 +880,7 @@ out:
735 */ 880 */
736 if (ordered) 881 if (ordered)
737 __btrfs_remove_ordered_extent(inode, ordered); 882 __btrfs_remove_ordered_extent(inode, ordered);
738 mutex_unlock(&tree->mutex); 883 spin_unlock(&tree->lock);
739 if (ordered) 884 if (ordered)
740 wake_up(&ordered->wait); 885 wake_up(&ordered->wait);
741 return ret; 886 return ret;
@@ -762,7 +907,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
762 if (!ordered) 907 if (!ordered)
763 return 1; 908 return 1;
764 909
765 mutex_lock(&tree->mutex); 910 spin_lock(&tree->lock);
766 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 911 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
767 if (disk_bytenr >= ordered_sum->bytenr) { 912 if (disk_bytenr >= ordered_sum->bytenr) {
768 num_sectors = ordered_sum->len / sectorsize; 913 num_sectors = ordered_sum->len / sectorsize;
@@ -777,7 +922,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
777 } 922 }
778 } 923 }
779out: 924out:
780 mutex_unlock(&tree->mutex); 925 spin_unlock(&tree->lock);
781 btrfs_put_ordered_extent(ordered); 926 btrfs_put_ordered_extent(ordered);
782 return ret; 927 return ret;
783} 928}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 1fe1282ef47c..ff1f69aa1883 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -21,7 +21,7 @@
21 21
22/* one of these per inode */ 22/* one of these per inode */
23struct btrfs_ordered_inode_tree { 23struct btrfs_ordered_inode_tree {
24 struct mutex mutex; 24 spinlock_t lock;
25 struct rb_root tree; 25 struct rb_root tree;
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
@@ -68,10 +68,12 @@ struct btrfs_ordered_sum {
68 68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70 70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ 71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -91,6 +93,9 @@ struct btrfs_ordered_extent {
91 /* flags (described above) */ 93 /* flags (described above) */
92 unsigned long flags; 94 unsigned long flags;
93 95
96 /* compression algorithm */
97 int compress_type;
98
94 /* reference count */ 99 /* reference count */
95 atomic_t refs; 100 atomic_t refs;
96 101
@@ -128,8 +133,8 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
128static inline void 133static inline void
129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) 134btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
130{ 135{
131 mutex_init(&t->mutex); 136 spin_lock_init(&t->lock);
132 t->tree.rb_node = NULL; 137 t->tree = RB_ROOT;
133 t->last = NULL; 138 t->last = NULL;
134} 139}
135 140
@@ -137,9 +142,18 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
137int btrfs_remove_ordered_extent(struct inode *inode, 142int btrfs_remove_ordered_extent(struct inode *inode,
138 struct btrfs_ordered_extent *entry); 143 struct btrfs_ordered_extent *entry);
139int btrfs_dec_test_ordered_pending(struct inode *inode, 144int btrfs_dec_test_ordered_pending(struct inode *inode,
140 u64 file_offset, u64 io_size); 145 struct btrfs_ordered_extent **cached,
146 u64 file_offset, u64 io_size);
147int btrfs_dec_test_first_ordered_pending(struct inode *inode,
148 struct btrfs_ordered_extent **cached,
149 u64 *file_offset, u64 io_size);
141int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 150int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
142 u64 start, u64 len, u64 disk_len, int tyep); 151 u64 start, u64 len, u64 disk_len, int type);
152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
153 u64 start, u64 len, u64 disk_len, int type);
154int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
155 u64 start, u64 len, u64 disk_len,
156 int type, int compress_type);
143int btrfs_add_ordered_sum(struct inode *inode, 157int btrfs_add_ordered_sum(struct inode *inode,
144 struct btrfs_ordered_extent *entry, 158 struct btrfs_ordered_extent *entry,
145 struct btrfs_ordered_sum *sum); 159 struct btrfs_ordered_sum *sum);
@@ -150,6 +164,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
150int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 164int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
151struct btrfs_ordered_extent * 165struct btrfs_ordered_extent *
152btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 166btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
167struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
168 u64 file_offset,
169 u64 len);
153int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 170int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
154 struct btrfs_ordered_extent *ordered); 171 struct btrfs_ordered_extent *ordered);
155int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 172int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 79cba5fbc28e..f8be250963a0 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
56 return -ENOMEM; 56 return -ENOMEM;
57 57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret) 59 if (ret < 0)
60 goto out; 60 goto out;
61 if (ret) {
62 ret = -ENOENT;
63 goto out;
64 }
61 65
62 ret = btrfs_del_item(trans, root, path); 66 ret = btrfs_del_item(trans, root, path);
63 67
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index d0cc62bccb94..a97314cf6bd6 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include <linux/sort.h> 21#include <linux/sort.h>
21#include "ctree.h" 22#include "ctree.h"
22#include "ref-cache.h" 23#include "ref-cache.h"
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index bc283ad2db73..e2a55cb2072b 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -52,7 +52,7 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents)
52 52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) 53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{ 54{
55 tree->root.rb_node = NULL; 55 tree->root = RB_ROOT;
56 INIT_LIST_HEAD(&tree->list); 56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock); 57 spin_lock_init(&tree->lock);
58} 58}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ed3e4a2ec2c8..045c9c2b2d7e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -21,6 +21,7 @@
21#include <linux/writeback.h> 21#include <linux/writeback.h>
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/slab.h>
24#include "ctree.h" 25#include "ctree.h"
25#include "disk-io.h" 26#include "disk-io.h"
26#include "transaction.h" 27#include "transaction.h"
@@ -28,6 +29,7 @@
28#include "locking.h" 29#include "locking.h"
29#include "btrfs_inode.h" 30#include "btrfs_inode.h"
30#include "async-thread.h" 31#include "async-thread.h"
32#include "free-space-cache.h"
31 33
32/* 34/*
33 * backref_node, mapping_node and tree_block start with this 35 * backref_node, mapping_node and tree_block start with this
@@ -43,8 +45,12 @@ struct tree_entry {
43struct backref_node { 45struct backref_node {
44 struct rb_node rb_node; 46 struct rb_node rb_node;
45 u64 bytenr; 47 u64 bytenr;
46 /* objectid tree block owner */ 48
49 u64 new_bytenr;
50 /* objectid of tree block owner, can be not uptodate */
47 u64 owner; 51 u64 owner;
52 /* link to pending, changed or detached list */
53 struct list_head list;
48 /* list of upper level blocks reference this block */ 54 /* list of upper level blocks reference this block */
49 struct list_head upper; 55 struct list_head upper;
50 /* list of child blocks in the cache */ 56 /* list of child blocks in the cache */
@@ -55,9 +61,9 @@ struct backref_node {
55 struct extent_buffer *eb; 61 struct extent_buffer *eb;
56 /* level of tree block */ 62 /* level of tree block */
57 unsigned int level:8; 63 unsigned int level:8;
58 /* 1 if the block is root of old snapshot */ 64 /* is the block in non-reference counted tree */
59 unsigned int old_root:1; 65 unsigned int cowonly:1;
60 /* 1 if no child blocks in the cache */ 66 /* 1 if no child node in the cache */
61 unsigned int lowest:1; 67 unsigned int lowest:1;
62 /* is the extent buffer locked */ 68 /* is the extent buffer locked */
63 unsigned int locked:1; 69 unsigned int locked:1;
@@ -65,6 +71,16 @@ struct backref_node {
65 unsigned int processed:1; 71 unsigned int processed:1;
66 /* have backrefs of this block been checked */ 72 /* have backrefs of this block been checked */
67 unsigned int checked:1; 73 unsigned int checked:1;
74 /*
75 * 1 if corresponding block has been cowed but some upper
76 * level block pointers may not point to the new location
77 */
78 unsigned int pending:1;
79 /*
80 * 1 if the backref node isn't connected to any other
81 * backref node.
82 */
83 unsigned int detached:1;
68}; 84};
69 85
70/* 86/*
@@ -73,7 +89,6 @@ struct backref_node {
73struct backref_edge { 89struct backref_edge {
74 struct list_head list[2]; 90 struct list_head list[2];
75 struct backref_node *node[2]; 91 struct backref_node *node[2];
76 u64 blockptr;
77}; 92};
78 93
79#define LOWER 0 94#define LOWER 0
@@ -82,9 +97,25 @@ struct backref_edge {
82struct backref_cache { 97struct backref_cache {
83 /* red black tree of all backref nodes in the cache */ 98 /* red black tree of all backref nodes in the cache */
84 struct rb_root rb_root; 99 struct rb_root rb_root;
85 /* list of backref nodes with no child block in the cache */ 100 /* for passing backref nodes to btrfs_reloc_cow_block */
101 struct backref_node *path[BTRFS_MAX_LEVEL];
102 /*
103 * list of blocks that have been cowed but some block
104 * pointers in upper level blocks may not reflect the
105 * new location
106 */
86 struct list_head pending[BTRFS_MAX_LEVEL]; 107 struct list_head pending[BTRFS_MAX_LEVEL];
87 spinlock_t lock; 108 /* list of backref nodes with no child node */
109 struct list_head leaves;
110 /* list of blocks that have been cowed in current transaction */
111 struct list_head changed;
112 /* list of detached backref node. */
113 struct list_head detached;
114
115 u64 last_trans;
116
117 int nr_nodes;
118 int nr_edges;
88}; 119};
89 120
90/* 121/*
@@ -112,15 +143,6 @@ struct tree_block {
112 unsigned int key_ready:1; 143 unsigned int key_ready:1;
113}; 144};
114 145
115/* inode vector */
116#define INODEVEC_SIZE 16
117
118struct inodevec {
119 struct list_head list;
120 struct inode *inode[INODEVEC_SIZE];
121 int nr;
122};
123
124#define MAX_EXTENTS 128 146#define MAX_EXTENTS 128
125 147
126struct file_extent_cluster { 148struct file_extent_cluster {
@@ -137,58 +159,128 @@ struct reloc_control {
137 struct btrfs_root *extent_root; 159 struct btrfs_root *extent_root;
138 /* inode for moving data */ 160 /* inode for moving data */
139 struct inode *data_inode; 161 struct inode *data_inode;
140 struct btrfs_workers workers; 162
163 struct btrfs_block_rsv *block_rsv;
164
165 struct backref_cache backref_cache;
166
167 struct file_extent_cluster cluster;
141 /* tree blocks have been processed */ 168 /* tree blocks have been processed */
142 struct extent_io_tree processed_blocks; 169 struct extent_io_tree processed_blocks;
143 /* map start of tree root to corresponding reloc tree */ 170 /* map start of tree root to corresponding reloc tree */
144 struct mapping_tree reloc_root_tree; 171 struct mapping_tree reloc_root_tree;
145 /* list of reloc trees */ 172 /* list of reloc trees */
146 struct list_head reloc_roots; 173 struct list_head reloc_roots;
174 /* size of metadata reservation for merging reloc trees */
175 u64 merging_rsv_size;
176 /* size of relocated tree nodes */
177 u64 nodes_relocated;
178
147 u64 search_start; 179 u64 search_start;
148 u64 extents_found; 180 u64 extents_found;
149 u64 extents_skipped; 181
150 int stage; 182 unsigned int stage:8;
151 int create_reloc_root; 183 unsigned int create_reloc_tree:1;
184 unsigned int merge_reloc_tree:1;
152 unsigned int found_file_extent:1; 185 unsigned int found_file_extent:1;
153 unsigned int found_old_snapshot:1; 186 unsigned int commit_transaction:1;
154}; 187};
155 188
156/* stages of data relocation */ 189/* stages of data relocation */
157#define MOVE_DATA_EXTENTS 0 190#define MOVE_DATA_EXTENTS 0
158#define UPDATE_DATA_PTRS 1 191#define UPDATE_DATA_PTRS 1
159 192
160/* 193static void remove_backref_node(struct backref_cache *cache,
161 * merge reloc tree to corresponding fs tree in worker threads 194 struct backref_node *node);
162 */ 195static void __mark_block_processed(struct reloc_control *rc,
163struct async_merge { 196 struct backref_node *node);
164 struct btrfs_work work;
165 struct reloc_control *rc;
166 struct btrfs_root *root;
167 struct completion *done;
168 atomic_t *num_pending;
169};
170 197
171static void mapping_tree_init(struct mapping_tree *tree) 198static void mapping_tree_init(struct mapping_tree *tree)
172{ 199{
173 tree->rb_root.rb_node = NULL; 200 tree->rb_root = RB_ROOT;
174 spin_lock_init(&tree->lock); 201 spin_lock_init(&tree->lock);
175} 202}
176 203
177static void backref_cache_init(struct backref_cache *cache) 204static void backref_cache_init(struct backref_cache *cache)
178{ 205{
179 int i; 206 int i;
180 cache->rb_root.rb_node = NULL; 207 cache->rb_root = RB_ROOT;
181 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 208 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
182 INIT_LIST_HEAD(&cache->pending[i]); 209 INIT_LIST_HEAD(&cache->pending[i]);
183 spin_lock_init(&cache->lock); 210 INIT_LIST_HEAD(&cache->changed);
211 INIT_LIST_HEAD(&cache->detached);
212 INIT_LIST_HEAD(&cache->leaves);
184} 213}
185 214
186static void backref_node_init(struct backref_node *node) 215static void backref_cache_cleanup(struct backref_cache *cache)
187{ 216{
188 memset(node, 0, sizeof(*node)); 217 struct backref_node *node;
189 INIT_LIST_HEAD(&node->upper); 218 int i;
190 INIT_LIST_HEAD(&node->lower); 219
191 RB_CLEAR_NODE(&node->rb_node); 220 while (!list_empty(&cache->detached)) {
221 node = list_entry(cache->detached.next,
222 struct backref_node, list);
223 remove_backref_node(cache, node);
224 }
225
226 while (!list_empty(&cache->leaves)) {
227 node = list_entry(cache->leaves.next,
228 struct backref_node, lower);
229 remove_backref_node(cache, node);
230 }
231
232 cache->last_trans = 0;
233
234 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
235 BUG_ON(!list_empty(&cache->pending[i]));
236 BUG_ON(!list_empty(&cache->changed));
237 BUG_ON(!list_empty(&cache->detached));
238 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
239 BUG_ON(cache->nr_nodes);
240 BUG_ON(cache->nr_edges);
241}
242
243static struct backref_node *alloc_backref_node(struct backref_cache *cache)
244{
245 struct backref_node *node;
246
247 node = kzalloc(sizeof(*node), GFP_NOFS);
248 if (node) {
249 INIT_LIST_HEAD(&node->list);
250 INIT_LIST_HEAD(&node->upper);
251 INIT_LIST_HEAD(&node->lower);
252 RB_CLEAR_NODE(&node->rb_node);
253 cache->nr_nodes++;
254 }
255 return node;
256}
257
258static void free_backref_node(struct backref_cache *cache,
259 struct backref_node *node)
260{
261 if (node) {
262 cache->nr_nodes--;
263 kfree(node);
264 }
265}
266
267static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
268{
269 struct backref_edge *edge;
270
271 edge = kzalloc(sizeof(*edge), GFP_NOFS);
272 if (edge)
273 cache->nr_edges++;
274 return edge;
275}
276
277static void free_backref_edge(struct backref_cache *cache,
278 struct backref_edge *edge)
279{
280 if (edge) {
281 cache->nr_edges--;
282 kfree(edge);
283 }
192} 284}
193 285
194static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 286static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -249,6 +341,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
249 edges[idx++] = edge; 341 edges[idx++] = edge;
250 node = edge->node[UPPER]; 342 node = edge->node[UPPER];
251 } 343 }
344 BUG_ON(node->detached);
252 *index = idx; 345 *index = idx;
253 return node; 346 return node;
254} 347}
@@ -280,13 +373,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
280 return NULL; 373 return NULL;
281} 374}
282 375
376static void unlock_node_buffer(struct backref_node *node)
377{
378 if (node->locked) {
379 btrfs_tree_unlock(node->eb);
380 node->locked = 0;
381 }
382}
383
283static void drop_node_buffer(struct backref_node *node) 384static void drop_node_buffer(struct backref_node *node)
284{ 385{
285 if (node->eb) { 386 if (node->eb) {
286 if (node->locked) { 387 unlock_node_buffer(node);
287 btrfs_tree_unlock(node->eb);
288 node->locked = 0;
289 }
290 free_extent_buffer(node->eb); 388 free_extent_buffer(node->eb);
291 node->eb = NULL; 389 node->eb = NULL;
292 } 390 }
@@ -295,14 +393,14 @@ static void drop_node_buffer(struct backref_node *node)
295static void drop_backref_node(struct backref_cache *tree, 393static void drop_backref_node(struct backref_cache *tree,
296 struct backref_node *node) 394 struct backref_node *node)
297{ 395{
298 BUG_ON(!node->lowest);
299 BUG_ON(!list_empty(&node->upper)); 396 BUG_ON(!list_empty(&node->upper));
300 397
301 drop_node_buffer(node); 398 drop_node_buffer(node);
399 list_del(&node->list);
302 list_del(&node->lower); 400 list_del(&node->lower);
303 401 if (!RB_EMPTY_NODE(&node->rb_node))
304 rb_erase(&node->rb_node, &tree->rb_root); 402 rb_erase(&node->rb_node, &tree->rb_root);
305 kfree(node); 403 free_backref_node(tree, node);
306} 404}
307 405
308/* 406/*
@@ -317,27 +415,121 @@ static void remove_backref_node(struct backref_cache *cache,
317 if (!node) 415 if (!node)
318 return; 416 return;
319 417
320 BUG_ON(!node->lowest); 418 BUG_ON(!node->lowest && !node->detached);
321 while (!list_empty(&node->upper)) { 419 while (!list_empty(&node->upper)) {
322 edge = list_entry(node->upper.next, struct backref_edge, 420 edge = list_entry(node->upper.next, struct backref_edge,
323 list[LOWER]); 421 list[LOWER]);
324 upper = edge->node[UPPER]; 422 upper = edge->node[UPPER];
325 list_del(&edge->list[LOWER]); 423 list_del(&edge->list[LOWER]);
326 list_del(&edge->list[UPPER]); 424 list_del(&edge->list[UPPER]);
327 kfree(edge); 425 free_backref_edge(cache, edge);
426
427 if (RB_EMPTY_NODE(&upper->rb_node)) {
428 BUG_ON(!list_empty(&node->upper));
429 drop_backref_node(cache, node);
430 node = upper;
431 node->lowest = 1;
432 continue;
433 }
328 /* 434 /*
329 * add the node to pending list if no other 435 * add the node to leaf node list if no other
330 * child block cached. 436 * child block cached.
331 */ 437 */
332 if (list_empty(&upper->lower)) { 438 if (list_empty(&upper->lower)) {
333 list_add_tail(&upper->lower, 439 list_add_tail(&upper->lower, &cache->leaves);
334 &cache->pending[upper->level]);
335 upper->lowest = 1; 440 upper->lowest = 1;
336 } 441 }
337 } 442 }
443
338 drop_backref_node(cache, node); 444 drop_backref_node(cache, node);
339} 445}
340 446
447static void update_backref_node(struct backref_cache *cache,
448 struct backref_node *node, u64 bytenr)
449{
450 struct rb_node *rb_node;
451 rb_erase(&node->rb_node, &cache->rb_root);
452 node->bytenr = bytenr;
453 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
454 BUG_ON(rb_node);
455}
456
457/*
458 * update backref cache after a transaction commit
459 */
460static int update_backref_cache(struct btrfs_trans_handle *trans,
461 struct backref_cache *cache)
462{
463 struct backref_node *node;
464 int level = 0;
465
466 if (cache->last_trans == 0) {
467 cache->last_trans = trans->transid;
468 return 0;
469 }
470
471 if (cache->last_trans == trans->transid)
472 return 0;
473
474 /*
475 * detached nodes are used to avoid unnecessary backref
476 * lookup. transaction commit changes the extent tree.
477 * so the detached nodes are no longer useful.
478 */
479 while (!list_empty(&cache->detached)) {
480 node = list_entry(cache->detached.next,
481 struct backref_node, list);
482 remove_backref_node(cache, node);
483 }
484
485 while (!list_empty(&cache->changed)) {
486 node = list_entry(cache->changed.next,
487 struct backref_node, list);
488 list_del_init(&node->list);
489 BUG_ON(node->pending);
490 update_backref_node(cache, node, node->new_bytenr);
491 }
492
493 /*
494 * some nodes can be left in the pending list if there were
495 * errors during processing the pending nodes.
496 */
497 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
498 list_for_each_entry(node, &cache->pending[level], list) {
499 BUG_ON(!node->pending);
500 if (node->bytenr == node->new_bytenr)
501 continue;
502 update_backref_node(cache, node, node->new_bytenr);
503 }
504 }
505
506 cache->last_trans = 0;
507 return 1;
508}
509
510static int should_ignore_root(struct btrfs_root *root)
511{
512 struct btrfs_root *reloc_root;
513
514 if (!root->ref_cows)
515 return 0;
516
517 reloc_root = root->reloc_root;
518 if (!reloc_root)
519 return 0;
520
521 if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
522 root->fs_info->running_transaction->transid - 1)
523 return 0;
524 /*
525 * if there is reloc tree and it was created in previous
526 * transaction backref lookup can find the reloc tree,
527 * so backref node for the fs tree root is useless for
528 * relocation.
529 */
530 return 1;
531}
532
341/* 533/*
342 * find reloc tree by address of tree root 534 * find reloc tree by address of tree root
343 */ 535 */
@@ -452,11 +644,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
452 * for all upper level blocks that directly/indirectly reference the 644 * for all upper level blocks that directly/indirectly reference the
453 * block are also cached. 645 * block are also cached.
454 */ 646 */
455static struct backref_node *build_backref_tree(struct reloc_control *rc, 647static noinline_for_stack
456 struct backref_cache *cache, 648struct backref_node *build_backref_tree(struct reloc_control *rc,
457 struct btrfs_key *node_key, 649 struct btrfs_key *node_key,
458 int level, u64 bytenr) 650 int level, u64 bytenr)
459{ 651{
652 struct backref_cache *cache = &rc->backref_cache;
460 struct btrfs_path *path1; 653 struct btrfs_path *path1;
461 struct btrfs_path *path2; 654 struct btrfs_path *path2;
462 struct extent_buffer *eb; 655 struct extent_buffer *eb;
@@ -472,6 +665,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
472 unsigned long end; 665 unsigned long end;
473 unsigned long ptr; 666 unsigned long ptr;
474 LIST_HEAD(list); 667 LIST_HEAD(list);
668 LIST_HEAD(useless);
669 int cowonly;
475 int ret; 670 int ret;
476 int err = 0; 671 int err = 0;
477 672
@@ -482,15 +677,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
482 goto out; 677 goto out;
483 } 678 }
484 679
485 node = kmalloc(sizeof(*node), GFP_NOFS); 680 node = alloc_backref_node(cache);
486 if (!node) { 681 if (!node) {
487 err = -ENOMEM; 682 err = -ENOMEM;
488 goto out; 683 goto out;
489 } 684 }
490 685
491 backref_node_init(node);
492 node->bytenr = bytenr; 686 node->bytenr = bytenr;
493 node->owner = 0;
494 node->level = level; 687 node->level = level;
495 node->lowest = 1; 688 node->lowest = 1;
496 cur = node; 689 cur = node;
@@ -586,17 +779,21 @@ again:
586#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 779#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
587 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || 780 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
588 key.type == BTRFS_EXTENT_REF_V0_KEY) { 781 key.type == BTRFS_EXTENT_REF_V0_KEY) {
589 if (key.objectid == key.offset && 782 if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 struct btrfs_extent_ref_v0 *ref0; 783 struct btrfs_extent_ref_v0 *ref0;
592 ref0 = btrfs_item_ptr(eb, path1->slots[0], 784 ref0 = btrfs_item_ptr(eb, path1->slots[0],
593 struct btrfs_extent_ref_v0); 785 struct btrfs_extent_ref_v0);
594 root = find_tree_root(rc, eb, ref0); 786 if (key.objectid == key.offset) {
595 if (root) 787 root = find_tree_root(rc, eb, ref0);
596 cur->root = root; 788 if (root && !should_ignore_root(root))
597 else 789 cur->root = root;
598 cur->old_root = 1; 790 else
599 break; 791 list_add(&cur->list, &useless);
792 break;
793 }
794 if (is_cowonly_root(btrfs_ref_root_v0(eb,
795 ref0)))
796 cur->cowonly = 1;
600 } 797 }
601#else 798#else
602 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 799 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -613,22 +810,20 @@ again:
613 break; 810 break;
614 } 811 }
615 812
616 edge = kzalloc(sizeof(*edge), GFP_NOFS); 813 edge = alloc_backref_edge(cache);
617 if (!edge) { 814 if (!edge) {
618 err = -ENOMEM; 815 err = -ENOMEM;
619 goto out; 816 goto out;
620 } 817 }
621 rb_node = tree_search(&cache->rb_root, key.offset); 818 rb_node = tree_search(&cache->rb_root, key.offset);
622 if (!rb_node) { 819 if (!rb_node) {
623 upper = kmalloc(sizeof(*upper), GFP_NOFS); 820 upper = alloc_backref_node(cache);
624 if (!upper) { 821 if (!upper) {
625 kfree(edge); 822 free_backref_edge(cache, edge);
626 err = -ENOMEM; 823 err = -ENOMEM;
627 goto out; 824 goto out;
628 } 825 }
629 backref_node_init(upper);
630 upper->bytenr = key.offset; 826 upper->bytenr = key.offset;
631 upper->owner = 0;
632 upper->level = cur->level + 1; 827 upper->level = cur->level + 1;
633 /* 828 /*
634 * backrefs for the upper level block isn't 829 * backrefs for the upper level block isn't
@@ -638,11 +833,12 @@ again:
638 } else { 833 } else {
639 upper = rb_entry(rb_node, struct backref_node, 834 upper = rb_entry(rb_node, struct backref_node,
640 rb_node); 835 rb_node);
836 BUG_ON(!upper->checked);
641 INIT_LIST_HEAD(&edge->list[UPPER]); 837 INIT_LIST_HEAD(&edge->list[UPPER]);
642 } 838 }
643 list_add(&edge->list[LOWER], &cur->upper); 839 list_add_tail(&edge->list[LOWER], &cur->upper);
644 edge->node[UPPER] = upper;
645 edge->node[LOWER] = cur; 840 edge->node[LOWER] = cur;
841 edge->node[UPPER] = upper;
646 842
647 goto next; 843 goto next;
648 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 844 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -656,11 +852,17 @@ again:
656 goto out; 852 goto out;
657 } 853 }
658 854
855 if (!root->ref_cows)
856 cur->cowonly = 1;
857
659 if (btrfs_root_level(&root->root_item) == cur->level) { 858 if (btrfs_root_level(&root->root_item) == cur->level) {
660 /* tree root */ 859 /* tree root */
661 BUG_ON(btrfs_root_bytenr(&root->root_item) != 860 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
662 cur->bytenr); 861 cur->bytenr);
663 cur->root = root; 862 if (should_ignore_root(root))
863 list_add(&cur->list, &useless);
864 else
865 cur->root = root;
664 break; 866 break;
665 } 867 }
666 868
@@ -691,11 +893,14 @@ again:
691 if (!path2->nodes[level]) { 893 if (!path2->nodes[level]) {
692 BUG_ON(btrfs_root_bytenr(&root->root_item) != 894 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
693 lower->bytenr); 895 lower->bytenr);
694 lower->root = root; 896 if (should_ignore_root(root))
897 list_add(&lower->list, &useless);
898 else
899 lower->root = root;
695 break; 900 break;
696 } 901 }
697 902
698 edge = kzalloc(sizeof(*edge), GFP_NOFS); 903 edge = alloc_backref_edge(cache);
699 if (!edge) { 904 if (!edge) {
700 err = -ENOMEM; 905 err = -ENOMEM;
701 goto out; 906 goto out;
@@ -704,16 +909,17 @@ again:
704 eb = path2->nodes[level]; 909 eb = path2->nodes[level];
705 rb_node = tree_search(&cache->rb_root, eb->start); 910 rb_node = tree_search(&cache->rb_root, eb->start);
706 if (!rb_node) { 911 if (!rb_node) {
707 upper = kmalloc(sizeof(*upper), GFP_NOFS); 912 upper = alloc_backref_node(cache);
708 if (!upper) { 913 if (!upper) {
709 kfree(edge); 914 free_backref_edge(cache, edge);
710 err = -ENOMEM; 915 err = -ENOMEM;
711 goto out; 916 goto out;
712 } 917 }
713 backref_node_init(upper);
714 upper->bytenr = eb->start; 918 upper->bytenr = eb->start;
715 upper->owner = btrfs_header_owner(eb); 919 upper->owner = btrfs_header_owner(eb);
716 upper->level = lower->level + 1; 920 upper->level = lower->level + 1;
921 if (!root->ref_cows)
922 upper->cowonly = 1;
717 923
718 /* 924 /*
719 * if we know the block isn't shared 925 * if we know the block isn't shared
@@ -743,10 +949,12 @@ again:
743 rb_node); 949 rb_node);
744 BUG_ON(!upper->checked); 950 BUG_ON(!upper->checked);
745 INIT_LIST_HEAD(&edge->list[UPPER]); 951 INIT_LIST_HEAD(&edge->list[UPPER]);
952 if (!upper->owner)
953 upper->owner = btrfs_header_owner(eb);
746 } 954 }
747 list_add_tail(&edge->list[LOWER], &lower->upper); 955 list_add_tail(&edge->list[LOWER], &lower->upper);
748 edge->node[UPPER] = upper;
749 edge->node[LOWER] = lower; 956 edge->node[LOWER] = lower;
957 edge->node[UPPER] = upper;
750 958
751 if (rb_node) 959 if (rb_node)
752 break; 960 break;
@@ -784,8 +992,13 @@ next:
784 * into the cache. 992 * into the cache.
785 */ 993 */
786 BUG_ON(!node->checked); 994 BUG_ON(!node->checked);
787 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 995 cowonly = node->cowonly;
788 BUG_ON(rb_node); 996 if (!cowonly) {
997 rb_node = tree_insert(&cache->rb_root, node->bytenr,
998 &node->rb_node);
999 BUG_ON(rb_node);
1000 list_add_tail(&node->lower, &cache->leaves);
1001 }
789 1002
790 list_for_each_entry(edge, &node->upper, list[LOWER]) 1003 list_for_each_entry(edge, &node->upper, list[LOWER])
791 list_add_tail(&edge->list[UPPER], &list); 1004 list_add_tail(&edge->list[UPPER], &list);
@@ -794,6 +1007,14 @@ next:
794 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1007 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
795 list_del_init(&edge->list[UPPER]); 1008 list_del_init(&edge->list[UPPER]);
796 upper = edge->node[UPPER]; 1009 upper = edge->node[UPPER];
1010 if (upper->detached) {
1011 list_del(&edge->list[LOWER]);
1012 lower = edge->node[LOWER];
1013 free_backref_edge(cache, edge);
1014 if (list_empty(&lower->upper))
1015 list_add(&lower->list, &useless);
1016 continue;
1017 }
797 1018
798 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1019 if (!RB_EMPTY_NODE(&upper->rb_node)) {
799 if (upper->lowest) { 1020 if (upper->lowest) {
@@ -806,25 +1027,69 @@ next:
806 } 1027 }
807 1028
808 BUG_ON(!upper->checked); 1029 BUG_ON(!upper->checked);
809 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1030 BUG_ON(cowonly != upper->cowonly);
810 &upper->rb_node); 1031 if (!cowonly) {
811 BUG_ON(rb_node); 1032 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1033 &upper->rb_node);
1034 BUG_ON(rb_node);
1035 }
812 1036
813 list_add_tail(&edge->list[UPPER], &upper->lower); 1037 list_add_tail(&edge->list[UPPER], &upper->lower);
814 1038
815 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1039 list_for_each_entry(edge, &upper->upper, list[LOWER])
816 list_add_tail(&edge->list[UPPER], &list); 1040 list_add_tail(&edge->list[UPPER], &list);
817 } 1041 }
1042 /*
1043 * process useless backref nodes. backref nodes for tree leaves
1044 * are deleted from the cache. backref nodes for upper level
1045 * tree blocks are left in the cache to avoid unnecessary backref
1046 * lookup.
1047 */
1048 while (!list_empty(&useless)) {
1049 upper = list_entry(useless.next, struct backref_node, list);
1050 list_del_init(&upper->list);
1051 BUG_ON(!list_empty(&upper->upper));
1052 if (upper == node)
1053 node = NULL;
1054 if (upper->lowest) {
1055 list_del_init(&upper->lower);
1056 upper->lowest = 0;
1057 }
1058 while (!list_empty(&upper->lower)) {
1059 edge = list_entry(upper->lower.next,
1060 struct backref_edge, list[UPPER]);
1061 list_del(&edge->list[UPPER]);
1062 list_del(&edge->list[LOWER]);
1063 lower = edge->node[LOWER];
1064 free_backref_edge(cache, edge);
1065
1066 if (list_empty(&lower->upper))
1067 list_add(&lower->list, &useless);
1068 }
1069 __mark_block_processed(rc, upper);
1070 if (upper->level > 0) {
1071 list_add(&upper->list, &cache->detached);
1072 upper->detached = 1;
1073 } else {
1074 rb_erase(&upper->rb_node, &cache->rb_root);
1075 free_backref_node(cache, upper);
1076 }
1077 }
818out: 1078out:
819 btrfs_free_path(path1); 1079 btrfs_free_path(path1);
820 btrfs_free_path(path2); 1080 btrfs_free_path(path2);
821 if (err) { 1081 if (err) {
822 INIT_LIST_HEAD(&list); 1082 while (!list_empty(&useless)) {
1083 lower = list_entry(useless.next,
1084 struct backref_node, upper);
1085 list_del_init(&lower->upper);
1086 }
823 upper = node; 1087 upper = node;
1088 INIT_LIST_HEAD(&list);
824 while (upper) { 1089 while (upper) {
825 if (RB_EMPTY_NODE(&upper->rb_node)) { 1090 if (RB_EMPTY_NODE(&upper->rb_node)) {
826 list_splice_tail(&upper->upper, &list); 1091 list_splice_tail(&upper->upper, &list);
827 kfree(upper); 1092 free_backref_node(cache, upper);
828 } 1093 }
829 1094
830 if (list_empty(&list)) 1095 if (list_empty(&list))
@@ -832,15 +1097,104 @@ out:
832 1097
833 edge = list_entry(list.next, struct backref_edge, 1098 edge = list_entry(list.next, struct backref_edge,
834 list[LOWER]); 1099 list[LOWER]);
1100 list_del(&edge->list[LOWER]);
835 upper = edge->node[UPPER]; 1101 upper = edge->node[UPPER];
836 kfree(edge); 1102 free_backref_edge(cache, edge);
837 } 1103 }
838 return ERR_PTR(err); 1104 return ERR_PTR(err);
839 } 1105 }
1106 BUG_ON(node && node->detached);
840 return node; 1107 return node;
841} 1108}
842 1109
843/* 1110/*
1111 * helper to add backref node for the newly created snapshot.
1112 * the backref node is created by cloning backref node that
1113 * corresponds to root of source tree
1114 */
1115static int clone_backref_node(struct btrfs_trans_handle *trans,
1116 struct reloc_control *rc,
1117 struct btrfs_root *src,
1118 struct btrfs_root *dest)
1119{
1120 struct btrfs_root *reloc_root = src->reloc_root;
1121 struct backref_cache *cache = &rc->backref_cache;
1122 struct backref_node *node = NULL;
1123 struct backref_node *new_node;
1124 struct backref_edge *edge;
1125 struct backref_edge *new_edge;
1126 struct rb_node *rb_node;
1127
1128 if (cache->last_trans > 0)
1129 update_backref_cache(trans, cache);
1130
1131 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1132 if (rb_node) {
1133 node = rb_entry(rb_node, struct backref_node, rb_node);
1134 if (node->detached)
1135 node = NULL;
1136 else
1137 BUG_ON(node->new_bytenr != reloc_root->node->start);
1138 }
1139
1140 if (!node) {
1141 rb_node = tree_search(&cache->rb_root,
1142 reloc_root->commit_root->start);
1143 if (rb_node) {
1144 node = rb_entry(rb_node, struct backref_node,
1145 rb_node);
1146 BUG_ON(node->detached);
1147 }
1148 }
1149
1150 if (!node)
1151 return 0;
1152
1153 new_node = alloc_backref_node(cache);
1154 if (!new_node)
1155 return -ENOMEM;
1156
1157 new_node->bytenr = dest->node->start;
1158 new_node->level = node->level;
1159 new_node->lowest = node->lowest;
1160 new_node->root = dest;
1161
1162 if (!node->lowest) {
1163 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1164 new_edge = alloc_backref_edge(cache);
1165 if (!new_edge)
1166 goto fail;
1167
1168 new_edge->node[UPPER] = new_node;
1169 new_edge->node[LOWER] = edge->node[LOWER];
1170 list_add_tail(&new_edge->list[UPPER],
1171 &new_node->lower);
1172 }
1173 }
1174
1175 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1176 &new_node->rb_node);
1177 BUG_ON(rb_node);
1178
1179 if (!new_node->lowest) {
1180 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1181 list_add_tail(&new_edge->list[LOWER],
1182 &new_edge->node[LOWER]->upper);
1183 }
1184 }
1185 return 0;
1186fail:
1187 while (!list_empty(&new_node->lower)) {
1188 new_edge = list_entry(new_node->lower.next,
1189 struct backref_edge, list[UPPER]);
1190 list_del(&new_edge->list[UPPER]);
1191 free_backref_edge(cache, new_edge);
1192 }
1193 free_backref_node(cache, new_node);
1194 return -ENOMEM;
1195}
1196
1197/*
844 * helper to add 'address of tree root -> reloc tree' mapping 1198 * helper to add 'address of tree root -> reloc tree' mapping
845 */ 1199 */
846static int __add_reloc_root(struct btrfs_root *root) 1200static int __add_reloc_root(struct btrfs_root *root)
@@ -900,12 +1254,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
900 return 0; 1254 return 0;
901} 1255}
902 1256
903/* 1257static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
904 * create reloc tree for a given fs tree. reloc tree is just a 1258 struct btrfs_root *root, u64 objectid)
905 * snapshot of the fs tree with special root objectid.
906 */
907int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
908 struct btrfs_root *root)
909{ 1259{
910 struct btrfs_root *reloc_root; 1260 struct btrfs_root *reloc_root;
911 struct extent_buffer *eb; 1261 struct extent_buffer *eb;
@@ -913,36 +1263,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
913 struct btrfs_key root_key; 1263 struct btrfs_key root_key;
914 int ret; 1264 int ret;
915 1265
916 if (root->reloc_root) {
917 reloc_root = root->reloc_root;
918 reloc_root->last_trans = trans->transid;
919 return 0;
920 }
921
922 if (!root->fs_info->reloc_ctl ||
923 !root->fs_info->reloc_ctl->create_reloc_root ||
924 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
925 return 0;
926
927 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1266 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
928 BUG_ON(!root_item); 1267 BUG_ON(!root_item);
929 1268
930 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1269 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
931 root_key.type = BTRFS_ROOT_ITEM_KEY; 1270 root_key.type = BTRFS_ROOT_ITEM_KEY;
932 root_key.offset = root->root_key.objectid; 1271 root_key.offset = objectid;
933 1272
934 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1273 if (root->root_key.objectid == objectid) {
935 BTRFS_TREE_RELOC_OBJECTID); 1274 /* called by btrfs_init_reloc_root */
936 BUG_ON(ret); 1275 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1276 BTRFS_TREE_RELOC_OBJECTID);
1277 BUG_ON(ret);
1278
1279 btrfs_set_root_last_snapshot(&root->root_item,
1280 trans->transid - 1);
1281 } else {
1282 /*
1283 * called by btrfs_reloc_post_snapshot_hook.
1284 * the source tree is a reloc tree, all tree blocks
1285 * modified after it was created have RELOC flag
1286 * set in their headers. so it's OK to not update
1287 * the 'last_snapshot'.
1288 */
1289 ret = btrfs_copy_root(trans, root, root->node, &eb,
1290 BTRFS_TREE_RELOC_OBJECTID);
1291 BUG_ON(ret);
1292 }
937 1293
938 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
939 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1294 memcpy(root_item, &root->root_item, sizeof(*root_item));
940 btrfs_set_root_refs(root_item, 1);
941 btrfs_set_root_bytenr(root_item, eb->start); 1295 btrfs_set_root_bytenr(root_item, eb->start);
942 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1296 btrfs_set_root_level(root_item, btrfs_header_level(eb));
943 btrfs_set_root_generation(root_item, trans->transid); 1297 btrfs_set_root_generation(root_item, trans->transid);
944 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); 1298
945 root_item->drop_level = 0; 1299 if (root->root_key.objectid == objectid) {
1300 btrfs_set_root_refs(root_item, 0);
1301 memset(&root_item->drop_progress, 0,
1302 sizeof(struct btrfs_disk_key));
1303 root_item->drop_level = 0;
1304 }
946 1305
947 btrfs_tree_unlock(eb); 1306 btrfs_tree_unlock(eb);
948 free_extent_buffer(eb); 1307 free_extent_buffer(eb);
@@ -956,6 +1315,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
956 &root_key); 1315 &root_key);
957 BUG_ON(IS_ERR(reloc_root)); 1316 BUG_ON(IS_ERR(reloc_root));
958 reloc_root->last_trans = trans->transid; 1317 reloc_root->last_trans = trans->transid;
1318 return reloc_root;
1319}
1320
1321/*
1322 * create reloc tree for a given fs tree. reloc tree is just a
1323 * snapshot of the fs tree with special root objectid.
1324 */
1325int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1326 struct btrfs_root *root)
1327{
1328 struct btrfs_root *reloc_root;
1329 struct reloc_control *rc = root->fs_info->reloc_ctl;
1330 int clear_rsv = 0;
1331
1332 if (root->reloc_root) {
1333 reloc_root = root->reloc_root;
1334 reloc_root->last_trans = trans->transid;
1335 return 0;
1336 }
1337
1338 if (!rc || !rc->create_reloc_tree ||
1339 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1340 return 0;
1341
1342 if (!trans->block_rsv) {
1343 trans->block_rsv = rc->block_rsv;
1344 clear_rsv = 1;
1345 }
1346 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1347 if (clear_rsv)
1348 trans->block_rsv = NULL;
959 1349
960 __add_reloc_root(reloc_root); 1350 __add_reloc_root(reloc_root);
961 root->reloc_root = reloc_root; 1351 root->reloc_root = reloc_root;
@@ -979,7 +1369,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
979 reloc_root = root->reloc_root; 1369 reloc_root = root->reloc_root;
980 root_item = &reloc_root->root_item; 1370 root_item = &reloc_root->root_item;
981 1371
982 if (btrfs_root_refs(root_item) == 0) { 1372 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1373 btrfs_root_refs(root_item) == 0) {
983 root->reloc_root = NULL; 1374 root->reloc_root = NULL;
984 del = 1; 1375 del = 1;
985 } 1376 }
@@ -1101,8 +1492,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1101 goto out; 1492 goto out;
1102 } 1493 }
1103 1494
1104 if (new_bytenr) 1495 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1105 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 ret = 0; 1496 ret = 0;
1107out: 1497out:
1108 btrfs_free_path(path); 1498 btrfs_free_path(path);
@@ -1113,19 +1503,18 @@ out:
1113 * update file extent items in the tree leaf to point to 1503 * update file extent items in the tree leaf to point to
1114 * the new locations. 1504 * the new locations.
1115 */ 1505 */
1116static int replace_file_extents(struct btrfs_trans_handle *trans, 1506static noinline_for_stack
1117 struct reloc_control *rc, 1507int replace_file_extents(struct btrfs_trans_handle *trans,
1118 struct btrfs_root *root, 1508 struct reloc_control *rc,
1119 struct extent_buffer *leaf, 1509 struct btrfs_root *root,
1120 struct list_head *inode_list) 1510 struct extent_buffer *leaf)
1121{ 1511{
1122 struct btrfs_key key; 1512 struct btrfs_key key;
1123 struct btrfs_file_extent_item *fi; 1513 struct btrfs_file_extent_item *fi;
1124 struct inode *inode = NULL; 1514 struct inode *inode = NULL;
1125 struct inodevec *ivec = NULL;
1126 u64 parent; 1515 u64 parent;
1127 u64 bytenr; 1516 u64 bytenr;
1128 u64 new_bytenr; 1517 u64 new_bytenr = 0;
1129 u64 num_bytes; 1518 u64 num_bytes;
1130 u64 end; 1519 u64 end;
1131 u32 nritems; 1520 u32 nritems;
@@ -1165,21 +1554,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1165 * to complete and drop the extent cache 1554 * to complete and drop the extent cache
1166 */ 1555 */
1167 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1556 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1168 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1169 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1170 BUG_ON(!ivec);
1171 ivec->nr = 0;
1172 list_add_tail(&ivec->list, inode_list);
1173 }
1174 if (first) { 1557 if (first) {
1175 inode = find_next_inode(root, key.objectid); 1558 inode = find_next_inode(root, key.objectid);
1176 if (inode)
1177 ivec->inode[ivec->nr++] = inode;
1178 first = 0; 1559 first = 0;
1179 } else if (inode && inode->i_ino < key.objectid) { 1560 } else if (inode && inode->i_ino < key.objectid) {
1561 btrfs_add_delayed_iput(inode);
1180 inode = find_next_inode(root, key.objectid); 1562 inode = find_next_inode(root, key.objectid);
1181 if (inode)
1182 ivec->inode[ivec->nr++] = inode;
1183 } 1563 }
1184 if (inode && inode->i_ino == key.objectid) { 1564 if (inode && inode->i_ino == key.objectid) {
1185 end = key.offset + 1565 end = key.offset +
@@ -1203,8 +1583,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1203 1583
1204 ret = get_new_location(rc->data_inode, &new_bytenr, 1584 ret = get_new_location(rc->data_inode, &new_bytenr,
1205 bytenr, num_bytes); 1585 bytenr, num_bytes);
1206 if (ret > 0) 1586 if (ret > 0) {
1587 WARN_ON(1);
1207 continue; 1588 continue;
1589 }
1208 BUG_ON(ret < 0); 1590 BUG_ON(ret < 0);
1209 1591
1210 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1592 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1224,6 +1606,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1224 } 1606 }
1225 if (dirty) 1607 if (dirty)
1226 btrfs_mark_buffer_dirty(leaf); 1608 btrfs_mark_buffer_dirty(leaf);
1609 if (inode)
1610 btrfs_add_delayed_iput(inode);
1227 return 0; 1611 return 0;
1228} 1612}
1229 1613
@@ -1247,11 +1631,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
1247 * if no block got replaced, 0 is returned. if there are other 1631 * if no block got replaced, 0 is returned. if there are other
1248 * errors, a negative error number is returned. 1632 * errors, a negative error number is returned.
1249 */ 1633 */
1250static int replace_path(struct btrfs_trans_handle *trans, 1634static noinline_for_stack
1251 struct btrfs_root *dest, struct btrfs_root *src, 1635int replace_path(struct btrfs_trans_handle *trans,
1252 struct btrfs_path *path, struct btrfs_key *next_key, 1636 struct btrfs_root *dest, struct btrfs_root *src,
1253 struct extent_buffer **leaf, 1637 struct btrfs_path *path, struct btrfs_key *next_key,
1254 int lowest_level, int max_level) 1638 int lowest_level, int max_level)
1255{ 1639{
1256 struct extent_buffer *eb; 1640 struct extent_buffer *eb;
1257 struct extent_buffer *parent; 1641 struct extent_buffer *parent;
@@ -1262,16 +1646,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
1262 u64 new_ptr_gen; 1646 u64 new_ptr_gen;
1263 u64 last_snapshot; 1647 u64 last_snapshot;
1264 u32 blocksize; 1648 u32 blocksize;
1649 int cow = 0;
1265 int level; 1650 int level;
1266 int ret; 1651 int ret;
1267 int slot; 1652 int slot;
1268 1653
1269 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1654 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1270 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1655 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1271 BUG_ON(lowest_level > 1 && leaf);
1272 1656
1273 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1657 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1274 1658again:
1275 slot = path->slots[lowest_level]; 1659 slot = path->slots[lowest_level];
1276 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1660 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1277 1661
@@ -1285,8 +1669,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
1285 return 0; 1669 return 0;
1286 } 1670 }
1287 1671
1288 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1672 if (cow) {
1289 BUG_ON(ret); 1673 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1674 BUG_ON(ret);
1675 }
1290 btrfs_set_lock_blocking(eb); 1676 btrfs_set_lock_blocking(eb);
1291 1677
1292 if (next_key) { 1678 if (next_key) {
@@ -1330,7 +1716,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
1330 1716
1331 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1717 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1332 memcmp_node_keys(parent, slot, path, level)) { 1718 memcmp_node_keys(parent, slot, path, level)) {
1333 if (level <= lowest_level && !leaf) { 1719 if (level <= lowest_level) {
1334 ret = 0; 1720 ret = 0;
1335 break; 1721 break;
1336 } 1722 }
@@ -1338,16 +1724,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
1338 eb = read_tree_block(dest, old_bytenr, blocksize, 1724 eb = read_tree_block(dest, old_bytenr, blocksize,
1339 old_ptr_gen); 1725 old_ptr_gen);
1340 btrfs_tree_lock(eb); 1726 btrfs_tree_lock(eb);
1341 ret = btrfs_cow_block(trans, dest, eb, parent, 1727 if (cow) {
1342 slot, &eb); 1728 ret = btrfs_cow_block(trans, dest, eb, parent,
1343 BUG_ON(ret); 1729 slot, &eb);
1344 btrfs_set_lock_blocking(eb); 1730 BUG_ON(ret);
1345
1346 if (level <= lowest_level) {
1347 *leaf = eb;
1348 ret = 0;
1349 break;
1350 } 1731 }
1732 btrfs_set_lock_blocking(eb);
1351 1733
1352 btrfs_tree_unlock(parent); 1734 btrfs_tree_unlock(parent);
1353 free_extent_buffer(parent); 1735 free_extent_buffer(parent);
@@ -1356,6 +1738,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
1356 continue; 1738 continue;
1357 } 1739 }
1358 1740
1741 if (!cow) {
1742 btrfs_tree_unlock(parent);
1743 free_extent_buffer(parent);
1744 cow = 1;
1745 goto again;
1746 }
1747
1359 btrfs_node_key_to_cpu(path->nodes[level], &key, 1748 btrfs_node_key_to_cpu(path->nodes[level], &key,
1360 path->slots[level]); 1749 path->slots[level]);
1361 btrfs_release_path(src, path); 1750 btrfs_release_path(src, path);
@@ -1561,20 +1950,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1561 return 0; 1950 return 0;
1562} 1951}
1563 1952
1564static void put_inodes(struct list_head *list)
1565{
1566 struct inodevec *ivec;
1567 while (!list_empty(list)) {
1568 ivec = list_entry(list->next, struct inodevec, list);
1569 list_del(&ivec->list);
1570 while (ivec->nr > 0) {
1571 ivec->nr--;
1572 iput(ivec->inode[ivec->nr]);
1573 }
1574 kfree(ivec);
1575 }
1576}
1577
1578static int find_next_key(struct btrfs_path *path, int level, 1953static int find_next_key(struct btrfs_path *path, int level,
1579 struct btrfs_key *key) 1954 struct btrfs_key *key)
1580 1955
@@ -1607,13 +1982,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1607 struct btrfs_root *reloc_root; 1982 struct btrfs_root *reloc_root;
1608 struct btrfs_root_item *root_item; 1983 struct btrfs_root_item *root_item;
1609 struct btrfs_path *path; 1984 struct btrfs_path *path;
1610 struct extent_buffer *leaf = NULL; 1985 struct extent_buffer *leaf;
1611 unsigned long nr; 1986 unsigned long nr;
1612 int level; 1987 int level;
1613 int max_level; 1988 int max_level;
1614 int replaced = 0; 1989 int replaced = 0;
1615 int ret; 1990 int ret;
1616 int err = 0; 1991 int err = 0;
1992 u32 min_reserved;
1617 1993
1618 path = btrfs_alloc_path(); 1994 path = btrfs_alloc_path();
1619 if (!path) 1995 if (!path)
@@ -1647,34 +2023,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1647 btrfs_unlock_up_safe(path, 0); 2023 btrfs_unlock_up_safe(path, 0);
1648 } 2024 }
1649 2025
1650 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { 2026 min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1651 trans = btrfs_start_transaction(root, 1); 2027 memset(&next_key, 0, sizeof(next_key));
1652 2028
1653 leaf = path->nodes[0]; 2029 while (1) {
1654 btrfs_item_key_to_cpu(leaf, &key, 0); 2030 trans = btrfs_start_transaction(root, 0);
1655 btrfs_release_path(reloc_root, path); 2031 trans->block_rsv = rc->block_rsv;
1656 2032
1657 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2033 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
1658 if (ret < 0) { 2034 min_reserved, 0);
1659 err = ret; 2035 if (ret) {
1660 goto out; 2036 BUG_ON(ret != -EAGAIN);
2037 ret = btrfs_commit_transaction(trans, root);
2038 BUG_ON(ret);
2039 continue;
1661 } 2040 }
1662 2041
1663 leaf = path->nodes[0];
1664 btrfs_unlock_up_safe(path, 1);
1665 ret = replace_file_extents(trans, rc, root, leaf,
1666 &inode_list);
1667 if (ret < 0)
1668 err = ret;
1669 goto out;
1670 }
1671
1672 memset(&next_key, 0, sizeof(next_key));
1673
1674 while (1) {
1675 leaf = NULL;
1676 replaced = 0; 2042 replaced = 0;
1677 trans = btrfs_start_transaction(root, 1);
1678 max_level = level; 2043 max_level = level;
1679 2044
1680 ret = walk_down_reloc_tree(reloc_root, path, &level); 2045 ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1688,14 +2053,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1688 if (!find_next_key(path, level, &key) && 2053 if (!find_next_key(path, level, &key) &&
1689 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2054 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1690 ret = 0; 2055 ret = 0;
1691 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1692 ret = replace_path(trans, root, reloc_root,
1693 path, &next_key, &leaf,
1694 level, max_level);
1695 } else { 2056 } else {
1696 ret = replace_path(trans, root, reloc_root, 2057 ret = replace_path(trans, root, reloc_root, path,
1697 path, &next_key, NULL, 2058 &next_key, level, max_level);
1698 level, max_level);
1699 } 2059 }
1700 if (ret < 0) { 2060 if (ret < 0) {
1701 err = ret; 2061 err = ret;
@@ -1707,16 +2067,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1707 btrfs_node_key_to_cpu(path->nodes[level], &key, 2067 btrfs_node_key_to_cpu(path->nodes[level], &key,
1708 path->slots[level]); 2068 path->slots[level]);
1709 replaced = 1; 2069 replaced = 1;
1710 } else if (leaf) {
1711 /*
1712 * no block got replaced, try replacing file extents
1713 */
1714 btrfs_item_key_to_cpu(leaf, &key, 0);
1715 ret = replace_file_extents(trans, rc, root, leaf,
1716 &inode_list);
1717 btrfs_tree_unlock(leaf);
1718 free_extent_buffer(leaf);
1719 BUG_ON(ret < 0);
1720 } 2070 }
1721 2071
1722 ret = walk_up_reloc_tree(reloc_root, path, &level); 2072 ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1733,15 +2083,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1733 root_item->drop_level = level; 2083 root_item->drop_level = level;
1734 2084
1735 nr = trans->blocks_used; 2085 nr = trans->blocks_used;
1736 btrfs_end_transaction(trans, root); 2086 btrfs_end_transaction_throttle(trans, root);
1737 2087
1738 btrfs_btree_balance_dirty(root, nr); 2088 btrfs_btree_balance_dirty(root, nr);
1739 2089
1740 /*
1741 * put inodes outside transaction, otherwise we may deadlock.
1742 */
1743 put_inodes(&inode_list);
1744
1745 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2090 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1746 invalidate_extent_cache(root, &key, &next_key); 2091 invalidate_extent_cache(root, &key, &next_key);
1747 } 2092 }
@@ -1764,87 +2109,123 @@ out:
1764 sizeof(root_item->drop_progress)); 2109 sizeof(root_item->drop_progress));
1765 root_item->drop_level = 0; 2110 root_item->drop_level = 0;
1766 btrfs_set_root_refs(root_item, 0); 2111 btrfs_set_root_refs(root_item, 0);
2112 btrfs_update_reloc_root(trans, root);
1767 } 2113 }
1768 2114
1769 nr = trans->blocks_used; 2115 nr = trans->blocks_used;
1770 btrfs_end_transaction(trans, root); 2116 btrfs_end_transaction_throttle(trans, root);
1771 2117
1772 btrfs_btree_balance_dirty(root, nr); 2118 btrfs_btree_balance_dirty(root, nr);
1773 2119
1774 put_inodes(&inode_list);
1775
1776 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2120 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1777 invalidate_extent_cache(root, &key, &next_key); 2121 invalidate_extent_cache(root, &key, &next_key);
1778 2122
1779 return err; 2123 return err;
1780} 2124}
1781 2125
1782/* 2126static noinline_for_stack
1783 * callback for the work threads. 2127int prepare_to_merge(struct reloc_control *rc, int err)
1784 * this function merges reloc tree with corresponding fs tree,
1785 * and then drops the reloc tree.
1786 */
1787static void merge_func(struct btrfs_work *work)
1788{ 2128{
1789 struct btrfs_trans_handle *trans; 2129 struct btrfs_root *root = rc->extent_root;
1790 struct btrfs_root *root;
1791 struct btrfs_root *reloc_root; 2130 struct btrfs_root *reloc_root;
1792 struct async_merge *async; 2131 struct btrfs_trans_handle *trans;
2132 LIST_HEAD(reloc_roots);
2133 u64 num_bytes = 0;
2134 int ret;
2135
2136 mutex_lock(&root->fs_info->trans_mutex);
2137 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2138 rc->merging_rsv_size += rc->nodes_relocated * 2;
2139 mutex_unlock(&root->fs_info->trans_mutex);
2140again:
2141 if (!err) {
2142 num_bytes = rc->merging_rsv_size;
2143 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2144 num_bytes);
2145 if (ret)
2146 err = ret;
2147 }
1793 2148
1794 async = container_of(work, struct async_merge, work); 2149 trans = btrfs_join_transaction(rc->extent_root, 1);
1795 reloc_root = async->root; 2150
2151 if (!err) {
2152 if (num_bytes != rc->merging_rsv_size) {
2153 btrfs_end_transaction(trans, rc->extent_root);
2154 btrfs_block_rsv_release(rc->extent_root,
2155 rc->block_rsv, num_bytes);
2156 goto again;
2157 }
2158 }
2159
2160 rc->merge_reloc_tree = 1;
2161
2162 while (!list_empty(&rc->reloc_roots)) {
2163 reloc_root = list_entry(rc->reloc_roots.next,
2164 struct btrfs_root, root_list);
2165 list_del_init(&reloc_root->root_list);
1796 2166
1797 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1798 root = read_fs_root(reloc_root->fs_info, 2167 root = read_fs_root(reloc_root->fs_info,
1799 reloc_root->root_key.offset); 2168 reloc_root->root_key.offset);
1800 BUG_ON(IS_ERR(root)); 2169 BUG_ON(IS_ERR(root));
1801 BUG_ON(root->reloc_root != reloc_root); 2170 BUG_ON(root->reloc_root != reloc_root);
1802 2171
1803 merge_reloc_root(async->rc, root); 2172 /*
1804 2173 * set reference count to 1, so btrfs_recover_relocation
1805 trans = btrfs_start_transaction(root, 1); 2174 * knows it should resumes merging
2175 */
2176 if (!err)
2177 btrfs_set_root_refs(&reloc_root->root_item, 1);
1806 btrfs_update_reloc_root(trans, root); 2178 btrfs_update_reloc_root(trans, root);
1807 btrfs_end_transaction(trans, root);
1808 }
1809 2179
1810 btrfs_drop_snapshot(reloc_root, 0); 2180 list_add(&reloc_root->root_list, &reloc_roots);
2181 }
1811 2182
1812 if (atomic_dec_and_test(async->num_pending)) 2183 list_splice(&reloc_roots, &rc->reloc_roots);
1813 complete(async->done);
1814 2184
1815 kfree(async); 2185 if (!err)
2186 btrfs_commit_transaction(trans, rc->extent_root);
2187 else
2188 btrfs_end_transaction(trans, rc->extent_root);
2189 return err;
1816} 2190}
1817 2191
1818static int merge_reloc_roots(struct reloc_control *rc) 2192static noinline_for_stack
2193int merge_reloc_roots(struct reloc_control *rc)
1819{ 2194{
1820 struct async_merge *async;
1821 struct btrfs_root *root; 2195 struct btrfs_root *root;
1822 struct completion done; 2196 struct btrfs_root *reloc_root;
1823 atomic_t num_pending; 2197 LIST_HEAD(reloc_roots);
2198 int found = 0;
2199 int ret;
2200again:
2201 root = rc->extent_root;
2202 mutex_lock(&root->fs_info->trans_mutex);
2203 list_splice_init(&rc->reloc_roots, &reloc_roots);
2204 mutex_unlock(&root->fs_info->trans_mutex);
1824 2205
1825 init_completion(&done); 2206 while (!list_empty(&reloc_roots)) {
1826 atomic_set(&num_pending, 1); 2207 found = 1;
2208 reloc_root = list_entry(reloc_roots.next,
2209 struct btrfs_root, root_list);
1827 2210
1828 while (!list_empty(&rc->reloc_roots)) { 2211 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1829 root = list_entry(rc->reloc_roots.next, 2212 root = read_fs_root(reloc_root->fs_info,
1830 struct btrfs_root, root_list); 2213 reloc_root->root_key.offset);
1831 list_del_init(&root->root_list); 2214 BUG_ON(IS_ERR(root));
2215 BUG_ON(root->reloc_root != reloc_root);
1832 2216
1833 async = kmalloc(sizeof(*async), GFP_NOFS); 2217 ret = merge_reloc_root(rc, root);
1834 BUG_ON(!async); 2218 BUG_ON(ret);
1835 async->work.func = merge_func; 2219 } else {
1836 async->work.flags = 0; 2220 list_del_init(&reloc_root->root_list);
1837 async->rc = rc; 2221 }
1838 async->root = root; 2222 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
1839 async->done = &done;
1840 async->num_pending = &num_pending;
1841 atomic_inc(&num_pending);
1842 btrfs_queue_worker(&rc->workers, &async->work);
1843 } 2223 }
1844 2224
1845 if (!atomic_dec_and_test(&num_pending)) 2225 if (found) {
1846 wait_for_completion(&done); 2226 found = 0;
1847 2227 goto again;
2228 }
1848 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2229 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1849 return 0; 2230 return 0;
1850} 2231}
@@ -1875,119 +2256,167 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1875 return btrfs_record_root_in_trans(trans, root); 2256 return btrfs_record_root_in_trans(trans, root);
1876} 2257}
1877 2258
1878/* 2259static noinline_for_stack
1879 * select one tree from trees that references the block. 2260struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1880 * for blocks in refernce counted trees, we preper reloc tree. 2261 struct reloc_control *rc,
1881 * if no reloc tree found and reloc_only is true, NULL is returned. 2262 struct backref_node *node,
1882 */ 2263 struct backref_edge *edges[], int *nr)
1883static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1884 struct backref_node *node,
1885 struct backref_edge *edges[],
1886 int *nr, int reloc_only)
1887{ 2264{
1888 struct backref_node *next; 2265 struct backref_node *next;
1889 struct btrfs_root *root; 2266 struct btrfs_root *root;
1890 int index; 2267 int index = 0;
1891 int loop = 0; 2268
1892again:
1893 index = 0;
1894 next = node; 2269 next = node;
1895 while (1) { 2270 while (1) {
1896 cond_resched(); 2271 cond_resched();
1897 next = walk_up_backref(next, edges, &index); 2272 next = walk_up_backref(next, edges, &index);
1898 root = next->root; 2273 root = next->root;
1899 if (!root) { 2274 BUG_ON(!root);
1900 BUG_ON(!node->old_root); 2275 BUG_ON(!root->ref_cows);
1901 goto skip;
1902 }
1903
1904 /* no other choice for non-refernce counted tree */
1905 if (!root->ref_cows) {
1906 BUG_ON(reloc_only);
1907 break;
1908 }
1909 2276
1910 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2277 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1911 record_reloc_root_in_trans(trans, root); 2278 record_reloc_root_in_trans(trans, root);
1912 break; 2279 break;
1913 } 2280 }
1914 2281
1915 if (loop) { 2282 btrfs_record_root_in_trans(trans, root);
1916 btrfs_record_root_in_trans(trans, root); 2283 root = root->reloc_root;
2284
2285 if (next->new_bytenr != root->node->start) {
2286 BUG_ON(next->new_bytenr);
2287 BUG_ON(!list_empty(&next->list));
2288 next->new_bytenr = root->node->start;
2289 next->root = root;
2290 list_add_tail(&next->list,
2291 &rc->backref_cache.changed);
2292 __mark_block_processed(rc, next);
1917 break; 2293 break;
1918 } 2294 }
1919 2295
1920 if (reloc_only || next != node) { 2296 WARN_ON(1);
1921 if (!root->reloc_root)
1922 btrfs_record_root_in_trans(trans, root);
1923 root = root->reloc_root;
1924 /*
1925 * if the reloc tree was created in current
1926 * transation, there is no node in backref tree
1927 * corresponds to the root of the reloc tree.
1928 */
1929 if (btrfs_root_last_snapshot(&root->root_item) ==
1930 trans->transid - 1)
1931 break;
1932 }
1933skip:
1934 root = NULL; 2297 root = NULL;
1935 next = walk_down_backref(edges, &index); 2298 next = walk_down_backref(edges, &index);
1936 if (!next || next->level <= node->level) 2299 if (!next || next->level <= node->level)
1937 break; 2300 break;
1938 } 2301 }
2302 if (!root)
2303 return NULL;
1939 2304
1940 if (!root && !loop && !reloc_only) { 2305 *nr = index;
1941 loop = 1; 2306 next = node;
1942 goto again; 2307 /* setup backref node path for btrfs_reloc_cow_block */
2308 while (1) {
2309 rc->backref_cache.path[next->level] = next;
2310 if (--index < 0)
2311 break;
2312 next = edges[index]->node[UPPER];
1943 } 2313 }
1944
1945 if (root)
1946 *nr = index;
1947 else
1948 *nr = 0;
1949
1950 return root; 2314 return root;
1951} 2315}
1952 2316
2317/*
2318 * select a tree root for relocation. return NULL if the block
2319 * is reference counted. we should use do_relocation() in this
2320 * case. return a tree root pointer if the block isn't reference
2321 * counted. return -ENOENT if the block is root of reloc tree.
2322 */
1953static noinline_for_stack 2323static noinline_for_stack
1954struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, 2324struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1955 struct backref_node *node) 2325 struct backref_node *node)
1956{ 2326{
2327 struct backref_node *next;
2328 struct btrfs_root *root;
2329 struct btrfs_root *fs_root = NULL;
1957 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2330 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1958 int nr; 2331 int index = 0;
1959 return __select_one_root(trans, node, edges, &nr, 0); 2332
2333 next = node;
2334 while (1) {
2335 cond_resched();
2336 next = walk_up_backref(next, edges, &index);
2337 root = next->root;
2338 BUG_ON(!root);
2339
2340 /* no other choice for non-refernce counted tree */
2341 if (!root->ref_cows)
2342 return root;
2343
2344 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2345 fs_root = root;
2346
2347 if (next != node)
2348 return NULL;
2349
2350 next = walk_down_backref(edges, &index);
2351 if (!next || next->level <= node->level)
2352 break;
2353 }
2354
2355 if (!fs_root)
2356 return ERR_PTR(-ENOENT);
2357 return fs_root;
1960} 2358}
1961 2359
1962static noinline_for_stack 2360static noinline_for_stack
1963struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2361u64 calcu_metadata_size(struct reloc_control *rc,
1964 struct backref_node *node, 2362 struct backref_node *node, int reserve)
1965 struct backref_edge *edges[], int *nr)
1966{ 2363{
1967 return __select_one_root(trans, node, edges, nr, 1); 2364 struct backref_node *next = node;
2365 struct backref_edge *edge;
2366 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2367 u64 num_bytes = 0;
2368 int index = 0;
2369
2370 BUG_ON(reserve && node->processed);
2371
2372 while (next) {
2373 cond_resched();
2374 while (1) {
2375 if (next->processed && (reserve || next != node))
2376 break;
2377
2378 num_bytes += btrfs_level_size(rc->extent_root,
2379 next->level);
2380
2381 if (list_empty(&next->upper))
2382 break;
2383
2384 edge = list_entry(next->upper.next,
2385 struct backref_edge, list[LOWER]);
2386 edges[index++] = edge;
2387 next = edge->node[UPPER];
2388 }
2389 next = walk_down_backref(edges, &index);
2390 }
2391 return num_bytes;
1968} 2392}
1969 2393
1970static void grab_path_buffers(struct btrfs_path *path, 2394static int reserve_metadata_space(struct btrfs_trans_handle *trans,
1971 struct backref_node *node, 2395 struct reloc_control *rc,
1972 struct backref_edge *edges[], int nr) 2396 struct backref_node *node)
1973{ 2397{
1974 int i = 0; 2398 struct btrfs_root *root = rc->extent_root;
1975 while (1) { 2399 u64 num_bytes;
1976 drop_node_buffer(node); 2400 int ret;
1977 node->eb = path->nodes[node->level];
1978 BUG_ON(!node->eb);
1979 if (path->locks[node->level])
1980 node->locked = 1;
1981 path->nodes[node->level] = NULL;
1982 path->locks[node->level] = 0;
1983
1984 if (i >= nr)
1985 break;
1986 2401
1987 edges[i]->blockptr = node->eb->start; 2402 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
1988 node = edges[i]->node[UPPER]; 2403
1989 i++; 2404 trans->block_rsv = rc->block_rsv;
2405 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
2406 if (ret) {
2407 if (ret == -EAGAIN)
2408 rc->commit_transaction = 1;
2409 return ret;
1990 } 2410 }
2411
2412 return 0;
2413}
2414
2415static void release_metadata_space(struct reloc_control *rc,
2416 struct backref_node *node)
2417{
2418 u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2419 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
1991} 2420}
1992 2421
1993/* 2422/*
@@ -1998,6 +2427,7 @@ static void grab_path_buffers(struct btrfs_path *path,
1998 * in that case this function just updates pointers. 2427 * in that case this function just updates pointers.
1999 */ 2428 */
2000static int do_relocation(struct btrfs_trans_handle *trans, 2429static int do_relocation(struct btrfs_trans_handle *trans,
2430 struct reloc_control *rc,
2001 struct backref_node *node, 2431 struct backref_node *node,
2002 struct btrfs_key *key, 2432 struct btrfs_key *key,
2003 struct btrfs_path *path, int lowest) 2433 struct btrfs_path *path, int lowest)
@@ -2018,18 +2448,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2018 BUG_ON(lowest && node->eb); 2448 BUG_ON(lowest && node->eb);
2019 2449
2020 path->lowest_level = node->level + 1; 2450 path->lowest_level = node->level + 1;
2451 rc->backref_cache.path[node->level] = node;
2021 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2452 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2022 cond_resched(); 2453 cond_resched();
2023 if (node->eb && node->eb->start == edge->blockptr)
2024 continue;
2025 2454
2026 upper = edge->node[UPPER]; 2455 upper = edge->node[UPPER];
2027 root = select_reloc_root(trans, upper, edges, &nr); 2456 root = select_reloc_root(trans, rc, upper, edges, &nr);
2028 if (!root) 2457 BUG_ON(!root);
2029 continue; 2458
2030 2459 if (upper->eb && !upper->locked) {
2031 if (upper->eb && !upper->locked) 2460 if (!lowest) {
2461 ret = btrfs_bin_search(upper->eb, key,
2462 upper->level, &slot);
2463 BUG_ON(ret);
2464 bytenr = btrfs_node_blockptr(upper->eb, slot);
2465 if (node->eb->start == bytenr)
2466 goto next;
2467 }
2032 drop_node_buffer(upper); 2468 drop_node_buffer(upper);
2469 }
2033 2470
2034 if (!upper->eb) { 2471 if (!upper->eb) {
2035 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2472 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2039,11 +2476,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2039 } 2476 }
2040 BUG_ON(ret > 0); 2477 BUG_ON(ret > 0);
2041 2478
2042 slot = path->slots[upper->level]; 2479 if (!upper->eb) {
2480 upper->eb = path->nodes[upper->level];
2481 path->nodes[upper->level] = NULL;
2482 } else {
2483 BUG_ON(upper->eb != path->nodes[upper->level]);
2484 }
2043 2485
2044 btrfs_unlock_up_safe(path, upper->level + 1); 2486 upper->locked = 1;
2045 grab_path_buffers(path, upper, edges, nr); 2487 path->locks[upper->level] = 0;
2046 2488
2489 slot = path->slots[upper->level];
2047 btrfs_release_path(NULL, path); 2490 btrfs_release_path(NULL, path);
2048 } else { 2491 } else {
2049 ret = btrfs_bin_search(upper->eb, key, upper->level, 2492 ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2052,14 +2495,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2052 } 2495 }
2053 2496
2054 bytenr = btrfs_node_blockptr(upper->eb, slot); 2497 bytenr = btrfs_node_blockptr(upper->eb, slot);
2055 if (!lowest) { 2498 if (lowest) {
2056 if (node->eb->start == bytenr) { 2499 BUG_ON(bytenr != node->bytenr);
2057 btrfs_tree_unlock(upper->eb);
2058 upper->locked = 0;
2059 continue;
2060 }
2061 } else { 2500 } else {
2062 BUG_ON(node->bytenr != bytenr); 2501 if (node->eb->start == bytenr)
2502 goto next;
2063 } 2503 }
2064 2504
2065 blocksize = btrfs_level_size(root, node->level); 2505 blocksize = btrfs_level_size(root, node->level);
@@ -2071,13 +2511,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2071 if (!node->eb) { 2511 if (!node->eb) {
2072 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2512 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2073 slot, &eb); 2513 slot, &eb);
2514 btrfs_tree_unlock(eb);
2515 free_extent_buffer(eb);
2074 if (ret < 0) { 2516 if (ret < 0) {
2075 err = ret; 2517 err = ret;
2076 break; 2518 goto next;
2077 } 2519 }
2078 btrfs_set_lock_blocking(eb); 2520 BUG_ON(node->eb != eb);
2079 node->eb = eb;
2080 node->locked = 1;
2081 } else { 2521 } else {
2082 btrfs_set_node_blockptr(upper->eb, slot, 2522 btrfs_set_node_blockptr(upper->eb, slot,
2083 node->eb->start); 2523 node->eb->start);
@@ -2095,67 +2535,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2095 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2535 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2096 BUG_ON(ret); 2536 BUG_ON(ret);
2097 } 2537 }
2098 if (!lowest) { 2538next:
2099 btrfs_tree_unlock(upper->eb); 2539 if (!upper->pending)
2100 upper->locked = 0; 2540 drop_node_buffer(upper);
2101 } 2541 else
2542 unlock_node_buffer(upper);
2543 if (err)
2544 break;
2102 } 2545 }
2546
2547 if (!err && node->pending) {
2548 drop_node_buffer(node);
2549 list_move_tail(&node->list, &rc->backref_cache.changed);
2550 node->pending = 0;
2551 }
2552
2103 path->lowest_level = 0; 2553 path->lowest_level = 0;
2554 BUG_ON(err == -ENOSPC);
2104 return err; 2555 return err;
2105} 2556}
2106 2557
2107static int link_to_upper(struct btrfs_trans_handle *trans, 2558static int link_to_upper(struct btrfs_trans_handle *trans,
2559 struct reloc_control *rc,
2108 struct backref_node *node, 2560 struct backref_node *node,
2109 struct btrfs_path *path) 2561 struct btrfs_path *path)
2110{ 2562{
2111 struct btrfs_key key; 2563 struct btrfs_key key;
2112 if (!node->eb || list_empty(&node->upper))
2113 return 0;
2114 2564
2115 btrfs_node_key_to_cpu(node->eb, &key, 0); 2565 btrfs_node_key_to_cpu(node->eb, &key, 0);
2116 return do_relocation(trans, node, &key, path, 0); 2566 return do_relocation(trans, rc, node, &key, path, 0);
2117} 2567}
2118 2568
2119static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2569static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2120 struct backref_cache *cache, 2570 struct reloc_control *rc,
2121 struct btrfs_path *path) 2571 struct btrfs_path *path, int err)
2122{ 2572{
2573 LIST_HEAD(list);
2574 struct backref_cache *cache = &rc->backref_cache;
2123 struct backref_node *node; 2575 struct backref_node *node;
2124 int level; 2576 int level;
2125 int ret; 2577 int ret;
2126 int err = 0;
2127 2578
2128 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2579 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2129 while (!list_empty(&cache->pending[level])) { 2580 while (!list_empty(&cache->pending[level])) {
2130 node = list_entry(cache->pending[level].next, 2581 node = list_entry(cache->pending[level].next,
2131 struct backref_node, lower); 2582 struct backref_node, list);
2132 BUG_ON(node->level != level); 2583 list_move_tail(&node->list, &list);
2584 BUG_ON(!node->pending);
2133 2585
2134 ret = link_to_upper(trans, node, path); 2586 if (!err) {
2135 if (ret < 0) 2587 ret = link_to_upper(trans, rc, node, path);
2136 err = ret; 2588 if (ret < 0)
2137 /* 2589 err = ret;
2138 * this remove the node from the pending list and 2590 }
2139 * may add some other nodes to the level + 1
2140 * pending list
2141 */
2142 remove_backref_node(cache, node);
2143 } 2591 }
2592 list_splice_init(&list, &cache->pending[level]);
2144 } 2593 }
2145 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2146 return err; 2594 return err;
2147} 2595}
2148 2596
2149static void mark_block_processed(struct reloc_control *rc, 2597static void mark_block_processed(struct reloc_control *rc,
2150 struct backref_node *node) 2598 u64 bytenr, u32 blocksize)
2599{
2600 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2601 EXTENT_DIRTY, GFP_NOFS);
2602}
2603
2604static void __mark_block_processed(struct reloc_control *rc,
2605 struct backref_node *node)
2151{ 2606{
2152 u32 blocksize; 2607 u32 blocksize;
2153 if (node->level == 0 || 2608 if (node->level == 0 ||
2154 in_block_group(node->bytenr, rc->block_group)) { 2609 in_block_group(node->bytenr, rc->block_group)) {
2155 blocksize = btrfs_level_size(rc->extent_root, node->level); 2610 blocksize = btrfs_level_size(rc->extent_root, node->level);
2156 set_extent_bits(&rc->processed_blocks, node->bytenr, 2611 mark_block_processed(rc, node->bytenr, blocksize);
2157 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2158 GFP_NOFS);
2159 } 2612 }
2160 node->processed = 1; 2613 node->processed = 1;
2161} 2614}
@@ -2178,7 +2631,7 @@ static void update_processed_blocks(struct reloc_control *rc,
2178 if (next->processed) 2631 if (next->processed)
2179 break; 2632 break;
2180 2633
2181 mark_block_processed(rc, next); 2634 __mark_block_processed(rc, next);
2182 2635
2183 if (list_empty(&next->upper)) 2636 if (list_empty(&next->upper))
2184 break; 2637 break;
@@ -2201,138 +2654,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
2201 return 0; 2654 return 0;
2202} 2655}
2203 2656
2204/*
2205 * check if there are any file extent pointers in the leaf point to
2206 * data require processing
2207 */
2208static int check_file_extents(struct reloc_control *rc,
2209 u64 bytenr, u32 blocksize, u64 ptr_gen)
2210{
2211 struct btrfs_key found_key;
2212 struct btrfs_file_extent_item *fi;
2213 struct extent_buffer *leaf;
2214 u32 nritems;
2215 int i;
2216 int ret = 0;
2217
2218 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2219
2220 nritems = btrfs_header_nritems(leaf);
2221 for (i = 0; i < nritems; i++) {
2222 cond_resched();
2223 btrfs_item_key_to_cpu(leaf, &found_key, i);
2224 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2225 continue;
2226 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2227 if (btrfs_file_extent_type(leaf, fi) ==
2228 BTRFS_FILE_EXTENT_INLINE)
2229 continue;
2230 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2231 if (bytenr == 0)
2232 continue;
2233 if (in_block_group(bytenr, rc->block_group)) {
2234 ret = 1;
2235 break;
2236 }
2237 }
2238 free_extent_buffer(leaf);
2239 return ret;
2240}
2241
2242/*
2243 * scan child blocks of a given block to find blocks require processing
2244 */
2245static int add_child_blocks(struct btrfs_trans_handle *trans,
2246 struct reloc_control *rc,
2247 struct backref_node *node,
2248 struct rb_root *blocks)
2249{
2250 struct tree_block *block;
2251 struct rb_node *rb_node;
2252 u64 bytenr;
2253 u64 ptr_gen;
2254 u32 blocksize;
2255 u32 nritems;
2256 int i;
2257 int err = 0;
2258
2259 nritems = btrfs_header_nritems(node->eb);
2260 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2261 for (i = 0; i < nritems; i++) {
2262 cond_resched();
2263 bytenr = btrfs_node_blockptr(node->eb, i);
2264 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2265 if (ptr_gen == trans->transid)
2266 continue;
2267 if (!in_block_group(bytenr, rc->block_group) &&
2268 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2269 continue;
2270 if (tree_block_processed(bytenr, blocksize, rc))
2271 continue;
2272
2273 readahead_tree_block(rc->extent_root,
2274 bytenr, blocksize, ptr_gen);
2275 }
2276
2277 for (i = 0; i < nritems; i++) {
2278 cond_resched();
2279 bytenr = btrfs_node_blockptr(node->eb, i);
2280 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2281 if (ptr_gen == trans->transid)
2282 continue;
2283 if (!in_block_group(bytenr, rc->block_group) &&
2284 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2285 continue;
2286 if (tree_block_processed(bytenr, blocksize, rc))
2287 continue;
2288 if (!in_block_group(bytenr, rc->block_group) &&
2289 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2290 continue;
2291
2292 block = kmalloc(sizeof(*block), GFP_NOFS);
2293 if (!block) {
2294 err = -ENOMEM;
2295 break;
2296 }
2297 block->bytenr = bytenr;
2298 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2299 block->level = node->level - 1;
2300 block->key_ready = 1;
2301 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2302 BUG_ON(rb_node);
2303 }
2304 if (err)
2305 free_block_list(blocks);
2306 return err;
2307}
2308
2309/*
2310 * find adjacent blocks require processing
2311 */
2312static noinline_for_stack
2313int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2314 struct reloc_control *rc,
2315 struct backref_cache *cache,
2316 struct rb_root *blocks, int level,
2317 struct backref_node **upper)
2318{
2319 struct backref_node *node;
2320 int ret = 0;
2321
2322 WARN_ON(!list_empty(&cache->pending[level]));
2323
2324 if (list_empty(&cache->pending[level + 1]))
2325 return 1;
2326
2327 node = list_entry(cache->pending[level + 1].next,
2328 struct backref_node, lower);
2329 if (node->eb)
2330 ret = add_child_blocks(trans, rc, node, blocks);
2331
2332 *upper = node;
2333 return ret;
2334}
2335
2336static int get_tree_block_key(struct reloc_control *rc, 2657static int get_tree_block_key(struct reloc_control *rc,
2337 struct tree_block *block) 2658 struct tree_block *block)
2338{ 2659{
@@ -2370,40 +2691,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2370 struct btrfs_path *path) 2691 struct btrfs_path *path)
2371{ 2692{
2372 struct btrfs_root *root; 2693 struct btrfs_root *root;
2373 int ret; 2694 int release = 0;
2695 int ret = 0;
2374 2696
2697 if (!node)
2698 return 0;
2699
2700 BUG_ON(node->processed);
2375 root = select_one_root(trans, node); 2701 root = select_one_root(trans, node);
2376 if (unlikely(!root)) { 2702 if (root == ERR_PTR(-ENOENT)) {
2377 rc->found_old_snapshot = 1;
2378 update_processed_blocks(rc, node); 2703 update_processed_blocks(rc, node);
2379 return 0; 2704 goto out;
2380 } 2705 }
2381 2706
2382 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2707 if (!root || root->ref_cows) {
2383 ret = do_relocation(trans, node, key, path, 1); 2708 ret = reserve_metadata_space(trans, rc, node);
2384 if (ret < 0) 2709 if (ret)
2385 goto out;
2386 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2387 ret = replace_file_extents(trans, rc, root,
2388 node->eb, NULL);
2389 if (ret < 0)
2390 goto out;
2391 }
2392 drop_node_buffer(node);
2393 } else if (!root->ref_cows) {
2394 path->lowest_level = node->level;
2395 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2396 btrfs_release_path(root, path);
2397 if (ret < 0)
2398 goto out; 2710 goto out;
2399 } else if (root != node->root) { 2711 release = 1;
2400 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2401 } 2712 }
2402 2713
2403 update_processed_blocks(rc, node); 2714 if (root) {
2404 ret = 0; 2715 if (root->ref_cows) {
2716 BUG_ON(node->new_bytenr);
2717 BUG_ON(!list_empty(&node->list));
2718 btrfs_record_root_in_trans(trans, root);
2719 root = root->reloc_root;
2720 node->new_bytenr = root->node->start;
2721 node->root = root;
2722 list_add_tail(&node->list, &rc->backref_cache.changed);
2723 } else {
2724 path->lowest_level = node->level;
2725 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2726 btrfs_release_path(root, path);
2727 if (ret > 0)
2728 ret = 0;
2729 }
2730 if (!ret)
2731 update_processed_blocks(rc, node);
2732 } else {
2733 ret = do_relocation(trans, rc, node, key, path, 1);
2734 }
2405out: 2735out:
2406 drop_node_buffer(node); 2736 if (ret || node->level == 0 || node->cowonly) {
2737 if (release)
2738 release_metadata_space(rc, node);
2739 remove_backref_node(&rc->backref_cache, node);
2740 }
2407 return ret; 2741 return ret;
2408} 2742}
2409 2743
@@ -2414,12 +2748,10 @@ static noinline_for_stack
2414int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2748int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2415 struct reloc_control *rc, struct rb_root *blocks) 2749 struct reloc_control *rc, struct rb_root *blocks)
2416{ 2750{
2417 struct backref_cache *cache;
2418 struct backref_node *node; 2751 struct backref_node *node;
2419 struct btrfs_path *path; 2752 struct btrfs_path *path;
2420 struct tree_block *block; 2753 struct tree_block *block;
2421 struct rb_node *rb_node; 2754 struct rb_node *rb_node;
2422 int level = -1;
2423 int ret; 2755 int ret;
2424 int err = 0; 2756 int err = 0;
2425 2757
@@ -2427,21 +2759,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2427 if (!path) 2759 if (!path)
2428 return -ENOMEM; 2760 return -ENOMEM;
2429 2761
2430 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2431 if (!cache) {
2432 btrfs_free_path(path);
2433 return -ENOMEM;
2434 }
2435
2436 backref_cache_init(cache);
2437
2438 rb_node = rb_first(blocks); 2762 rb_node = rb_first(blocks);
2439 while (rb_node) { 2763 while (rb_node) {
2440 block = rb_entry(rb_node, struct tree_block, rb_node); 2764 block = rb_entry(rb_node, struct tree_block, rb_node);
2441 if (level == -1)
2442 level = block->level;
2443 else
2444 BUG_ON(level != block->level);
2445 if (!block->key_ready) 2765 if (!block->key_ready)
2446 reada_tree_block(rc, block); 2766 reada_tree_block(rc, block);
2447 rb_node = rb_next(rb_node); 2767 rb_node = rb_next(rb_node);
@@ -2459,7 +2779,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2459 while (rb_node) { 2779 while (rb_node) {
2460 block = rb_entry(rb_node, struct tree_block, rb_node); 2780 block = rb_entry(rb_node, struct tree_block, rb_node);
2461 2781
2462 node = build_backref_tree(rc, cache, &block->key, 2782 node = build_backref_tree(rc, &block->key,
2463 block->level, block->bytenr); 2783 block->level, block->bytenr);
2464 if (IS_ERR(node)) { 2784 if (IS_ERR(node)) {
2465 err = PTR_ERR(node); 2785 err = PTR_ERR(node);
@@ -2469,79 +2789,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2469 ret = relocate_tree_block(trans, rc, node, &block->key, 2789 ret = relocate_tree_block(trans, rc, node, &block->key,
2470 path); 2790 path);
2471 if (ret < 0) { 2791 if (ret < 0) {
2472 err = ret; 2792 if (ret != -EAGAIN || rb_node == rb_first(blocks))
2793 err = ret;
2473 goto out; 2794 goto out;
2474 } 2795 }
2475 remove_backref_node(cache, node);
2476 rb_node = rb_next(rb_node); 2796 rb_node = rb_next(rb_node);
2477 } 2797 }
2478 2798out:
2479 if (level > 0)
2480 goto out;
2481
2482 free_block_list(blocks); 2799 free_block_list(blocks);
2800 err = finish_pending_nodes(trans, rc, path, err);
2483 2801
2484 /* 2802 btrfs_free_path(path);
2485 * now backrefs of some upper level tree blocks have been cached, 2803 return err;
2486 * try relocating blocks referenced by these upper level blocks. 2804}
2487 */
2488 while (1) {
2489 struct backref_node *upper = NULL;
2490 if (trans->transaction->in_commit ||
2491 trans->transaction->delayed_refs.flushing)
2492 break;
2493 2805
2494 ret = add_adjacent_blocks(trans, rc, cache, blocks, level, 2806static noinline_for_stack
2495 &upper); 2807int prealloc_file_extent_cluster(struct inode *inode,
2496 if (ret < 0) 2808 struct file_extent_cluster *cluster)
2497 err = ret; 2809{
2498 if (ret != 0) 2810 u64 alloc_hint = 0;
2499 break; 2811 u64 start;
2812 u64 end;
2813 u64 offset = BTRFS_I(inode)->index_cnt;
2814 u64 num_bytes;
2815 int nr = 0;
2816 int ret = 0;
2500 2817
2501 rb_node = rb_first(blocks); 2818 BUG_ON(cluster->start != cluster->boundary[0]);
2502 while (rb_node) { 2819 mutex_lock(&inode->i_mutex);
2503 block = rb_entry(rb_node, struct tree_block, rb_node);
2504 if (trans->transaction->in_commit ||
2505 trans->transaction->delayed_refs.flushing)
2506 goto out;
2507 BUG_ON(!block->key_ready);
2508 node = build_backref_tree(rc, cache, &block->key,
2509 level, block->bytenr);
2510 if (IS_ERR(node)) {
2511 err = PTR_ERR(node);
2512 goto out;
2513 }
2514 2820
2515 ret = relocate_tree_block(trans, rc, node, 2821 ret = btrfs_check_data_free_space(inode, cluster->end +
2516 &block->key, path); 2822 1 - cluster->start);
2517 if (ret < 0) { 2823 if (ret)
2518 err = ret; 2824 goto out;
2519 goto out;
2520 }
2521 remove_backref_node(cache, node);
2522 rb_node = rb_next(rb_node);
2523 }
2524 free_block_list(blocks);
2525 2825
2526 if (upper) { 2826 while (nr < cluster->nr) {
2527 ret = link_to_upper(trans, upper, path); 2827 start = cluster->boundary[nr] - offset;
2528 if (ret < 0) { 2828 if (nr + 1 < cluster->nr)
2529 err = ret; 2829 end = cluster->boundary[nr + 1] - 1 - offset;
2530 break; 2830 else
2531 } 2831 end = cluster->end - offset;
2532 remove_backref_node(cache, upper); 2832
2533 } 2833 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2834 num_bytes = end + 1 - start;
2835 ret = btrfs_prealloc_file_range(inode, 0, start,
2836 num_bytes, num_bytes,
2837 end + 1, &alloc_hint);
2838 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2839 if (ret)
2840 break;
2841 nr++;
2534 } 2842 }
2843 btrfs_free_reserved_data_space(inode, cluster->end +
2844 1 - cluster->start);
2535out: 2845out:
2536 free_block_list(blocks); 2846 mutex_unlock(&inode->i_mutex);
2537 2847 return ret;
2538 ret = finish_pending_nodes(trans, cache, path);
2539 if (ret < 0)
2540 err = ret;
2541
2542 kfree(cache);
2543 btrfs_free_path(path);
2544 return err;
2545} 2848}
2546 2849
2547static noinline_for_stack 2850static noinline_for_stack
@@ -2587,7 +2890,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
2587 u64 offset = BTRFS_I(inode)->index_cnt; 2890 u64 offset = BTRFS_I(inode)->index_cnt;
2588 unsigned long index; 2891 unsigned long index;
2589 unsigned long last_index; 2892 unsigned long last_index;
2590 unsigned int dirty_page = 0;
2591 struct page *page; 2893 struct page *page;
2592 struct file_ra_state *ra; 2894 struct file_ra_state *ra;
2593 int nr = 0; 2895 int nr = 0;
@@ -2600,21 +2902,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
2600 if (!ra) 2902 if (!ra)
2601 return -ENOMEM; 2903 return -ENOMEM;
2602 2904
2603 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2905 ret = prealloc_file_extent_cluster(inode, cluster);
2604 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2906 if (ret)
2907 goto out;
2605 2908
2606 mutex_lock(&inode->i_mutex); 2909 file_ra_state_init(ra, inode->i_mapping);
2607 2910
2608 i_size_write(inode, cluster->end + 1 - offset);
2609 ret = setup_extent_mapping(inode, cluster->start - offset, 2911 ret = setup_extent_mapping(inode, cluster->start - offset,
2610 cluster->end - offset, cluster->start); 2912 cluster->end - offset, cluster->start);
2611 if (ret) 2913 if (ret)
2612 goto out_unlock; 2914 goto out;
2613
2614 file_ra_state_init(ra, inode->i_mapping);
2615 2915
2616 WARN_ON(cluster->start != cluster->boundary[0]); 2916 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2917 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2617 while (index <= last_index) { 2918 while (index <= last_index) {
2919 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2920 if (ret)
2921 goto out;
2922
2618 page = find_lock_page(inode->i_mapping, index); 2923 page = find_lock_page(inode->i_mapping, index);
2619 if (!page) { 2924 if (!page) {
2620 page_cache_sync_readahead(inode->i_mapping, 2925 page_cache_sync_readahead(inode->i_mapping,
@@ -2622,8 +2927,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2622 last_index + 1 - index); 2927 last_index + 1 - index);
2623 page = grab_cache_page(inode->i_mapping, index); 2928 page = grab_cache_page(inode->i_mapping, index);
2624 if (!page) { 2929 if (!page) {
2930 btrfs_delalloc_release_metadata(inode,
2931 PAGE_CACHE_SIZE);
2625 ret = -ENOMEM; 2932 ret = -ENOMEM;
2626 goto out_unlock; 2933 goto out;
2627 } 2934 }
2628 } 2935 }
2629 2936
@@ -2639,8 +2946,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2639 if (!PageUptodate(page)) { 2946 if (!PageUptodate(page)) {
2640 unlock_page(page); 2947 unlock_page(page);
2641 page_cache_release(page); 2948 page_cache_release(page);
2949 btrfs_delalloc_release_metadata(inode,
2950 PAGE_CACHE_SIZE);
2642 ret = -EIO; 2951 ret = -EIO;
2643 goto out_unlock; 2952 goto out;
2644 } 2953 }
2645 } 2954 }
2646 2955
@@ -2659,10 +2968,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2659 EXTENT_BOUNDARY, GFP_NOFS); 2968 EXTENT_BOUNDARY, GFP_NOFS);
2660 nr++; 2969 nr++;
2661 } 2970 }
2662 btrfs_set_extent_delalloc(inode, page_start, page_end);
2663 2971
2972 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2664 set_page_dirty(page); 2973 set_page_dirty(page);
2665 dirty_page++;
2666 2974
2667 unlock_extent(&BTRFS_I(inode)->io_tree, 2975 unlock_extent(&BTRFS_I(inode)->io_tree,
2668 page_start, page_end, GFP_NOFS); 2976 page_start, page_end, GFP_NOFS);
@@ -2670,20 +2978,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
2670 page_cache_release(page); 2978 page_cache_release(page);
2671 2979
2672 index++; 2980 index++;
2673 if (nr < cluster->nr && 2981 balance_dirty_pages_ratelimited(inode->i_mapping);
2674 page_end + 1 + offset == cluster->boundary[nr]) { 2982 btrfs_throttle(BTRFS_I(inode)->root);
2675 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2676 dirty_page);
2677 dirty_page = 0;
2678 }
2679 }
2680 if (dirty_page) {
2681 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2682 dirty_page);
2683 } 2983 }
2684 WARN_ON(nr != cluster->nr); 2984 WARN_ON(nr != cluster->nr);
2685out_unlock: 2985out:
2686 mutex_unlock(&inode->i_mutex);
2687 kfree(ra); 2986 kfree(ra);
2688 return ret; 2987 return ret;
2689} 2988}
@@ -2795,6 +3094,8 @@ static int add_tree_block(struct reloc_control *rc,
2795 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 3094 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2796 ret = get_ref_objectid_v0(rc, path, extent_key, 3095 ret = get_ref_objectid_v0(rc, path, extent_key,
2797 &ref_owner, NULL); 3096 &ref_owner, NULL);
3097 if (ret < 0)
3098 return ret;
2798 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); 3099 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
2799 level = (int)ref_owner; 3100 level = (int)ref_owner;
2800 /* FIXME: get real generation */ 3101 /* FIXME: get real generation */
@@ -2869,9 +3170,6 @@ out:
2869static int block_use_full_backref(struct reloc_control *rc, 3170static int block_use_full_backref(struct reloc_control *rc,
2870 struct extent_buffer *eb) 3171 struct extent_buffer *eb)
2871{ 3172{
2872 struct btrfs_path *path;
2873 struct btrfs_extent_item *ei;
2874 struct btrfs_key key;
2875 u64 flags; 3173 u64 flags;
2876 int ret; 3174 int ret;
2877 3175
@@ -2879,28 +3177,62 @@ static int block_use_full_backref(struct reloc_control *rc,
2879 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) 3177 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2880 return 1; 3178 return 1;
2881 3179
2882 path = btrfs_alloc_path(); 3180 ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
2883 BUG_ON(!path); 3181 eb->start, eb->len, NULL, &flags);
2884
2885 key.objectid = eb->start;
2886 key.type = BTRFS_EXTENT_ITEM_KEY;
2887 key.offset = eb->len;
2888
2889 path->search_commit_root = 1;
2890 path->skip_locking = 1;
2891 ret = btrfs_search_slot(NULL, rc->extent_root,
2892 &key, path, 0, 0);
2893 BUG_ON(ret); 3182 BUG_ON(ret);
2894 3183
2895 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2896 struct btrfs_extent_item);
2897 flags = btrfs_extent_flags(path->nodes[0], ei);
2898 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2899 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 3184 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2900 ret = 1; 3185 ret = 1;
2901 else 3186 else
2902 ret = 0; 3187 ret = 0;
3188 return ret;
3189}
3190
3191static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3192 struct inode *inode, u64 ino)
3193{
3194 struct btrfs_key key;
3195 struct btrfs_path *path;
3196 struct btrfs_root *root = fs_info->tree_root;
3197 struct btrfs_trans_handle *trans;
3198 unsigned long nr;
3199 int ret = 0;
3200
3201 if (inode)
3202 goto truncate;
3203
3204 key.objectid = ino;
3205 key.type = BTRFS_INODE_ITEM_KEY;
3206 key.offset = 0;
3207
3208 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3209 if (!inode || IS_ERR(inode) || is_bad_inode(inode)) {
3210 if (inode && !IS_ERR(inode))
3211 iput(inode);
3212 return -ENOENT;
3213 }
3214
3215truncate:
3216 path = btrfs_alloc_path();
3217 if (!path) {
3218 ret = -ENOMEM;
3219 goto out;
3220 }
3221
3222 trans = btrfs_join_transaction(root, 0);
3223 if (IS_ERR(trans)) {
3224 btrfs_free_path(path);
3225 goto out;
3226 }
3227
3228 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3229
2903 btrfs_free_path(path); 3230 btrfs_free_path(path);
3231 nr = trans->blocks_used;
3232 btrfs_end_transaction(trans, root);
3233 btrfs_btree_balance_dirty(root, nr);
3234out:
3235 iput(inode);
2904 return ret; 3236 return ret;
2905} 3237}
2906 3238
@@ -2930,15 +3262,27 @@ static int find_data_references(struct reloc_control *rc,
2930 int counted; 3262 int counted;
2931 int ret; 3263 int ret;
2932 3264
2933 path = btrfs_alloc_path();
2934 if (!path)
2935 return -ENOMEM;
2936
2937 ref_root = btrfs_extent_data_ref_root(leaf, ref); 3265 ref_root = btrfs_extent_data_ref_root(leaf, ref);
2938 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); 3266 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
2939 ref_offset = btrfs_extent_data_ref_offset(leaf, ref); 3267 ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
2940 ref_count = btrfs_extent_data_ref_count(leaf, ref); 3268 ref_count = btrfs_extent_data_ref_count(leaf, ref);
2941 3269
3270 /*
3271 * This is an extent belonging to the free space cache, lets just delete
3272 * it and redo the search.
3273 */
3274 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3275 ret = delete_block_group_cache(rc->extent_root->fs_info,
3276 NULL, ref_objectid);
3277 if (ret != -ENOENT)
3278 return ret;
3279 ret = 0;
3280 }
3281
3282 path = btrfs_alloc_path();
3283 if (!path)
3284 return -ENOMEM;
3285
2942 root = read_fs_root(rc->extent_root->fs_info, ref_root); 3286 root = read_fs_root(rc->extent_root->fs_info, ref_root);
2943 if (IS_ERR(root)) { 3287 if (IS_ERR(root)) {
2944 err = PTR_ERR(root); 3288 err = PTR_ERR(root);
@@ -3073,22 +3417,10 @@ int add_data_references(struct reloc_control *rc,
3073 struct btrfs_extent_inline_ref *iref; 3417 struct btrfs_extent_inline_ref *iref;
3074 unsigned long ptr; 3418 unsigned long ptr;
3075 unsigned long end; 3419 unsigned long end;
3076 u32 blocksize; 3420 u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3077 int ret; 3421 int ret;
3078 int err = 0; 3422 int err = 0;
3079 3423
3080 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3081 extent_key->offset);
3082 BUG_ON(ret < 0);
3083 if (ret > 0) {
3084 /* the relocated data is fragmented */
3085 rc->extents_skipped++;
3086 btrfs_release_path(rc->extent_root, path);
3087 return 0;
3088 }
3089
3090 blocksize = btrfs_level_size(rc->extent_root, 0);
3091
3092 eb = path->nodes[0]; 3424 eb = path->nodes[0];
3093 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3425 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3094 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3426 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3169,7 +3501,8 @@ int add_data_references(struct reloc_control *rc,
3169 */ 3501 */
3170static noinline_for_stack 3502static noinline_for_stack
3171int find_next_extent(struct btrfs_trans_handle *trans, 3503int find_next_extent(struct btrfs_trans_handle *trans,
3172 struct reloc_control *rc, struct btrfs_path *path) 3504 struct reloc_control *rc, struct btrfs_path *path,
3505 struct btrfs_key *extent_key)
3173{ 3506{
3174 struct btrfs_key key; 3507 struct btrfs_key key;
3175 struct extent_buffer *leaf; 3508 struct extent_buffer *leaf;
@@ -3224,6 +3557,7 @@ next:
3224 rc->search_start = end + 1; 3557 rc->search_start = end + 1;
3225 } else { 3558 } else {
3226 rc->search_start = key.objectid + key.offset; 3559 rc->search_start = key.objectid + key.offset;
3560 memcpy(extent_key, &key, sizeof(key));
3227 return 0; 3561 return 0;
3228 } 3562 }
3229 } 3563 }
@@ -3261,12 +3595,47 @@ static int check_extent_flags(u64 flags)
3261 return 0; 3595 return 0;
3262} 3596}
3263 3597
3598static noinline_for_stack
3599int prepare_to_relocate(struct reloc_control *rc)
3600{
3601 struct btrfs_trans_handle *trans;
3602 int ret;
3603
3604 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
3605 if (!rc->block_rsv)
3606 return -ENOMEM;
3607
3608 /*
3609 * reserve some space for creating reloc trees.
3610 * btrfs_init_reloc_root will use them when there
3611 * is no reservation in transaction handle.
3612 */
3613 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3614 rc->extent_root->nodesize * 256);
3615 if (ret)
3616 return ret;
3617
3618 rc->block_rsv->refill_used = 1;
3619 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3620
3621 memset(&rc->cluster, 0, sizeof(rc->cluster));
3622 rc->search_start = rc->block_group->key.objectid;
3623 rc->extents_found = 0;
3624 rc->nodes_relocated = 0;
3625 rc->merging_rsv_size = 0;
3626
3627 rc->create_reloc_tree = 1;
3628 set_reloc_control(rc);
3629
3630 trans = btrfs_join_transaction(rc->extent_root, 1);
3631 btrfs_commit_transaction(trans, rc->extent_root);
3632 return 0;
3633}
3264 3634
3265static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3635static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3266{ 3636{
3267 struct rb_root blocks = RB_ROOT; 3637 struct rb_root blocks = RB_ROOT;
3268 struct btrfs_key key; 3638 struct btrfs_key key;
3269 struct file_extent_cluster *cluster;
3270 struct btrfs_trans_handle *trans = NULL; 3639 struct btrfs_trans_handle *trans = NULL;
3271 struct btrfs_path *path; 3640 struct btrfs_path *path;
3272 struct btrfs_extent_item *ei; 3641 struct btrfs_extent_item *ei;
@@ -3276,33 +3645,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3276 int ret; 3645 int ret;
3277 int err = 0; 3646 int err = 0;
3278 3647
3279 cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3280 if (!cluster)
3281 return -ENOMEM;
3282
3283 path = btrfs_alloc_path(); 3648 path = btrfs_alloc_path();
3284 if (!path) { 3649 if (!path)
3285 kfree(cluster);
3286 return -ENOMEM; 3650 return -ENOMEM;
3287 }
3288 3651
3289 rc->extents_found = 0; 3652 ret = prepare_to_relocate(rc);
3290 rc->extents_skipped = 0; 3653 if (ret) {
3291 3654 err = ret;
3292 rc->search_start = rc->block_group->key.objectid; 3655 goto out_free;
3293 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, 3656 }
3294 GFP_NOFS);
3295
3296 rc->create_reloc_root = 1;
3297 set_reloc_control(rc);
3298
3299 trans = btrfs_start_transaction(rc->extent_root, 1);
3300 btrfs_commit_transaction(trans, rc->extent_root);
3301 3657
3302 while (1) { 3658 while (1) {
3303 trans = btrfs_start_transaction(rc->extent_root, 1); 3659 trans = btrfs_start_transaction(rc->extent_root, 0);
3660
3661 if (update_backref_cache(trans, &rc->backref_cache)) {
3662 btrfs_end_transaction(trans, rc->extent_root);
3663 continue;
3664 }
3304 3665
3305 ret = find_next_extent(trans, rc, path); 3666 ret = find_next_extent(trans, rc, path, &key);
3306 if (ret < 0) 3667 if (ret < 0)
3307 err = ret; 3668 err = ret;
3308 if (ret != 0) 3669 if (ret != 0)
@@ -3312,9 +3673,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3312 3673
3313 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3674 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3314 struct btrfs_extent_item); 3675 struct btrfs_extent_item);
3315 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3676 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3316 item_size = btrfs_item_size_nr(path->nodes[0],
3317 path->slots[0]);
3318 if (item_size >= sizeof(*ei)) { 3677 if (item_size >= sizeof(*ei)) {
3319 flags = btrfs_extent_flags(path->nodes[0], ei); 3678 flags = btrfs_extent_flags(path->nodes[0], ei);
3320 ret = check_extent_flags(flags); 3679 ret = check_extent_flags(flags);
@@ -3355,73 +3714,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3355 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3714 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3356 ret = add_tree_block(rc, &key, path, &blocks); 3715 ret = add_tree_block(rc, &key, path, &blocks);
3357 } else if (rc->stage == UPDATE_DATA_PTRS && 3716 } else if (rc->stage == UPDATE_DATA_PTRS &&
3358 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3717 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3359 ret = add_data_references(rc, &key, path, &blocks); 3718 ret = add_data_references(rc, &key, path, &blocks);
3360 } else { 3719 } else {
3361 btrfs_release_path(rc->extent_root, path); 3720 btrfs_release_path(rc->extent_root, path);
3362 ret = 0; 3721 ret = 0;
3363 } 3722 }
3364 if (ret < 0) { 3723 if (ret < 0) {
3365 err = 0; 3724 err = ret;
3366 break; 3725 break;
3367 } 3726 }
3368 3727
3369 if (!RB_EMPTY_ROOT(&blocks)) { 3728 if (!RB_EMPTY_ROOT(&blocks)) {
3370 ret = relocate_tree_blocks(trans, rc, &blocks); 3729 ret = relocate_tree_blocks(trans, rc, &blocks);
3371 if (ret < 0) { 3730 if (ret < 0) {
3731 if (ret != -EAGAIN) {
3732 err = ret;
3733 break;
3734 }
3735 rc->extents_found--;
3736 rc->search_start = key.objectid;
3737 }
3738 }
3739
3740 ret = btrfs_block_rsv_check(trans, rc->extent_root,
3741 rc->block_rsv, 0, 5);
3742 if (ret < 0) {
3743 if (ret != -EAGAIN) {
3372 err = ret; 3744 err = ret;
3745 WARN_ON(1);
3373 break; 3746 break;
3374 } 3747 }
3748 rc->commit_transaction = 1;
3375 } 3749 }
3376 3750
3377 nr = trans->blocks_used; 3751 if (rc->commit_transaction) {
3378 btrfs_end_transaction(trans, rc->extent_root); 3752 rc->commit_transaction = 0;
3753 ret = btrfs_commit_transaction(trans, rc->extent_root);
3754 BUG_ON(ret);
3755 } else {
3756 nr = trans->blocks_used;
3757 btrfs_end_transaction_throttle(trans, rc->extent_root);
3758 btrfs_btree_balance_dirty(rc->extent_root, nr);
3759 }
3379 trans = NULL; 3760 trans = NULL;
3380 btrfs_btree_balance_dirty(rc->extent_root, nr);
3381 3761
3382 if (rc->stage == MOVE_DATA_EXTENTS && 3762 if (rc->stage == MOVE_DATA_EXTENTS &&
3383 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3763 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3384 rc->found_file_extent = 1; 3764 rc->found_file_extent = 1;
3385 ret = relocate_data_extent(rc->data_inode, 3765 ret = relocate_data_extent(rc->data_inode,
3386 &key, cluster); 3766 &key, &rc->cluster);
3387 if (ret < 0) { 3767 if (ret < 0) {
3388 err = ret; 3768 err = ret;
3389 break; 3769 break;
3390 } 3770 }
3391 } 3771 }
3392 } 3772 }
3393 btrfs_free_path(path); 3773
3774 btrfs_release_path(rc->extent_root, path);
3775 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3776 GFP_NOFS);
3394 3777
3395 if (trans) { 3778 if (trans) {
3396 nr = trans->blocks_used; 3779 nr = trans->blocks_used;
3397 btrfs_end_transaction(trans, rc->extent_root); 3780 btrfs_end_transaction_throttle(trans, rc->extent_root);
3398 btrfs_btree_balance_dirty(rc->extent_root, nr); 3781 btrfs_btree_balance_dirty(rc->extent_root, nr);
3399 } 3782 }
3400 3783
3401 if (!err) { 3784 if (!err) {
3402 ret = relocate_file_extent_cluster(rc->data_inode, cluster); 3785 ret = relocate_file_extent_cluster(rc->data_inode,
3786 &rc->cluster);
3403 if (ret < 0) 3787 if (ret < 0)
3404 err = ret; 3788 err = ret;
3405 } 3789 }
3406 3790
3407 kfree(cluster); 3791 rc->create_reloc_tree = 0;
3792 set_reloc_control(rc);
3408 3793
3409 rc->create_reloc_root = 0; 3794 backref_cache_cleanup(&rc->backref_cache);
3410 smp_mb(); 3795 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3411 3796
3412 if (rc->extents_found > 0) { 3797 err = prepare_to_merge(rc, err);
3413 trans = btrfs_start_transaction(rc->extent_root, 1);
3414 btrfs_commit_transaction(trans, rc->extent_root);
3415 }
3416 3798
3417 merge_reloc_roots(rc); 3799 merge_reloc_roots(rc);
3418 3800
3801 rc->merge_reloc_tree = 0;
3419 unset_reloc_control(rc); 3802 unset_reloc_control(rc);
3803 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3420 3804
3421 /* get rid of pinned extents */ 3805 /* get rid of pinned extents */
3422 trans = btrfs_start_transaction(rc->extent_root, 1); 3806 trans = btrfs_join_transaction(rc->extent_root, 1);
3423 btrfs_commit_transaction(trans, rc->extent_root); 3807 btrfs_commit_transaction(trans, rc->extent_root);
3424 3808out_free:
3809 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3810 btrfs_free_path(path);
3425 return err; 3811 return err;
3426} 3812}
3427 3813
@@ -3447,7 +3833,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3447 btrfs_set_inode_generation(leaf, item, 1); 3833 btrfs_set_inode_generation(leaf, item, 1);
3448 btrfs_set_inode_size(leaf, item, 0); 3834 btrfs_set_inode_size(leaf, item, 0);
3449 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3835 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3450 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3836 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3837 BTRFS_INODE_PREALLOC);
3451 btrfs_mark_buffer_dirty(leaf); 3838 btrfs_mark_buffer_dirty(leaf);
3452 btrfs_release_path(root, path); 3839 btrfs_release_path(root, path);
3453out: 3840out:
@@ -3459,8 +3846,9 @@ out:
3459 * helper to create inode for data relocation. 3846 * helper to create inode for data relocation.
3460 * the inode is in data relocation tree and its link count is 0 3847 * the inode is in data relocation tree and its link count is 0
3461 */ 3848 */
3462static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3849static noinline_for_stack
3463 struct btrfs_block_group_cache *group) 3850struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3851 struct btrfs_block_group_cache *group)
3464{ 3852{
3465 struct inode *inode = NULL; 3853 struct inode *inode = NULL;
3466 struct btrfs_trans_handle *trans; 3854 struct btrfs_trans_handle *trans;
@@ -3474,8 +3862,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3474 if (IS_ERR(root)) 3862 if (IS_ERR(root))
3475 return ERR_CAST(root); 3863 return ERR_CAST(root);
3476 3864
3477 trans = btrfs_start_transaction(root, 1); 3865 trans = btrfs_start_transaction(root, 6);
3478 BUG_ON(!trans); 3866 if (IS_ERR(trans))
3867 return ERR_CAST(trans);
3479 3868
3480 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3869 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3481 if (err) 3870 if (err)
@@ -3487,7 +3876,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3487 key.objectid = objectid; 3876 key.objectid = objectid;
3488 key.type = BTRFS_INODE_ITEM_KEY; 3877 key.type = BTRFS_INODE_ITEM_KEY;
3489 key.offset = 0; 3878 key.offset = 0;
3490 inode = btrfs_iget(root->fs_info->sb, &key, root); 3879 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
3491 BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); 3880 BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3492 BTRFS_I(inode)->index_cnt = group->key.objectid; 3881 BTRFS_I(inode)->index_cnt = group->key.objectid;
3493 3882
@@ -3495,7 +3884,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3495out: 3884out:
3496 nr = trans->blocks_used; 3885 nr = trans->blocks_used;
3497 btrfs_end_transaction(trans, root); 3886 btrfs_end_transaction(trans, root);
3498
3499 btrfs_btree_balance_dirty(root, nr); 3887 btrfs_btree_balance_dirty(root, nr);
3500 if (err) { 3888 if (err) {
3501 if (inode) 3889 if (inode)
@@ -3505,6 +3893,21 @@ out:
3505 return inode; 3893 return inode;
3506} 3894}
3507 3895
3896static struct reloc_control *alloc_reloc_control(void)
3897{
3898 struct reloc_control *rc;
3899
3900 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3901 if (!rc)
3902 return NULL;
3903
3904 INIT_LIST_HEAD(&rc->reloc_roots);
3905 backref_cache_init(&rc->backref_cache);
3906 mapping_tree_init(&rc->reloc_root_tree);
3907 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3908 return rc;
3909}
3910
3508/* 3911/*
3509 * function to relocate all extents in a block group. 3912 * function to relocate all extents in a block group.
3510 */ 3913 */
@@ -3512,25 +3915,49 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3512{ 3915{
3513 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3916 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3514 struct reloc_control *rc; 3917 struct reloc_control *rc;
3918 struct inode *inode;
3919 struct btrfs_path *path;
3515 int ret; 3920 int ret;
3921 int rw = 0;
3516 int err = 0; 3922 int err = 0;
3517 3923
3518 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3924 rc = alloc_reloc_control();
3519 if (!rc) 3925 if (!rc)
3520 return -ENOMEM; 3926 return -ENOMEM;
3521 3927
3522 mapping_tree_init(&rc->reloc_root_tree); 3928 rc->extent_root = extent_root;
3523 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3524 INIT_LIST_HEAD(&rc->reloc_roots);
3525 3929
3526 rc->block_group = btrfs_lookup_block_group(fs_info, group_start); 3930 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3527 BUG_ON(!rc->block_group); 3931 BUG_ON(!rc->block_group);
3528 3932
3529 btrfs_init_workers(&rc->workers, "relocate", 3933 if (!rc->block_group->ro) {
3530 fs_info->thread_pool_size, NULL); 3934 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3935 if (ret) {
3936 err = ret;
3937 goto out;
3938 }
3939 rw = 1;
3940 }
3531 3941
3532 rc->extent_root = extent_root; 3942 path = btrfs_alloc_path();
3533 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3943 if (!path) {
3944 err = -ENOMEM;
3945 goto out;
3946 }
3947
3948 inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
3949 path);
3950 btrfs_free_path(path);
3951
3952 if (!IS_ERR(inode))
3953 ret = delete_block_group_cache(fs_info, inode, 0);
3954 else
3955 ret = PTR_ERR(inode);
3956
3957 if (ret && ret != -ENOENT) {
3958 err = ret;
3959 goto out;
3960 }
3534 3961
3535 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3962 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3536 if (IS_ERR(rc->data_inode)) { 3963 if (IS_ERR(rc->data_inode)) {
@@ -3547,9 +3974,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3547 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 3974 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3548 3975
3549 while (1) { 3976 while (1) {
3550 rc->extents_found = 0;
3551 rc->extents_skipped = 0;
3552
3553 mutex_lock(&fs_info->cleaner_mutex); 3977 mutex_lock(&fs_info->cleaner_mutex);
3554 3978
3555 btrfs_clean_old_snapshots(fs_info->tree_root); 3979 btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3558,7 +3982,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3558 mutex_unlock(&fs_info->cleaner_mutex); 3982 mutex_unlock(&fs_info->cleaner_mutex);
3559 if (ret < 0) { 3983 if (ret < 0) {
3560 err = ret; 3984 err = ret;
3561 break; 3985 goto out;
3562 } 3986 }
3563 3987
3564 if (rc->extents_found == 0) 3988 if (rc->extents_found == 0)
@@ -3572,18 +3996,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3572 invalidate_mapping_pages(rc->data_inode->i_mapping, 3996 invalidate_mapping_pages(rc->data_inode->i_mapping,
3573 0, -1); 3997 0, -1);
3574 rc->stage = UPDATE_DATA_PTRS; 3998 rc->stage = UPDATE_DATA_PTRS;
3575 } else if (rc->stage == UPDATE_DATA_PTRS &&
3576 rc->extents_skipped >= rc->extents_found) {
3577 iput(rc->data_inode);
3578 rc->data_inode = create_reloc_inode(fs_info,
3579 rc->block_group);
3580 if (IS_ERR(rc->data_inode)) {
3581 err = PTR_ERR(rc->data_inode);
3582 rc->data_inode = NULL;
3583 break;
3584 }
3585 rc->stage = MOVE_DATA_EXTENTS;
3586 rc->found_file_extent = 0;
3587 } 3999 }
3588 } 4000 }
3589 4001
@@ -3596,8 +4008,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3596 WARN_ON(rc->block_group->reserved > 0); 4008 WARN_ON(rc->block_group->reserved > 0);
3597 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 4009 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3598out: 4010out:
4011 if (err && rw)
4012 btrfs_set_block_group_rw(extent_root, rc->block_group);
3599 iput(rc->data_inode); 4013 iput(rc->data_inode);
3600 btrfs_stop_workers(&rc->workers);
3601 btrfs_put_block_group(rc->block_group); 4014 btrfs_put_block_group(rc->block_group);
3602 kfree(rc); 4015 kfree(rc);
3603 return err; 4016 return err;
@@ -3608,7 +4021,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3608 struct btrfs_trans_handle *trans; 4021 struct btrfs_trans_handle *trans;
3609 int ret; 4022 int ret;
3610 4023
3611 trans = btrfs_start_transaction(root->fs_info->tree_root, 1); 4024 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
3612 4025
3613 memset(&root->root_item.drop_progress, 0, 4026 memset(&root->root_item.drop_progress, 0,
3614 sizeof(root->root_item.drop_progress)); 4027 sizeof(root->root_item.drop_progress));
@@ -3701,20 +4114,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3701 if (list_empty(&reloc_roots)) 4114 if (list_empty(&reloc_roots))
3702 goto out; 4115 goto out;
3703 4116
3704 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4117 rc = alloc_reloc_control();
3705 if (!rc) { 4118 if (!rc) {
3706 err = -ENOMEM; 4119 err = -ENOMEM;
3707 goto out; 4120 goto out;
3708 } 4121 }
3709 4122
3710 mapping_tree_init(&rc->reloc_root_tree);
3711 INIT_LIST_HEAD(&rc->reloc_roots);
3712 btrfs_init_workers(&rc->workers, "relocate",
3713 root->fs_info->thread_pool_size, NULL);
3714 rc->extent_root = root->fs_info->extent_root; 4123 rc->extent_root = root->fs_info->extent_root;
3715 4124
3716 set_reloc_control(rc); 4125 set_reloc_control(rc);
3717 4126
4127 trans = btrfs_join_transaction(rc->extent_root, 1);
4128
4129 rc->merge_reloc_tree = 1;
4130
3718 while (!list_empty(&reloc_roots)) { 4131 while (!list_empty(&reloc_roots)) {
3719 reloc_root = list_entry(reloc_roots.next, 4132 reloc_root = list_entry(reloc_roots.next,
3720 struct btrfs_root, root_list); 4133 struct btrfs_root, root_list);
@@ -3734,20 +4147,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3734 fs_root->reloc_root = reloc_root; 4147 fs_root->reloc_root = reloc_root;
3735 } 4148 }
3736 4149
3737 trans = btrfs_start_transaction(rc->extent_root, 1);
3738 btrfs_commit_transaction(trans, rc->extent_root); 4150 btrfs_commit_transaction(trans, rc->extent_root);
3739 4151
3740 merge_reloc_roots(rc); 4152 merge_reloc_roots(rc);
3741 4153
3742 unset_reloc_control(rc); 4154 unset_reloc_control(rc);
3743 4155
3744 trans = btrfs_start_transaction(rc->extent_root, 1); 4156 trans = btrfs_join_transaction(rc->extent_root, 1);
3745 btrfs_commit_transaction(trans, rc->extent_root); 4157 btrfs_commit_transaction(trans, rc->extent_root);
3746out: 4158out:
3747 if (rc) { 4159 kfree(rc);
3748 btrfs_stop_workers(&rc->workers);
3749 kfree(rc);
3750 }
3751 while (!list_empty(&reloc_roots)) { 4160 while (!list_empty(&reloc_roots)) {
3752 reloc_root = list_entry(reloc_roots.next, 4161 reloc_root = list_entry(reloc_roots.next,
3753 struct btrfs_root, root_list); 4162 struct btrfs_root, root_list);
@@ -3764,7 +4173,8 @@ out:
3764 BTRFS_DATA_RELOC_TREE_OBJECTID); 4173 BTRFS_DATA_RELOC_TREE_OBJECTID);
3765 if (IS_ERR(fs_root)) 4174 if (IS_ERR(fs_root))
3766 err = PTR_ERR(fs_root); 4175 err = PTR_ERR(fs_root);
3767 btrfs_orphan_cleanup(fs_root); 4176 else
4177 btrfs_orphan_cleanup(fs_root);
3768 } 4178 }
3769 return err; 4179 return err;
3770} 4180}
@@ -3810,5 +4220,132 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3810 btrfs_add_ordered_sum(inode, ordered, sums); 4220 btrfs_add_ordered_sum(inode, ordered, sums);
3811 } 4221 }
3812 btrfs_put_ordered_extent(ordered); 4222 btrfs_put_ordered_extent(ordered);
3813 return 0; 4223 return ret;
4224}
4225
4226void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4227 struct btrfs_root *root, struct extent_buffer *buf,
4228 struct extent_buffer *cow)
4229{
4230 struct reloc_control *rc;
4231 struct backref_node *node;
4232 int first_cow = 0;
4233 int level;
4234 int ret;
4235
4236 rc = root->fs_info->reloc_ctl;
4237 if (!rc)
4238 return;
4239
4240 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4241 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4242
4243 level = btrfs_header_level(buf);
4244 if (btrfs_header_generation(buf) <=
4245 btrfs_root_last_snapshot(&root->root_item))
4246 first_cow = 1;
4247
4248 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4249 rc->create_reloc_tree) {
4250 WARN_ON(!first_cow && level == 0);
4251
4252 node = rc->backref_cache.path[level];
4253 BUG_ON(node->bytenr != buf->start &&
4254 node->new_bytenr != buf->start);
4255
4256 drop_node_buffer(node);
4257 extent_buffer_get(cow);
4258 node->eb = cow;
4259 node->new_bytenr = cow->start;
4260
4261 if (!node->pending) {
4262 list_move_tail(&node->list,
4263 &rc->backref_cache.pending[level]);
4264 node->pending = 1;
4265 }
4266
4267 if (first_cow)
4268 __mark_block_processed(rc, node);
4269
4270 if (first_cow && level > 0)
4271 rc->nodes_relocated += buf->len;
4272 }
4273
4274 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4275 ret = replace_file_extents(trans, rc, root, cow);
4276 BUG_ON(ret);
4277 }
4278}
4279
4280/*
4281 * called before creating snapshot. it calculates metadata reservation
4282 * requried for relocating tree blocks in the snapshot
4283 */
4284void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4285 struct btrfs_pending_snapshot *pending,
4286 u64 *bytes_to_reserve)
4287{
4288 struct btrfs_root *root;
4289 struct reloc_control *rc;
4290
4291 root = pending->root;
4292 if (!root->reloc_root)
4293 return;
4294
4295 rc = root->fs_info->reloc_ctl;
4296 if (!rc->merge_reloc_tree)
4297 return;
4298
4299 root = root->reloc_root;
4300 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4301 /*
4302 * relocation is in the stage of merging trees. the space
4303 * used by merging a reloc tree is twice the size of
4304 * relocated tree nodes in the worst case. half for cowing
4305 * the reloc tree, half for cowing the fs tree. the space
4306 * used by cowing the reloc tree will be freed after the
4307 * tree is dropped. if we create snapshot, cowing the fs
4308 * tree may use more space than it frees. so we need
4309 * reserve extra space.
4310 */
4311 *bytes_to_reserve += rc->nodes_relocated;
4312}
4313
4314/*
4315 * called after snapshot is created. migrate block reservation
4316 * and create reloc root for the newly created snapshot
4317 */
4318void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4319 struct btrfs_pending_snapshot *pending)
4320{
4321 struct btrfs_root *root = pending->root;
4322 struct btrfs_root *reloc_root;
4323 struct btrfs_root *new_root;
4324 struct reloc_control *rc;
4325 int ret;
4326
4327 if (!root->reloc_root)
4328 return;
4329
4330 rc = root->fs_info->reloc_ctl;
4331 rc->merging_rsv_size += rc->nodes_relocated;
4332
4333 if (rc->merge_reloc_tree) {
4334 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4335 rc->block_rsv,
4336 rc->nodes_relocated);
4337 BUG_ON(ret);
4338 }
4339
4340 new_root = pending->snap;
4341 reloc_root = create_reloc_root(trans, root->reloc_root,
4342 new_root->root_key.objectid);
4343
4344 __add_reloc_root(reloc_root);
4345 new_root->reloc_root = reloc_root;
4346
4347 if (rc->create_reloc_tree) {
4348 ret = clone_backref_node(trans, rc, root, reloc_root);
4349 BUG_ON(ret);
4350 }
3814} 4351}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..6a1086e83ffc 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -181,7 +181,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
181int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) 181int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
182{ 182{
183 struct btrfs_root *dead_root; 183 struct btrfs_root *dead_root;
184 struct btrfs_item *item;
185 struct btrfs_root_item *ri; 184 struct btrfs_root_item *ri;
186 struct btrfs_key key; 185 struct btrfs_key key;
187 struct btrfs_key found_key; 186 struct btrfs_key found_key;
@@ -214,7 +213,6 @@ again:
214 nritems = btrfs_header_nritems(leaf); 213 nritems = btrfs_header_nritems(leaf);
215 slot = path->slots[0]; 214 slot = path->slots[0];
216 } 215 }
217 item = btrfs_item_nr(leaf, slot);
218 btrfs_item_key_to_cpu(leaf, &key, slot); 216 btrfs_item_key_to_cpu(leaf, &key, slot);
219 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) 217 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
220 goto next; 218 goto next;
@@ -259,6 +257,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259 struct extent_buffer *leaf; 257 struct extent_buffer *leaf;
260 struct btrfs_path *path; 258 struct btrfs_path *path;
261 struct btrfs_key key; 259 struct btrfs_key key;
260 struct btrfs_key root_key;
261 struct btrfs_root *root;
262 int err = 0; 262 int err = 0;
263 int ret; 263 int ret;
264 264
@@ -270,6 +270,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
270 key.type = BTRFS_ORPHAN_ITEM_KEY; 270 key.type = BTRFS_ORPHAN_ITEM_KEY;
271 key.offset = 0; 271 key.offset = 0;
272 272
273 root_key.type = BTRFS_ROOT_ITEM_KEY;
274 root_key.offset = (u64)-1;
275
273 while (1) { 276 while (1) {
274 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 277 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
275 if (ret < 0) { 278 if (ret < 0) {
@@ -294,13 +297,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
294 key.type != BTRFS_ORPHAN_ITEM_KEY) 297 key.type != BTRFS_ORPHAN_ITEM_KEY)
295 break; 298 break;
296 299
297 ret = btrfs_find_dead_roots(tree_root, key.offset); 300 root_key.objectid = key.offset;
298 if (ret) { 301 key.offset++;
302
303 root = btrfs_read_fs_root_no_name(tree_root->fs_info,
304 &root_key);
305 if (!IS_ERR(root))
306 continue;
307
308 ret = PTR_ERR(root);
309 if (ret != -ENOENT) {
299 err = ret; 310 err = ret;
300 break; 311 break;
301 } 312 }
302 313
303 key.offset++; 314 ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
315 if (ret) {
316 err = ret;
317 break;
318 }
304 } 319 }
305 320
306 btrfs_free_path(path); 321 btrfs_free_path(path);
@@ -313,7 +328,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
313{ 328{
314 struct btrfs_path *path; 329 struct btrfs_path *path;
315 int ret; 330 int ret;
316 u32 refs;
317 struct btrfs_root_item *ri; 331 struct btrfs_root_item *ri;
318 struct extent_buffer *leaf; 332 struct extent_buffer *leaf;
319 333
@@ -327,8 +341,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
327 leaf = path->nodes[0]; 341 leaf = path->nodes[0];
328 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); 342 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
329 343
330 refs = btrfs_disk_root_refs(leaf, ri);
331 BUG_ON(refs != 0);
332 ret = btrfs_del_item(trans, root, path); 344 ret = btrfs_del_item(trans, root, path);
333out: 345out:
334 btrfs_free_path(path); 346 btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8a1ea6e64575..b2130c46fdb5 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -38,6 +38,7 @@
38#include <linux/namei.h> 38#include <linux/namei.h>
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h>
41#include "compat.h" 42#include "compat.h"
42#include "ctree.h" 43#include "ctree.h"
43#include "disk-io.h" 44#include "disk-io.h"
@@ -53,6 +54,90 @@
53 54
54static const struct super_operations btrfs_super_ops; 55static const struct super_operations btrfs_super_ops;
55 56
57static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
58 char nbuf[16])
59{
60 char *errstr = NULL;
61
62 switch (errno) {
63 case -EIO:
64 errstr = "IO failure";
65 break;
66 case -ENOMEM:
67 errstr = "Out of memory";
68 break;
69 case -EROFS:
70 errstr = "Readonly filesystem";
71 break;
72 default:
73 if (nbuf) {
74 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
75 errstr = nbuf;
76 }
77 break;
78 }
79
80 return errstr;
81}
82
83static void __save_error_info(struct btrfs_fs_info *fs_info)
84{
85 /*
86 * today we only save the error info into ram. Long term we'll
87 * also send it down to the disk
88 */
89 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
90}
91
92/* NOTE:
93 * We move write_super stuff at umount in order to avoid deadlock
94 * for umount hold all lock.
95 */
96static void save_error_info(struct btrfs_fs_info *fs_info)
97{
98 __save_error_info(fs_info);
99}
100
101/* btrfs handle error by forcing the filesystem readonly */
102static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
103{
104 struct super_block *sb = fs_info->sb;
105
106 if (sb->s_flags & MS_RDONLY)
107 return;
108
109 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
110 sb->s_flags |= MS_RDONLY;
111 printk(KERN_INFO "btrfs is forced readonly\n");
112 }
113}
114
115/*
116 * __btrfs_std_error decodes expected errors from the caller and
117 * invokes the approciate error response.
118 */
119void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
120 unsigned int line, int errno)
121{
122 struct super_block *sb = fs_info->sb;
123 char nbuf[16];
124 const char *errstr;
125
126 /*
127 * Special case: if the error is EROFS, and we're already
128 * under MS_RDONLY, then it is safe here.
129 */
130 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
131 return;
132
133 errstr = btrfs_decode_error(fs_info, errno, nbuf);
134 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
135 sb->s_id, function, line, errstr);
136 save_error_info(fs_info);
137
138 btrfs_handle_error(fs_info);
139}
140
56static void btrfs_put_super(struct super_block *sb) 141static void btrfs_put_super(struct super_block *sb)
57{ 142{
58 struct btrfs_root *root = btrfs_sb(sb); 143 struct btrfs_root *root = btrfs_sb(sb);
@@ -60,30 +145,34 @@ static void btrfs_put_super(struct super_block *sb)
60 145
61 ret = close_ctree(root); 146 ret = close_ctree(root);
62 sb->s_fs_info = NULL; 147 sb->s_fs_info = NULL;
148
149 (void)ret; /* FIXME: need to fix VFS to return error? */
63} 150}
64 151
65enum { 152enum {
66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 153 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 154 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, 155 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
69 Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, 156 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
70 Opt_flushoncommit, 157 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
71 Opt_discard, Opt_err, 158 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, Opt_err,
72}; 159};
73 160
74static match_table_t tokens = { 161static match_table_t tokens = {
75 {Opt_degraded, "degraded"}, 162 {Opt_degraded, "degraded"},
76 {Opt_subvol, "subvol=%s"}, 163 {Opt_subvol, "subvol=%s"},
164 {Opt_subvolid, "subvolid=%d"},
77 {Opt_device, "device=%s"}, 165 {Opt_device, "device=%s"},
78 {Opt_nodatasum, "nodatasum"}, 166 {Opt_nodatasum, "nodatasum"},
79 {Opt_nodatacow, "nodatacow"}, 167 {Opt_nodatacow, "nodatacow"},
80 {Opt_nobarrier, "nobarrier"}, 168 {Opt_nobarrier, "nobarrier"},
81 {Opt_max_extent, "max_extent=%s"},
82 {Opt_max_inline, "max_inline=%s"}, 169 {Opt_max_inline, "max_inline=%s"},
83 {Opt_alloc_start, "alloc_start=%s"}, 170 {Opt_alloc_start, "alloc_start=%s"},
84 {Opt_thread_pool, "thread_pool=%d"}, 171 {Opt_thread_pool, "thread_pool=%d"},
85 {Opt_compress, "compress"}, 172 {Opt_compress, "compress"},
173 {Opt_compress_type, "compress=%s"},
86 {Opt_compress_force, "compress-force"}, 174 {Opt_compress_force, "compress-force"},
175 {Opt_compress_force_type, "compress-force=%s"},
87 {Opt_ssd, "ssd"}, 176 {Opt_ssd, "ssd"},
88 {Opt_ssd_spread, "ssd_spread"}, 177 {Opt_ssd_spread, "ssd_spread"},
89 {Opt_nossd, "nossd"}, 178 {Opt_nossd, "nossd"},
@@ -92,34 +181,12 @@ static match_table_t tokens = {
92 {Opt_flushoncommit, "flushoncommit"}, 181 {Opt_flushoncommit, "flushoncommit"},
93 {Opt_ratio, "metadata_ratio=%d"}, 182 {Opt_ratio, "metadata_ratio=%d"},
94 {Opt_discard, "discard"}, 183 {Opt_discard, "discard"},
184 {Opt_space_cache, "space_cache"},
185 {Opt_clear_cache, "clear_cache"},
186 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
95 {Opt_err, NULL}, 187 {Opt_err, NULL},
96}; 188};
97 189
98u64 btrfs_parse_size(char *str)
99{
100 u64 res;
101 int mult = 1;
102 char *end;
103 char last;
104
105 res = simple_strtoul(str, &end, 10);
106
107 last = end[0];
108 if (isalpha(last)) {
109 last = tolower(last);
110 switch (last) {
111 case 'g':
112 mult *= 1024;
113 case 'm':
114 mult *= 1024;
115 case 'k':
116 mult *= 1024;
117 }
118 res = res * mult;
119 }
120 return res;
121}
122
123/* 190/*
124 * Regular mount options parser. Everything that is needed only when 191 * Regular mount options parser. Everything that is needed only when
125 * reading in a new superblock is parsed here. 192 * reading in a new superblock is parsed here.
@@ -128,9 +195,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
128{ 195{
129 struct btrfs_fs_info *info = root->fs_info; 196 struct btrfs_fs_info *info = root->fs_info;
130 substring_t args[MAX_OPT_ARGS]; 197 substring_t args[MAX_OPT_ARGS];
131 char *p, *num; 198 char *p, *num, *orig;
132 int intarg; 199 int intarg;
133 int ret = 0; 200 int ret = 0;
201 char *compress_type;
202 bool compress_force = false;
134 203
135 if (!options) 204 if (!options)
136 return 0; 205 return 0;
@@ -143,6 +212,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
143 if (!options) 212 if (!options)
144 return -ENOMEM; 213 return -ENOMEM;
145 214
215 orig = options;
146 216
147 while ((p = strsep(&options, ",")) != NULL) { 217 while ((p = strsep(&options, ",")) != NULL) {
148 int token; 218 int token;
@@ -156,6 +226,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
156 btrfs_set_opt(info->mount_opt, DEGRADED); 226 btrfs_set_opt(info->mount_opt, DEGRADED);
157 break; 227 break;
158 case Opt_subvol: 228 case Opt_subvol:
229 case Opt_subvolid:
159 case Opt_device: 230 case Opt_device:
160 /* 231 /*
161 * These are parsed by btrfs_parse_early_options 232 * These are parsed by btrfs_parse_early_options
@@ -171,14 +242,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
171 btrfs_set_opt(info->mount_opt, NODATACOW); 242 btrfs_set_opt(info->mount_opt, NODATACOW);
172 btrfs_set_opt(info->mount_opt, NODATASUM); 243 btrfs_set_opt(info->mount_opt, NODATASUM);
173 break; 244 break;
174 case Opt_compress:
175 printk(KERN_INFO "btrfs: use compression\n");
176 btrfs_set_opt(info->mount_opt, COMPRESS);
177 break;
178 case Opt_compress_force: 245 case Opt_compress_force:
179 printk(KERN_INFO "btrfs: forcing compression\n"); 246 case Opt_compress_force_type:
180 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 247 compress_force = true;
248 case Opt_compress:
249 case Opt_compress_type:
250 if (token == Opt_compress ||
251 token == Opt_compress_force ||
252 strcmp(args[0].from, "zlib") == 0) {
253 compress_type = "zlib";
254 info->compress_type = BTRFS_COMPRESS_ZLIB;
255 } else if (strcmp(args[0].from, "lzo") == 0) {
256 compress_type = "lzo";
257 info->compress_type = BTRFS_COMPRESS_LZO;
258 } else {
259 ret = -EINVAL;
260 goto out;
261 }
262
181 btrfs_set_opt(info->mount_opt, COMPRESS); 263 btrfs_set_opt(info->mount_opt, COMPRESS);
264 if (compress_force) {
265 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
266 pr_info("btrfs: force %s compression\n",
267 compress_type);
268 } else
269 pr_info("btrfs: use %s compression\n",
270 compress_type);
182 break; 271 break;
183 case Opt_ssd: 272 case Opt_ssd:
184 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 273 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
@@ -210,22 +299,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
210 info->thread_pool_size); 299 info->thread_pool_size);
211 } 300 }
212 break; 301 break;
213 case Opt_max_extent:
214 num = match_strdup(&args[0]);
215 if (num) {
216 info->max_extent = btrfs_parse_size(num);
217 kfree(num);
218
219 info->max_extent = max_t(u64,
220 info->max_extent, root->sectorsize);
221 printk(KERN_INFO "btrfs: max_extent at %llu\n",
222 (unsigned long long)info->max_extent);
223 }
224 break;
225 case Opt_max_inline: 302 case Opt_max_inline:
226 num = match_strdup(&args[0]); 303 num = match_strdup(&args[0]);
227 if (num) { 304 if (num) {
228 info->max_inline = btrfs_parse_size(num); 305 info->max_inline = memparse(num, NULL);
229 kfree(num); 306 kfree(num);
230 307
231 if (info->max_inline) { 308 if (info->max_inline) {
@@ -240,7 +317,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
240 case Opt_alloc_start: 317 case Opt_alloc_start:
241 num = match_strdup(&args[0]); 318 num = match_strdup(&args[0]);
242 if (num) { 319 if (num) {
243 info->alloc_start = btrfs_parse_size(num); 320 info->alloc_start = memparse(num, NULL);
244 kfree(num); 321 kfree(num);
245 printk(KERN_INFO 322 printk(KERN_INFO
246 "btrfs: allocations start at %llu\n", 323 "btrfs: allocations start at %llu\n",
@@ -270,6 +347,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
270 case Opt_discard: 347 case Opt_discard:
271 btrfs_set_opt(info->mount_opt, DISCARD); 348 btrfs_set_opt(info->mount_opt, DISCARD);
272 break; 349 break;
350 case Opt_space_cache:
351 printk(KERN_INFO "btrfs: enabling disk space caching\n");
352 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
353 break;
354 case Opt_clear_cache:
355 printk(KERN_INFO "btrfs: force clearing of disk cache\n");
356 btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
357 break;
358 case Opt_user_subvol_rm_allowed:
359 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
360 break;
273 case Opt_err: 361 case Opt_err:
274 printk(KERN_INFO "btrfs: unrecognized mount option " 362 printk(KERN_INFO "btrfs: unrecognized mount option "
275 "'%s'\n", p); 363 "'%s'\n", p);
@@ -280,7 +368,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
280 } 368 }
281 } 369 }
282out: 370out:
283 kfree(options); 371 kfree(orig);
284 return ret; 372 return ret;
285} 373}
286 374
@@ -291,12 +379,13 @@ out:
291 * only when we need to allocate a new super block. 379 * only when we need to allocate a new super block.
292 */ 380 */
293static int btrfs_parse_early_options(const char *options, fmode_t flags, 381static int btrfs_parse_early_options(const char *options, fmode_t flags,
294 void *holder, char **subvol_name, 382 void *holder, char **subvol_name, u64 *subvol_objectid,
295 struct btrfs_fs_devices **fs_devices) 383 struct btrfs_fs_devices **fs_devices)
296{ 384{
297 substring_t args[MAX_OPT_ARGS]; 385 substring_t args[MAX_OPT_ARGS];
298 char *opts, *p; 386 char *opts, *p;
299 int error = 0; 387 int error = 0;
388 int intarg;
300 389
301 if (!options) 390 if (!options)
302 goto out; 391 goto out;
@@ -319,6 +408,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
319 case Opt_subvol: 408 case Opt_subvol:
320 *subvol_name = match_strdup(&args[0]); 409 *subvol_name = match_strdup(&args[0]);
321 break; 410 break;
411 case Opt_subvolid:
412 intarg = 0;
413 error = match_int(&args[0], &intarg);
414 if (!error) {
415 /* we want the original fs_tree */
416 if (!intarg)
417 *subvol_objectid =
418 BTRFS_FS_TREE_OBJECTID;
419 else
420 *subvol_objectid = intarg;
421 }
422 break;
322 case Opt_device: 423 case Opt_device:
323 error = btrfs_scan_one_device(match_strdup(&args[0]), 424 error = btrfs_scan_one_device(match_strdup(&args[0]),
324 flags, holder, fs_devices); 425 flags, holder, fs_devices);
@@ -346,13 +447,118 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
346 return error; 447 return error;
347} 448}
348 449
450static struct dentry *get_default_root(struct super_block *sb,
451 u64 subvol_objectid)
452{
453 struct btrfs_root *root = sb->s_fs_info;
454 struct btrfs_root *new_root;
455 struct btrfs_dir_item *di;
456 struct btrfs_path *path;
457 struct btrfs_key location;
458 struct inode *inode;
459 struct dentry *dentry;
460 u64 dir_id;
461 int new = 0;
462
463 /*
464 * We have a specific subvol we want to mount, just setup location and
465 * go look up the root.
466 */
467 if (subvol_objectid) {
468 location.objectid = subvol_objectid;
469 location.type = BTRFS_ROOT_ITEM_KEY;
470 location.offset = (u64)-1;
471 goto find_root;
472 }
473
474 path = btrfs_alloc_path();
475 if (!path)
476 return ERR_PTR(-ENOMEM);
477 path->leave_spinning = 1;
478
479 /*
480 * Find the "default" dir item which points to the root item that we
481 * will mount by default if we haven't been given a specific subvolume
482 * to mount.
483 */
484 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
485 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
486 if (IS_ERR(di))
487 return ERR_CAST(di);
488 if (!di) {
489 /*
490 * Ok the default dir item isn't there. This is weird since
491 * it's always been there, but don't freak out, just try and
492 * mount to root most subvolume.
493 */
494 btrfs_free_path(path);
495 dir_id = BTRFS_FIRST_FREE_OBJECTID;
496 new_root = root->fs_info->fs_root;
497 goto setup_root;
498 }
499
500 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
501 btrfs_free_path(path);
502
503find_root:
504 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
505 if (IS_ERR(new_root))
506 return ERR_CAST(new_root);
507
508 if (btrfs_root_refs(&new_root->root_item) == 0)
509 return ERR_PTR(-ENOENT);
510
511 dir_id = btrfs_root_dirid(&new_root->root_item);
512setup_root:
513 location.objectid = dir_id;
514 location.type = BTRFS_INODE_ITEM_KEY;
515 location.offset = 0;
516
517 inode = btrfs_iget(sb, &location, new_root, &new);
518 if (IS_ERR(inode))
519 return ERR_CAST(inode);
520
521 /*
522 * If we're just mounting the root most subvol put the inode and return
523 * a reference to the dentry. We will have already gotten a reference
524 * to the inode in btrfs_fill_super so we're good to go.
525 */
526 if (!new && sb->s_root->d_inode == inode) {
527 iput(inode);
528 return dget(sb->s_root);
529 }
530
531 if (new) {
532 const struct qstr name = { .name = "/", .len = 1 };
533
534 /*
535 * New inode, we need to make the dentry a sibling of s_root so
536 * everything gets cleaned up properly on unmount.
537 */
538 dentry = d_alloc(sb->s_root, &name);
539 if (!dentry) {
540 iput(inode);
541 return ERR_PTR(-ENOMEM);
542 }
543 d_splice_alias(inode, dentry);
544 } else {
545 /*
546 * We found the inode in cache, just find a dentry for it and
547 * put the reference to the inode we just got.
548 */
549 dentry = d_find_alias(inode);
550 iput(inode);
551 }
552
553 return dentry;
554}
555
349static int btrfs_fill_super(struct super_block *sb, 556static int btrfs_fill_super(struct super_block *sb,
350 struct btrfs_fs_devices *fs_devices, 557 struct btrfs_fs_devices *fs_devices,
351 void *data, int silent) 558 void *data, int silent)
352{ 559{
353 struct inode *inode; 560 struct inode *inode;
354 struct dentry *root_dentry; 561 struct dentry *root_dentry;
355 struct btrfs_super_block *disk_super;
356 struct btrfs_root *tree_root; 562 struct btrfs_root *tree_root;
357 struct btrfs_key key; 563 struct btrfs_key key;
358 int err; 564 int err;
@@ -360,6 +566,7 @@ static int btrfs_fill_super(struct super_block *sb,
360 sb->s_maxbytes = MAX_LFS_FILESIZE; 566 sb->s_maxbytes = MAX_LFS_FILESIZE;
361 sb->s_magic = BTRFS_SUPER_MAGIC; 567 sb->s_magic = BTRFS_SUPER_MAGIC;
362 sb->s_op = &btrfs_super_ops; 568 sb->s_op = &btrfs_super_ops;
569 sb->s_d_op = &btrfs_dentry_operations;
363 sb->s_export_op = &btrfs_export_ops; 570 sb->s_export_op = &btrfs_export_ops;
364 sb->s_xattr = btrfs_xattr_handlers; 571 sb->s_xattr = btrfs_xattr_handlers;
365 sb->s_time_gran = 1; 572 sb->s_time_gran = 1;
@@ -374,12 +581,11 @@ static int btrfs_fill_super(struct super_block *sb,
374 return PTR_ERR(tree_root); 581 return PTR_ERR(tree_root);
375 } 582 }
376 sb->s_fs_info = tree_root; 583 sb->s_fs_info = tree_root;
377 disk_super = &tree_root->fs_info->super_copy;
378 584
379 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 585 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
380 key.type = BTRFS_INODE_ITEM_KEY; 586 key.type = BTRFS_INODE_ITEM_KEY;
381 key.offset = 0; 587 key.offset = 0;
382 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); 588 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
383 if (IS_ERR(inode)) { 589 if (IS_ERR(inode)) {
384 err = PTR_ERR(inode); 590 err = PTR_ERR(inode);
385 goto fail_close; 591 goto fail_close;
@@ -391,12 +597,6 @@ static int btrfs_fill_super(struct super_block *sb,
391 err = -ENOMEM; 597 err = -ENOMEM;
392 goto fail_close; 598 goto fail_close;
393 } 599 }
394#if 0
395 /* this does the super kobj at the same time */
396 err = btrfs_sysfs_add_super(tree_root->fs_info);
397 if (err)
398 goto fail_close;
399#endif
400 600
401 sb->s_root = root_dentry; 601 sb->s_root = root_dentry;
402 602
@@ -422,7 +622,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
422 btrfs_start_delalloc_inodes(root, 0); 622 btrfs_start_delalloc_inodes(root, 0);
423 btrfs_wait_ordered_extents(root, 0, 0); 623 btrfs_wait_ordered_extents(root, 0, 0);
424 624
425 trans = btrfs_start_transaction(root, 1); 625 trans = btrfs_start_transaction(root, 0);
426 ret = btrfs_commit_transaction(trans, root); 626 ret = btrfs_commit_transaction(trans, root);
427 return ret; 627 return ret;
428} 628}
@@ -440,9 +640,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
440 seq_puts(seq, ",nodatacow"); 640 seq_puts(seq, ",nodatacow");
441 if (btrfs_test_opt(root, NOBARRIER)) 641 if (btrfs_test_opt(root, NOBARRIER))
442 seq_puts(seq, ",nobarrier"); 642 seq_puts(seq, ",nobarrier");
443 if (info->max_extent != (u64)-1)
444 seq_printf(seq, ",max_extent=%llu",
445 (unsigned long long)info->max_extent);
446 if (info->max_inline != 8192 * 1024) 643 if (info->max_inline != 8192 * 1024)
447 seq_printf(seq, ",max_inline=%llu", 644 seq_printf(seq, ",max_inline=%llu",
448 (unsigned long long)info->max_inline); 645 (unsigned long long)info->max_inline);
@@ -473,36 +670,54 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
473 670
474static int btrfs_test_super(struct super_block *s, void *data) 671static int btrfs_test_super(struct super_block *s, void *data)
475{ 672{
476 struct btrfs_fs_devices *test_fs_devices = data; 673 struct btrfs_root *test_root = data;
477 struct btrfs_root *root = btrfs_sb(s); 674 struct btrfs_root *root = btrfs_sb(s);
478 675
479 return root->fs_info->fs_devices == test_fs_devices; 676 /*
677 * If this super block is going away, return false as it
678 * can't match as an existing super block.
679 */
680 if (!atomic_read(&s->s_active))
681 return 0;
682 return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
683}
684
685static int btrfs_set_super(struct super_block *s, void *data)
686{
687 s->s_fs_info = data;
688
689 return set_anon_super(s, data);
480} 690}
481 691
692
482/* 693/*
483 * Find a superblock for the given device / mount point. 694 * Find a superblock for the given device / mount point.
484 * 695 *
485 * Note: This is based on get_sb_bdev from fs/super.c with a few additions 696 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
486 * for multiple device setup. Make sure to keep it in sync. 697 * for multiple device setup. Make sure to keep it in sync.
487 */ 698 */
488static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 699static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
489 const char *dev_name, void *data, struct vfsmount *mnt) 700 const char *dev_name, void *data)
490{ 701{
491 char *subvol_name = NULL;
492 struct block_device *bdev = NULL; 702 struct block_device *bdev = NULL;
493 struct super_block *s; 703 struct super_block *s;
494 struct dentry *root; 704 struct dentry *root;
495 struct btrfs_fs_devices *fs_devices = NULL; 705 struct btrfs_fs_devices *fs_devices = NULL;
706 struct btrfs_root *tree_root = NULL;
707 struct btrfs_fs_info *fs_info = NULL;
496 fmode_t mode = FMODE_READ; 708 fmode_t mode = FMODE_READ;
709 char *subvol_name = NULL;
710 u64 subvol_objectid = 0;
497 int error = 0; 711 int error = 0;
498 712
499 if (!(flags & MS_RDONLY)) 713 if (!(flags & MS_RDONLY))
500 mode |= FMODE_WRITE; 714 mode |= FMODE_WRITE;
501 715
502 error = btrfs_parse_early_options(data, mode, fs_type, 716 error = btrfs_parse_early_options(data, mode, fs_type,
503 &subvol_name, &fs_devices); 717 &subvol_name, &subvol_objectid,
718 &fs_devices);
504 if (error) 719 if (error)
505 return error; 720 return ERR_PTR(error);
506 721
507 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); 722 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices);
508 if (error) 723 if (error)
@@ -517,8 +732,24 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
517 goto error_close_devices; 732 goto error_close_devices;
518 } 733 }
519 734
735 /*
736 * Setup a dummy root and fs_info for test/set super. This is because
737 * we don't actually fill this stuff out until open_ctree, but we need
738 * it for searching for existing supers, so this lets us do that and
739 * then open_ctree will properly initialize everything later.
740 */
741 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
742 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
743 if (!fs_info || !tree_root) {
744 error = -ENOMEM;
745 goto error_close_devices;
746 }
747 fs_info->tree_root = tree_root;
748 fs_info->fs_devices = fs_devices;
749 tree_root->fs_info = fs_info;
750
520 bdev = fs_devices->latest_bdev; 751 bdev = fs_devices->latest_bdev;
521 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); 752 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
522 if (IS_ERR(s)) 753 if (IS_ERR(s))
523 goto error_s; 754 goto error_s;
524 755
@@ -546,40 +777,49 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
546 s->s_flags |= MS_ACTIVE; 777 s->s_flags |= MS_ACTIVE;
547 } 778 }
548 779
549 if (!strcmp(subvol_name, ".")) 780 root = get_default_root(s, subvol_objectid);
550 root = dget(s->s_root); 781 if (IS_ERR(root)) {
551 else { 782 error = PTR_ERR(root);
552 mutex_lock(&s->s_root->d_inode->i_mutex); 783 deactivate_locked_super(s);
553 root = lookup_one_len(subvol_name, s->s_root, 784 goto error_free_subvol_name;
785 }
786 /* if they gave us a subvolume name bind mount into that */
787 if (strcmp(subvol_name, ".")) {
788 struct dentry *new_root;
789 mutex_lock(&root->d_inode->i_mutex);
790 new_root = lookup_one_len(subvol_name, root,
554 strlen(subvol_name)); 791 strlen(subvol_name));
555 mutex_unlock(&s->s_root->d_inode->i_mutex); 792 mutex_unlock(&root->d_inode->i_mutex);
556 793
557 if (IS_ERR(root)) { 794 if (IS_ERR(new_root)) {
795 dput(root);
558 deactivate_locked_super(s); 796 deactivate_locked_super(s);
559 error = PTR_ERR(root); 797 error = PTR_ERR(new_root);
560 goto error_free_subvol_name; 798 goto error_free_subvol_name;
561 } 799 }
562 if (!root->d_inode) { 800 if (!new_root->d_inode) {
563 dput(root); 801 dput(root);
802 dput(new_root);
564 deactivate_locked_super(s); 803 deactivate_locked_super(s);
565 error = -ENXIO; 804 error = -ENXIO;
566 goto error_free_subvol_name; 805 goto error_free_subvol_name;
567 } 806 }
807 dput(root);
808 root = new_root;
568 } 809 }
569 810
570 mnt->mnt_sb = s;
571 mnt->mnt_root = root;
572
573 kfree(subvol_name); 811 kfree(subvol_name);
574 return 0; 812 return root;
575 813
576error_s: 814error_s:
577 error = PTR_ERR(s); 815 error = PTR_ERR(s);
578error_close_devices: 816error_close_devices:
579 btrfs_close_devices(fs_devices); 817 btrfs_close_devices(fs_devices);
818 kfree(fs_info);
819 kfree(tree_root);
580error_free_subvol_name: 820error_free_subvol_name:
581 kfree(subvol_name); 821 kfree(subvol_name);
582 return error; 822 return ERR_PTR(error);
583} 823}
584 824
585static int btrfs_remount(struct super_block *sb, int *flags, char *data) 825static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -606,11 +846,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
606 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 846 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
607 return -EINVAL; 847 return -EINVAL;
608 848
609 /* recover relocation */ 849 ret = btrfs_cleanup_fs_roots(root->fs_info);
610 ret = btrfs_recover_relocation(root);
611 WARN_ON(ret); 850 WARN_ON(ret);
612 851
613 ret = btrfs_cleanup_fs_roots(root->fs_info); 852 /* recover relocation */
853 ret = btrfs_recover_relocation(root);
614 WARN_ON(ret); 854 WARN_ON(ret);
615 855
616 sb->s_flags &= ~MS_RDONLY; 856 sb->s_flags &= ~MS_RDONLY;
@@ -619,20 +859,167 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
619 return 0; 859 return 0;
620} 860}
621 861
862/*
863 * The helper to calc the free space on the devices that can be used to store
864 * file data.
865 */
866static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
867{
868 struct btrfs_fs_info *fs_info = root->fs_info;
869 struct btrfs_device_info *devices_info;
870 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
871 struct btrfs_device *device;
872 u64 skip_space;
873 u64 type;
874 u64 avail_space;
875 u64 used_space;
876 u64 min_stripe_size;
877 int min_stripes = 1;
878 int i = 0, nr_devices;
879 int ret;
880
881 nr_devices = fs_info->fs_devices->rw_devices;
882 BUG_ON(!nr_devices);
883
884 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
885 GFP_NOFS);
886 if (!devices_info)
887 return -ENOMEM;
888
889 /* calc min stripe number for data space alloction */
890 type = btrfs_get_alloc_profile(root, 1);
891 if (type & BTRFS_BLOCK_GROUP_RAID0)
892 min_stripes = 2;
893 else if (type & BTRFS_BLOCK_GROUP_RAID1)
894 min_stripes = 2;
895 else if (type & BTRFS_BLOCK_GROUP_RAID10)
896 min_stripes = 4;
897
898 if (type & BTRFS_BLOCK_GROUP_DUP)
899 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
900 else
901 min_stripe_size = BTRFS_STRIPE_LEN;
902
903 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
904 if (!device->in_fs_metadata)
905 continue;
906
907 avail_space = device->total_bytes - device->bytes_used;
908
909 /* align with stripe_len */
910 do_div(avail_space, BTRFS_STRIPE_LEN);
911 avail_space *= BTRFS_STRIPE_LEN;
912
913 /*
914 * In order to avoid overwritting the superblock on the drive,
915 * btrfs starts at an offset of at least 1MB when doing chunk
916 * allocation.
917 */
918 skip_space = 1024 * 1024;
919
920 /* user can set the offset in fs_info->alloc_start. */
921 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
922 device->total_bytes)
923 skip_space = max(fs_info->alloc_start, skip_space);
924
925 /*
926 * btrfs can not use the free space in [0, skip_space - 1],
927 * we must subtract it from the total. In order to implement
928 * it, we account the used space in this range first.
929 */
930 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
931 &used_space);
932 if (ret) {
933 kfree(devices_info);
934 return ret;
935 }
936
937 /* calc the free space in [0, skip_space - 1] */
938 skip_space -= used_space;
939
940 /*
941 * we can use the free space in [0, skip_space - 1], subtract
942 * it from the total.
943 */
944 if (avail_space && avail_space >= skip_space)
945 avail_space -= skip_space;
946 else
947 avail_space = 0;
948
949 if (avail_space < min_stripe_size)
950 continue;
951
952 devices_info[i].dev = device;
953 devices_info[i].max_avail = avail_space;
954
955 i++;
956 }
957
958 nr_devices = i;
959
960 btrfs_descending_sort_devices(devices_info, nr_devices);
961
962 i = nr_devices - 1;
963 avail_space = 0;
964 while (nr_devices >= min_stripes) {
965 if (devices_info[i].max_avail >= min_stripe_size) {
966 int j;
967 u64 alloc_size;
968
969 avail_space += devices_info[i].max_avail * min_stripes;
970 alloc_size = devices_info[i].max_avail;
971 for (j = i + 1 - min_stripes; j <= i; j++)
972 devices_info[j].max_avail -= alloc_size;
973 }
974 i--;
975 nr_devices--;
976 }
977
978 kfree(devices_info);
979 *free_bytes = avail_space;
980 return 0;
981}
982
622static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 983static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
623{ 984{
624 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 985 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
625 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 986 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
987 struct list_head *head = &root->fs_info->space_info;
988 struct btrfs_space_info *found;
989 u64 total_used = 0;
990 u64 total_free_data = 0;
626 int bits = dentry->d_sb->s_blocksize_bits; 991 int bits = dentry->d_sb->s_blocksize_bits;
627 __be32 *fsid = (__be32 *)root->fs_info->fsid; 992 __be32 *fsid = (__be32 *)root->fs_info->fsid;
993 int ret;
994
995 /* holding chunk_muext to avoid allocating new chunks */
996 mutex_lock(&root->fs_info->chunk_mutex);
997 rcu_read_lock();
998 list_for_each_entry_rcu(found, head, list) {
999 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
1000 total_free_data += found->disk_total - found->disk_used;
1001 total_free_data -=
1002 btrfs_account_ro_block_groups_free_space(found);
1003 }
1004
1005 total_used += found->disk_used;
1006 }
1007 rcu_read_unlock();
628 1008
629 buf->f_namelen = BTRFS_NAME_LEN; 1009 buf->f_namelen = BTRFS_NAME_LEN;
630 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1010 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
631 buf->f_bfree = buf->f_blocks - 1011 buf->f_bfree = buf->f_blocks - (total_used >> bits);
632 (btrfs_super_bytes_used(disk_super) >> bits);
633 buf->f_bavail = buf->f_bfree;
634 buf->f_bsize = dentry->d_sb->s_blocksize; 1012 buf->f_bsize = dentry->d_sb->s_blocksize;
635 buf->f_type = BTRFS_SUPER_MAGIC; 1013 buf->f_type = BTRFS_SUPER_MAGIC;
1014 buf->f_bavail = total_free_data;
1015 ret = btrfs_calc_avail_data_space(root, &total_free_data);
1016 if (ret) {
1017 mutex_unlock(&root->fs_info->chunk_mutex);
1018 return ret;
1019 }
1020 buf->f_bavail += total_free_data;
1021 buf->f_bavail = buf->f_bavail >> bits;
1022 mutex_unlock(&root->fs_info->chunk_mutex);
636 1023
637 /* We treat it as constant endianness (it doesn't matter _which_) 1024 /* We treat it as constant endianness (it doesn't matter _which_)
638 because we want the fsid to come out the same whether mounted 1025 because we want the fsid to come out the same whether mounted
@@ -649,7 +1036,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
649static struct file_system_type btrfs_fs_type = { 1036static struct file_system_type btrfs_fs_type = {
650 .owner = THIS_MODULE, 1037 .owner = THIS_MODULE,
651 .name = "btrfs", 1038 .name = "btrfs",
652 .get_sb = btrfs_get_sb, 1039 .mount = btrfs_mount,
653 .kill_sb = kill_anon_super, 1040 .kill_sb = kill_anon_super,
654 .fs_flags = FS_REQUIRES_DEV, 1041 .fs_flags = FS_REQUIRES_DEV,
655}; 1042};
@@ -700,7 +1087,7 @@ static int btrfs_unfreeze(struct super_block *sb)
700 1087
701static const struct super_operations btrfs_super_ops = { 1088static const struct super_operations btrfs_super_ops = {
702 .drop_inode = btrfs_drop_inode, 1089 .drop_inode = btrfs_drop_inode,
703 .delete_inode = btrfs_delete_inode, 1090 .evict_inode = btrfs_evict_inode,
704 .put_super = btrfs_put_super, 1091 .put_super = btrfs_put_super,
705 .sync_fs = btrfs_sync_fs, 1092 .sync_fs = btrfs_sync_fs,
706 .show_options = btrfs_show_options, 1093 .show_options = btrfs_show_options,
@@ -718,14 +1105,18 @@ static const struct file_operations btrfs_ctl_fops = {
718 .unlocked_ioctl = btrfs_control_ioctl, 1105 .unlocked_ioctl = btrfs_control_ioctl,
719 .compat_ioctl = btrfs_control_ioctl, 1106 .compat_ioctl = btrfs_control_ioctl,
720 .owner = THIS_MODULE, 1107 .owner = THIS_MODULE,
1108 .llseek = noop_llseek,
721}; 1109};
722 1110
723static struct miscdevice btrfs_misc = { 1111static struct miscdevice btrfs_misc = {
724 .minor = MISC_DYNAMIC_MINOR, 1112 .minor = BTRFS_MINOR,
725 .name = "btrfs-control", 1113 .name = "btrfs-control",
726 .fops = &btrfs_ctl_fops 1114 .fops = &btrfs_ctl_fops
727}; 1115};
728 1116
1117MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
1118MODULE_ALIAS("devname:btrfs-control");
1119
729static int btrfs_interface_init(void) 1120static int btrfs_interface_init(void)
730{ 1121{
731 return misc_register(&btrfs_misc); 1122 return misc_register(&btrfs_misc);
@@ -745,10 +1136,14 @@ static int __init init_btrfs_fs(void)
745 if (err) 1136 if (err)
746 return err; 1137 return err;
747 1138
748 err = btrfs_init_cachep(); 1139 err = btrfs_init_compress();
749 if (err) 1140 if (err)
750 goto free_sysfs; 1141 goto free_sysfs;
751 1142
1143 err = btrfs_init_cachep();
1144 if (err)
1145 goto free_compress;
1146
752 err = extent_io_init(); 1147 err = extent_io_init();
753 if (err) 1148 if (err)
754 goto free_cachep; 1149 goto free_cachep;
@@ -776,6 +1171,8 @@ free_extent_io:
776 extent_io_exit(); 1171 extent_io_exit();
777free_cachep: 1172free_cachep:
778 btrfs_destroy_cachep(); 1173 btrfs_destroy_cachep();
1174free_compress:
1175 btrfs_exit_compress();
779free_sysfs: 1176free_sysfs:
780 btrfs_exit_sysfs(); 1177 btrfs_exit_sysfs();
781 return err; 1178 return err;
@@ -790,7 +1187,7 @@ static void __exit exit_btrfs_fs(void)
790 unregister_filesystem(&btrfs_fs_type); 1187 unregister_filesystem(&btrfs_fs_type);
791 btrfs_exit_sysfs(); 1188 btrfs_exit_sysfs();
792 btrfs_cleanup_fs_uuids(); 1189 btrfs_cleanup_fs_uuids();
793 btrfs_zlib_exit(); 1190 btrfs_exit_compress();
794} 1191}
795 1192
796module_init(init_btrfs_fs) 1193module_init(init_btrfs_fs)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index a240b6fa81df..4ce16ef702a3 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -164,12 +164,12 @@ static void btrfs_root_release(struct kobject *kobj)
164 complete(&root->kobj_unregister); 164 complete(&root->kobj_unregister);
165} 165}
166 166
167static struct sysfs_ops btrfs_super_attr_ops = { 167static const struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show, 168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store, 169 .store = btrfs_super_attr_store,
170}; 170};
171 171
172static struct sysfs_ops btrfs_root_attr_ops = { 172static const struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show, 173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store, 174 .store = btrfs_root_attr_store,
175}; 175};
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index b2acc79f1b34..bae5c7b8bbe2 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/fs.h> 19#include <linux/fs.h>
20#include <linux/slab.h>
20#include <linux/sched.h> 21#include <linux/sched.h>
21#include <linux/writeback.h> 22#include <linux/writeback.h>
22#include <linux/pagemap.h> 23#include <linux/pagemap.h>
@@ -69,7 +70,7 @@ static noinline int join_transaction(struct btrfs_root *root)
69 cur_trans->commit_done = 0; 70 cur_trans->commit_done = 0;
70 cur_trans->start_time = get_seconds(); 71 cur_trans->start_time = get_seconds();
71 72
72 cur_trans->delayed_refs.root.rb_node = NULL; 73 cur_trans->delayed_refs.root = RB_ROOT;
73 cur_trans->delayed_refs.num_entries = 0; 74 cur_trans->delayed_refs.num_entries = 0;
74 cur_trans->delayed_refs.num_heads_ready = 0; 75 cur_trans->delayed_refs.num_heads_ready = 0;
75 cur_trans->delayed_refs.num_heads = 0; 76 cur_trans->delayed_refs.num_heads = 0;
@@ -147,18 +148,13 @@ static void wait_current_trans(struct btrfs_root *root)
147 while (1) { 148 while (1) {
148 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 149 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
149 TASK_UNINTERRUPTIBLE); 150 TASK_UNINTERRUPTIBLE);
150 if (cur_trans->blocked) { 151 if (!cur_trans->blocked)
151 mutex_unlock(&root->fs_info->trans_mutex);
152 schedule();
153 mutex_lock(&root->fs_info->trans_mutex);
154 finish_wait(&root->fs_info->transaction_wait,
155 &wait);
156 } else {
157 finish_wait(&root->fs_info->transaction_wait,
158 &wait);
159 break; 152 break;
160 } 153 mutex_unlock(&root->fs_info->trans_mutex);
154 schedule();
155 mutex_lock(&root->fs_info->trans_mutex);
161 } 156 }
157 finish_wait(&root->fs_info->transaction_wait, &wait);
162 put_transaction(cur_trans); 158 put_transaction(cur_trans);
163 } 159 }
164} 160}
@@ -167,56 +163,103 @@ enum btrfs_trans_type {
167 TRANS_START, 163 TRANS_START,
168 TRANS_JOIN, 164 TRANS_JOIN,
169 TRANS_USERSPACE, 165 TRANS_USERSPACE,
166 TRANS_JOIN_NOLOCK,
170}; 167};
171 168
172static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 169static int may_wait_transaction(struct btrfs_root *root, int type)
173 int num_blocks, int type)
174{ 170{
175 struct btrfs_trans_handle *h =
176 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
177 int ret;
178
179 mutex_lock(&root->fs_info->trans_mutex);
180 if (!root->fs_info->log_root_recovering && 171 if (!root->fs_info->log_root_recovering &&
181 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || 172 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
182 type == TRANS_USERSPACE)) 173 type == TRANS_USERSPACE))
174 return 1;
175 return 0;
176}
177
178static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
179 u64 num_items, int type)
180{
181 struct btrfs_trans_handle *h;
182 struct btrfs_transaction *cur_trans;
183 int ret;
184
185 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
186 return ERR_PTR(-EROFS);
187again:
188 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
189 if (!h)
190 return ERR_PTR(-ENOMEM);
191
192 if (type != TRANS_JOIN_NOLOCK)
193 mutex_lock(&root->fs_info->trans_mutex);
194 if (may_wait_transaction(root, type))
183 wait_current_trans(root); 195 wait_current_trans(root);
196
184 ret = join_transaction(root); 197 ret = join_transaction(root);
185 BUG_ON(ret); 198 BUG_ON(ret);
186 199
187 h->transid = root->fs_info->running_transaction->transid; 200 cur_trans = root->fs_info->running_transaction;
188 h->transaction = root->fs_info->running_transaction; 201 cur_trans->use_count++;
189 h->blocks_reserved = num_blocks; 202 if (type != TRANS_JOIN_NOLOCK)
203 mutex_unlock(&root->fs_info->trans_mutex);
204
205 h->transid = cur_trans->transid;
206 h->transaction = cur_trans;
190 h->blocks_used = 0; 207 h->blocks_used = 0;
191 h->block_group = 0; 208 h->block_group = 0;
192 h->alloc_exclude_nr = 0; 209 h->bytes_reserved = 0;
193 h->alloc_exclude_start = 0;
194 h->delayed_ref_updates = 0; 210 h->delayed_ref_updates = 0;
211 h->block_rsv = NULL;
195 212
196 if (!current->journal_info && type != TRANS_USERSPACE) 213 smp_mb();
197 current->journal_info = h; 214 if (cur_trans->blocked && may_wait_transaction(root, type)) {
215 btrfs_commit_transaction(h, root);
216 goto again;
217 }
218
219 if (num_items > 0) {
220 ret = btrfs_trans_reserve_metadata(h, root, num_items);
221 if (ret == -EAGAIN) {
222 btrfs_commit_transaction(h, root);
223 goto again;
224 }
225 if (ret < 0) {
226 btrfs_end_transaction(h, root);
227 return ERR_PTR(ret);
228 }
229 }
198 230
199 root->fs_info->running_transaction->use_count++; 231 if (type != TRANS_JOIN_NOLOCK)
232 mutex_lock(&root->fs_info->trans_mutex);
200 record_root_in_trans(h, root); 233 record_root_in_trans(h, root);
201 mutex_unlock(&root->fs_info->trans_mutex); 234 if (type != TRANS_JOIN_NOLOCK)
235 mutex_unlock(&root->fs_info->trans_mutex);
236
237 if (!current->journal_info && type != TRANS_USERSPACE)
238 current->journal_info = h;
202 return h; 239 return h;
203} 240}
204 241
205struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 242struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
206 int num_blocks) 243 int num_items)
207{ 244{
208 return start_transaction(root, num_blocks, TRANS_START); 245 return start_transaction(root, num_items, TRANS_START);
209} 246}
210struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 247struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
211 int num_blocks) 248 int num_blocks)
212{ 249{
213 return start_transaction(root, num_blocks, TRANS_JOIN); 250 return start_transaction(root, 0, TRANS_JOIN);
251}
252
253struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
254 int num_blocks)
255{
256 return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
214} 257}
215 258
216struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 259struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
217 int num_blocks) 260 int num_blocks)
218{ 261{
219 return start_transaction(r, num_blocks, TRANS_USERSPACE); 262 return start_transaction(r, 0, TRANS_USERSPACE);
220} 263}
221 264
222/* wait for a transaction commit to be fully complete */ 265/* wait for a transaction commit to be fully complete */
@@ -239,6 +282,58 @@ static noinline int wait_for_commit(struct btrfs_root *root,
239 return 0; 282 return 0;
240} 283}
241 284
285int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
286{
287 struct btrfs_transaction *cur_trans = NULL, *t;
288 int ret;
289
290 mutex_lock(&root->fs_info->trans_mutex);
291
292 ret = 0;
293 if (transid) {
294 if (transid <= root->fs_info->last_trans_committed)
295 goto out_unlock;
296
297 /* find specified transaction */
298 list_for_each_entry(t, &root->fs_info->trans_list, list) {
299 if (t->transid == transid) {
300 cur_trans = t;
301 break;
302 }
303 if (t->transid > transid)
304 break;
305 }
306 ret = -EINVAL;
307 if (!cur_trans)
308 goto out_unlock; /* bad transid */
309 } else {
310 /* find newest transaction that is committing | committed */
311 list_for_each_entry_reverse(t, &root->fs_info->trans_list,
312 list) {
313 if (t->in_commit) {
314 if (t->commit_done)
315 goto out_unlock;
316 cur_trans = t;
317 break;
318 }
319 }
320 if (!cur_trans)
321 goto out_unlock; /* nothing committing|committed */
322 }
323
324 cur_trans->use_count++;
325 mutex_unlock(&root->fs_info->trans_mutex);
326
327 wait_for_commit(root, cur_trans);
328
329 mutex_lock(&root->fs_info->trans_mutex);
330 put_transaction(cur_trans);
331 ret = 0;
332out_unlock:
333 mutex_unlock(&root->fs_info->trans_mutex);
334 return ret;
335}
336
242#if 0 337#if 0
243/* 338/*
244 * rate limit against the drop_snapshot code. This helps to slow down new 339 * rate limit against the drop_snapshot code. This helps to slow down new
@@ -290,10 +385,36 @@ void btrfs_throttle(struct btrfs_root *root)
290 mutex_unlock(&root->fs_info->trans_mutex); 385 mutex_unlock(&root->fs_info->trans_mutex);
291} 386}
292 387
388static int should_end_transaction(struct btrfs_trans_handle *trans,
389 struct btrfs_root *root)
390{
391 int ret;
392 ret = btrfs_block_rsv_check(trans, root,
393 &root->fs_info->global_block_rsv, 0, 5);
394 return ret ? 1 : 0;
395}
396
397int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
398 struct btrfs_root *root)
399{
400 struct btrfs_transaction *cur_trans = trans->transaction;
401 int updates;
402
403 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
404 return 1;
405
406 updates = trans->delayed_ref_updates;
407 trans->delayed_ref_updates = 0;
408 if (updates)
409 btrfs_run_delayed_refs(trans, root, updates);
410
411 return should_end_transaction(trans, root);
412}
413
293static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 414static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
294 struct btrfs_root *root, int throttle) 415 struct btrfs_root *root, int throttle, int lock)
295{ 416{
296 struct btrfs_transaction *cur_trans; 417 struct btrfs_transaction *cur_trans = trans->transaction;
297 struct btrfs_fs_info *info = root->fs_info; 418 struct btrfs_fs_info *info = root->fs_info;
298 int count = 0; 419 int count = 0;
299 420
@@ -317,16 +438,31 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
317 count++; 438 count++;
318 } 439 }
319 440
320 mutex_lock(&info->trans_mutex); 441 btrfs_trans_release_metadata(trans, root);
321 cur_trans = info->running_transaction; 442
322 WARN_ON(cur_trans != trans->transaction); 443 if (lock && !root->fs_info->open_ioctl_trans &&
444 should_end_transaction(trans, root))
445 trans->transaction->blocked = 1;
446
447 if (lock && cur_trans->blocked && !cur_trans->in_commit) {
448 if (throttle)
449 return btrfs_commit_transaction(trans, root);
450 else
451 wake_up_process(info->transaction_kthread);
452 }
453
454 if (lock)
455 mutex_lock(&info->trans_mutex);
456 WARN_ON(cur_trans != info->running_transaction);
323 WARN_ON(cur_trans->num_writers < 1); 457 WARN_ON(cur_trans->num_writers < 1);
324 cur_trans->num_writers--; 458 cur_trans->num_writers--;
325 459
460 smp_mb();
326 if (waitqueue_active(&cur_trans->writer_wait)) 461 if (waitqueue_active(&cur_trans->writer_wait))
327 wake_up(&cur_trans->writer_wait); 462 wake_up(&cur_trans->writer_wait);
328 put_transaction(cur_trans); 463 put_transaction(cur_trans);
329 mutex_unlock(&info->trans_mutex); 464 if (lock)
465 mutex_unlock(&info->trans_mutex);
330 466
331 if (current->journal_info == trans) 467 if (current->journal_info == trans)
332 current->journal_info = NULL; 468 current->journal_info = NULL;
@@ -342,13 +478,19 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
342int btrfs_end_transaction(struct btrfs_trans_handle *trans, 478int btrfs_end_transaction(struct btrfs_trans_handle *trans,
343 struct btrfs_root *root) 479 struct btrfs_root *root)
344{ 480{
345 return __btrfs_end_transaction(trans, root, 0); 481 return __btrfs_end_transaction(trans, root, 0, 1);
346} 482}
347 483
348int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 484int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
349 struct btrfs_root *root) 485 struct btrfs_root *root)
350{ 486{
351 return __btrfs_end_transaction(trans, root, 1); 487 return __btrfs_end_transaction(trans, root, 1, 1);
488}
489
490int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
491 struct btrfs_root *root)
492{
493 return __btrfs_end_transaction(trans, root, 0, 0);
352} 494}
353 495
354/* 496/*
@@ -607,6 +749,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
607 749
608 btrfs_free_log(trans, root); 750 btrfs_free_log(trans, root);
609 btrfs_update_reloc_root(trans, root); 751 btrfs_update_reloc_root(trans, root);
752 btrfs_orphan_commit_root(trans, root);
610 753
611 if (root->commit_root != root->node) { 754 if (root->commit_root != root->node) {
612 switch_commit_root(root); 755 switch_commit_root(root);
@@ -631,30 +774,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
631int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 774int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
632{ 775{
633 struct btrfs_fs_info *info = root->fs_info; 776 struct btrfs_fs_info *info = root->fs_info;
634 int ret;
635 struct btrfs_trans_handle *trans; 777 struct btrfs_trans_handle *trans;
778 int ret;
636 unsigned long nr; 779 unsigned long nr;
637 780
638 smp_mb(); 781 if (xchg(&root->defrag_running, 1))
639 if (root->defrag_running)
640 return 0; 782 return 0;
641 trans = btrfs_start_transaction(root, 1); 783
642 while (1) { 784 while (1) {
643 root->defrag_running = 1; 785 trans = btrfs_start_transaction(root, 0);
786 if (IS_ERR(trans))
787 return PTR_ERR(trans);
788
644 ret = btrfs_defrag_leaves(trans, root, cacheonly); 789 ret = btrfs_defrag_leaves(trans, root, cacheonly);
790
645 nr = trans->blocks_used; 791 nr = trans->blocks_used;
646 btrfs_end_transaction(trans, root); 792 btrfs_end_transaction(trans, root);
647 btrfs_btree_balance_dirty(info->tree_root, nr); 793 btrfs_btree_balance_dirty(info->tree_root, nr);
648 cond_resched(); 794 cond_resched();
649 795
650 trans = btrfs_start_transaction(root, 1);
651 if (root->fs_info->closing || ret != -EAGAIN) 796 if (root->fs_info->closing || ret != -EAGAIN)
652 break; 797 break;
653 } 798 }
654 root->defrag_running = 0; 799 root->defrag_running = 0;
655 smp_mb(); 800 return ret;
656 btrfs_end_transaction(trans, root);
657 return 0;
658} 801}
659 802
660#if 0 803#if 0
@@ -760,28 +903,80 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
760 struct btrfs_root_item *new_root_item; 903 struct btrfs_root_item *new_root_item;
761 struct btrfs_root *tree_root = fs_info->tree_root; 904 struct btrfs_root *tree_root = fs_info->tree_root;
762 struct btrfs_root *root = pending->root; 905 struct btrfs_root *root = pending->root;
906 struct btrfs_root *parent_root;
907 struct inode *parent_inode;
908 struct dentry *parent;
909 struct dentry *dentry;
763 struct extent_buffer *tmp; 910 struct extent_buffer *tmp;
764 struct extent_buffer *old; 911 struct extent_buffer *old;
765 int ret; 912 int ret;
913 u64 to_reserve = 0;
914 u64 index = 0;
766 u64 objectid; 915 u64 objectid;
916 u64 root_flags;
767 917
768 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 918 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
769 if (!new_root_item) { 919 if (!new_root_item) {
770 ret = -ENOMEM; 920 pending->error = -ENOMEM;
771 goto fail; 921 goto fail;
772 } 922 }
923
773 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 924 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
774 if (ret) 925 if (ret) {
926 pending->error = ret;
775 goto fail; 927 goto fail;
928 }
929
930 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
931 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
932
933 if (to_reserve > 0) {
934 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
935 to_reserve);
936 if (ret) {
937 pending->error = ret;
938 goto fail;
939 }
940 }
941
942 key.objectid = objectid;
943 key.offset = (u64)-1;
944 key.type = BTRFS_ROOT_ITEM_KEY;
945
946 trans->block_rsv = &pending->block_rsv;
947
948 dentry = pending->dentry;
949 parent = dget_parent(dentry);
950 parent_inode = parent->d_inode;
951 parent_root = BTRFS_I(parent_inode)->root;
952 record_root_in_trans(trans, parent_root);
953
954 /*
955 * insert the directory item
956 */
957 ret = btrfs_set_inode_index(parent_inode, &index);
958 BUG_ON(ret);
959 ret = btrfs_insert_dir_item(trans, parent_root,
960 dentry->d_name.name, dentry->d_name.len,
961 parent_inode->i_ino, &key,
962 BTRFS_FT_DIR, index);
963 BUG_ON(ret);
964
965 btrfs_i_size_write(parent_inode, parent_inode->i_size +
966 dentry->d_name.len * 2);
967 ret = btrfs_update_inode(trans, parent_root, parent_inode);
968 BUG_ON(ret);
776 969
777 record_root_in_trans(trans, root); 970 record_root_in_trans(trans, root);
778 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 971 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
779 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 972 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
780 973
781 key.objectid = objectid; 974 root_flags = btrfs_root_flags(new_root_item);
782 /* record when the snapshot was created in key.offset */ 975 if (pending->readonly)
783 key.offset = trans->transid; 976 root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
784 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); 977 else
978 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
979 btrfs_set_root_flags(new_root_item, root_flags);
785 980
786 old = btrfs_lock_root_node(root); 981 old = btrfs_lock_root_node(root);
787 btrfs_cow_block(trans, root, old, NULL, 0, &old); 982 btrfs_cow_block(trans, root, old, NULL, 0, &old);
@@ -792,62 +987,33 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
792 free_extent_buffer(old); 987 free_extent_buffer(old);
793 988
794 btrfs_set_root_node(new_root_item, tmp); 989 btrfs_set_root_node(new_root_item, tmp);
795 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 990 /* record when the snapshot was created in key.offset */
796 new_root_item); 991 key.offset = trans->transid;
992 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
797 btrfs_tree_unlock(tmp); 993 btrfs_tree_unlock(tmp);
798 free_extent_buffer(tmp); 994 free_extent_buffer(tmp);
799 if (ret) 995 BUG_ON(ret);
800 goto fail;
801
802 key.offset = (u64)-1;
803 memcpy(&pending->root_key, &key, sizeof(key));
804fail:
805 kfree(new_root_item);
806 return ret;
807}
808
809static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info,
810 struct btrfs_pending_snapshot *pending)
811{
812 int ret;
813 int namelen;
814 u64 index = 0;
815 struct btrfs_trans_handle *trans;
816 struct inode *parent_inode;
817 struct btrfs_root *parent_root;
818
819 parent_inode = pending->dentry->d_parent->d_inode;
820 parent_root = BTRFS_I(parent_inode)->root;
821 trans = btrfs_join_transaction(parent_root, 1);
822 996
823 /* 997 /*
824 * insert the directory item 998 * insert root back/forward references
825 */ 999 */
826 namelen = strlen(pending->name); 1000 ret = btrfs_add_root_ref(trans, tree_root, objectid,
827 ret = btrfs_set_inode_index(parent_inode, &index);
828 ret = btrfs_insert_dir_item(trans, parent_root,
829 pending->name, namelen,
830 parent_inode->i_ino,
831 &pending->root_key, BTRFS_FT_DIR, index);
832
833 if (ret)
834 goto fail;
835
836 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
837 ret = btrfs_update_inode(trans, parent_root, parent_inode);
838 BUG_ON(ret);
839
840 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
841 pending->root_key.objectid,
842 parent_root->root_key.objectid, 1001 parent_root->root_key.objectid,
843 parent_inode->i_ino, index, pending->name, 1002 parent_inode->i_ino, index,
844 namelen); 1003 dentry->d_name.name, dentry->d_name.len);
845
846 BUG_ON(ret); 1004 BUG_ON(ret);
1005 dput(parent);
847 1006
1007 key.offset = (u64)-1;
1008 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
1009 BUG_ON(IS_ERR(pending->snap));
1010
1011 btrfs_reloc_post_snapshot(trans, pending);
1012 btrfs_orphan_post_snapshot(trans, pending);
848fail: 1013fail:
849 btrfs_end_transaction(trans, fs_info->fs_root); 1014 kfree(new_root_item);
850 return ret; 1015 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
1016 return 0;
851} 1017}
852 1018
853/* 1019/*
@@ -867,25 +1033,6 @@ static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
867 return 0; 1033 return 0;
868} 1034}
869 1035
870static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
871 struct btrfs_fs_info *fs_info)
872{
873 struct btrfs_pending_snapshot *pending;
874 struct list_head *head = &trans->transaction->pending_snapshots;
875 int ret;
876
877 while (!list_empty(head)) {
878 pending = list_entry(head->next,
879 struct btrfs_pending_snapshot, list);
880 ret = finish_pending_snapshot(fs_info, pending);
881 BUG_ON(ret);
882 list_del(&pending->list);
883 kfree(pending->name);
884 kfree(pending);
885 }
886 return 0;
887}
888
889static void update_super_roots(struct btrfs_root *root) 1036static void update_super_roots(struct btrfs_root *root)
890{ 1037{
891 struct btrfs_root_item *root_item; 1038 struct btrfs_root_item *root_item;
@@ -902,6 +1049,8 @@ static void update_super_roots(struct btrfs_root *root)
902 super->root = root_item->bytenr; 1049 super->root = root_item->bytenr;
903 super->generation = root_item->generation; 1050 super->generation = root_item->generation;
904 super->root_level = root_item->level; 1051 super->root_level = root_item->level;
1052 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
1053 super->cache_generation = root_item->generation;
905} 1054}
906 1055
907int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1056int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -914,11 +1063,137 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
914 return ret; 1063 return ret;
915} 1064}
916 1065
1066int btrfs_transaction_blocked(struct btrfs_fs_info *info)
1067{
1068 int ret = 0;
1069 spin_lock(&info->new_trans_lock);
1070 if (info->running_transaction)
1071 ret = info->running_transaction->blocked;
1072 spin_unlock(&info->new_trans_lock);
1073 return ret;
1074}
1075
1076/*
1077 * wait for the current transaction commit to start and block subsequent
1078 * transaction joins
1079 */
1080static void wait_current_trans_commit_start(struct btrfs_root *root,
1081 struct btrfs_transaction *trans)
1082{
1083 DEFINE_WAIT(wait);
1084
1085 if (trans->in_commit)
1086 return;
1087
1088 while (1) {
1089 prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
1090 TASK_UNINTERRUPTIBLE);
1091 if (trans->in_commit) {
1092 finish_wait(&root->fs_info->transaction_blocked_wait,
1093 &wait);
1094 break;
1095 }
1096 mutex_unlock(&root->fs_info->trans_mutex);
1097 schedule();
1098 mutex_lock(&root->fs_info->trans_mutex);
1099 finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
1100 }
1101}
1102
1103/*
1104 * wait for the current transaction to start and then become unblocked.
1105 * caller holds ref.
1106 */
1107static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1108 struct btrfs_transaction *trans)
1109{
1110 DEFINE_WAIT(wait);
1111
1112 if (trans->commit_done || (trans->in_commit && !trans->blocked))
1113 return;
1114
1115 while (1) {
1116 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
1117 TASK_UNINTERRUPTIBLE);
1118 if (trans->commit_done ||
1119 (trans->in_commit && !trans->blocked)) {
1120 finish_wait(&root->fs_info->transaction_wait,
1121 &wait);
1122 break;
1123 }
1124 mutex_unlock(&root->fs_info->trans_mutex);
1125 schedule();
1126 mutex_lock(&root->fs_info->trans_mutex);
1127 finish_wait(&root->fs_info->transaction_wait,
1128 &wait);
1129 }
1130}
1131
1132/*
1133 * commit transactions asynchronously. once btrfs_commit_transaction_async
1134 * returns, any subsequent transaction will not be allowed to join.
1135 */
1136struct btrfs_async_commit {
1137 struct btrfs_trans_handle *newtrans;
1138 struct btrfs_root *root;
1139 struct delayed_work work;
1140};
1141
1142static void do_async_commit(struct work_struct *work)
1143{
1144 struct btrfs_async_commit *ac =
1145 container_of(work, struct btrfs_async_commit, work.work);
1146
1147 btrfs_commit_transaction(ac->newtrans, ac->root);
1148 kfree(ac);
1149}
1150
1151int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1152 struct btrfs_root *root,
1153 int wait_for_unblock)
1154{
1155 struct btrfs_async_commit *ac;
1156 struct btrfs_transaction *cur_trans;
1157
1158 ac = kmalloc(sizeof(*ac), GFP_NOFS);
1159 BUG_ON(!ac);
1160
1161 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1162 ac->root = root;
1163 ac->newtrans = btrfs_join_transaction(root, 0);
1164
1165 /* take transaction reference */
1166 mutex_lock(&root->fs_info->trans_mutex);
1167 cur_trans = trans->transaction;
1168 cur_trans->use_count++;
1169 mutex_unlock(&root->fs_info->trans_mutex);
1170
1171 btrfs_end_transaction(trans, root);
1172 schedule_delayed_work(&ac->work, 0);
1173
1174 /* wait for transaction to start and unblock */
1175 mutex_lock(&root->fs_info->trans_mutex);
1176 if (wait_for_unblock)
1177 wait_current_trans_commit_start_and_unblock(root, cur_trans);
1178 else
1179 wait_current_trans_commit_start(root, cur_trans);
1180 put_transaction(cur_trans);
1181 mutex_unlock(&root->fs_info->trans_mutex);
1182
1183 return 0;
1184}
1185
1186/*
1187 * btrfs_transaction state sequence:
1188 * in_commit = 0, blocked = 0 (initial)
1189 * in_commit = 1, blocked = 1
1190 * blocked = 0
1191 * commit_done = 1
1192 */
917int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1193int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
918 struct btrfs_root *root) 1194 struct btrfs_root *root)
919{ 1195{
920 unsigned long joined = 0; 1196 unsigned long joined = 0;
921 unsigned long timeout = 1;
922 struct btrfs_transaction *cur_trans; 1197 struct btrfs_transaction *cur_trans;
923 struct btrfs_transaction *prev_trans = NULL; 1198 struct btrfs_transaction *prev_trans = NULL;
924 DEFINE_WAIT(wait); 1199 DEFINE_WAIT(wait);
@@ -935,6 +1210,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
935 ret = btrfs_run_delayed_refs(trans, root, 0); 1210 ret = btrfs_run_delayed_refs(trans, root, 0);
936 BUG_ON(ret); 1211 BUG_ON(ret);
937 1212
1213 btrfs_trans_release_metadata(trans, root);
1214
938 cur_trans = trans->transaction; 1215 cur_trans = trans->transaction;
939 /* 1216 /*
940 * set the flushing flag so procs in this transaction have to 1217 * set the flushing flag so procs in this transaction have to
@@ -963,6 +1240,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
963 1240
964 trans->transaction->in_commit = 1; 1241 trans->transaction->in_commit = 1;
965 trans->transaction->blocked = 1; 1242 trans->transaction->blocked = 1;
1243 wake_up(&root->fs_info->transaction_blocked_wait);
1244
966 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1245 if (cur_trans->list.prev != &root->fs_info->trans_list) {
967 prev_trans = list_entry(cur_trans->list.prev, 1246 prev_trans = list_entry(cur_trans->list.prev,
968 struct btrfs_transaction, list); 1247 struct btrfs_transaction, list);
@@ -987,23 +1266,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
987 snap_pending = 1; 1266 snap_pending = 1;
988 1267
989 WARN_ON(cur_trans != trans->transaction); 1268 WARN_ON(cur_trans != trans->transaction);
990 prepare_to_wait(&cur_trans->writer_wait, &wait,
991 TASK_UNINTERRUPTIBLE);
992
993 if (cur_trans->num_writers > 1)
994 timeout = MAX_SCHEDULE_TIMEOUT;
995 else if (should_grow)
996 timeout = 1;
997
998 mutex_unlock(&root->fs_info->trans_mutex); 1269 mutex_unlock(&root->fs_info->trans_mutex);
999 1270
1000 if (flush_on_commit) { 1271 if (flush_on_commit || snap_pending) {
1001 btrfs_start_delalloc_inodes(root, 1); 1272 btrfs_start_delalloc_inodes(root, 1);
1002 ret = btrfs_wait_ordered_extents(root, 0, 1); 1273 ret = btrfs_wait_ordered_extents(root, 0, 1);
1003 BUG_ON(ret); 1274 BUG_ON(ret);
1004 } else if (snap_pending) {
1005 ret = btrfs_wait_ordered_extents(root, 0, 1);
1006 BUG_ON(ret);
1007 } 1275 }
1008 1276
1009 /* 1277 /*
@@ -1015,9 +1283,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1015 */ 1283 */
1016 btrfs_run_ordered_operations(root, 1); 1284 btrfs_run_ordered_operations(root, 1);
1017 1285
1286 prepare_to_wait(&cur_trans->writer_wait, &wait,
1287 TASK_UNINTERRUPTIBLE);
1288
1018 smp_mb(); 1289 smp_mb();
1019 if (cur_trans->num_writers > 1 || should_grow) 1290 if (cur_trans->num_writers > 1)
1020 schedule_timeout(timeout); 1291 schedule_timeout(MAX_SCHEDULE_TIMEOUT);
1292 else if (should_grow)
1293 schedule_timeout(1);
1021 1294
1022 mutex_lock(&root->fs_info->trans_mutex); 1295 mutex_lock(&root->fs_info->trans_mutex);
1023 finish_wait(&cur_trans->writer_wait, &wait); 1296 finish_wait(&cur_trans->writer_wait, &wait);
@@ -1100,9 +1373,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1100 1373
1101 btrfs_finish_extent_commit(trans, root); 1374 btrfs_finish_extent_commit(trans, root);
1102 1375
1103 /* do the directory inserts of any pending snapshot creations */
1104 finish_pending_snapshots(trans, root->fs_info);
1105
1106 mutex_lock(&root->fs_info->trans_mutex); 1376 mutex_lock(&root->fs_info->trans_mutex);
1107 1377
1108 cur_trans->commit_done = 1; 1378 cur_trans->commit_done = 1;
@@ -1145,9 +1415,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1145 1415
1146 if (btrfs_header_backref_rev(root->node) < 1416 if (btrfs_header_backref_rev(root->node) <
1147 BTRFS_MIXED_BACKREF_REV) 1417 BTRFS_MIXED_BACKREF_REV)
1148 btrfs_drop_snapshot(root, 0); 1418 btrfs_drop_snapshot(root, NULL, 0);
1149 else 1419 else
1150 btrfs_drop_snapshot(root, 1); 1420 btrfs_drop_snapshot(root, NULL, 1);
1151 } 1421 }
1152 return 0; 1422 return 0;
1153} 1423}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..229a594cacd5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,24 @@ struct btrfs_transaction {
45 45
46struct btrfs_trans_handle { 46struct btrfs_trans_handle {
47 u64 transid; 47 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved;
48 unsigned long blocks_reserved; 50 unsigned long blocks_reserved;
49 unsigned long blocks_used; 51 unsigned long blocks_used;
50 struct btrfs_transaction *transaction;
51 u64 block_group;
52 u64 alloc_exclude_start;
53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates; 52 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv;
55}; 55};
56 56
57struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
58 struct dentry *dentry; 58 struct dentry *dentry;
59 struct btrfs_root *root; 59 struct btrfs_root *root;
60 char *name; 60 struct btrfs_root *snap;
61 struct btrfs_key root_key; 61 /* block reservation for the operation */
62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reseration for relocation */
64 int error;
65 bool readonly;
62 struct list_head list; 66 struct list_head list;
63}; 67};
64 68
@@ -84,12 +88,17 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
84 88
85int btrfs_end_transaction(struct btrfs_trans_handle *trans, 89int btrfs_end_transaction(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root); 90 struct btrfs_root *root);
91int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
92 struct btrfs_root *root);
87struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 93struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
88 int num_blocks); 94 int num_items);
89struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 95struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
90 int num_blocks); 96 int num_blocks);
97struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root,
98 int num_blocks);
91struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 99struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
92 int num_blocks); 100 int num_blocks);
101int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
93int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 102int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 103 struct btrfs_root *root);
95int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 104int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -101,8 +110,13 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
101int btrfs_clean_old_snapshots(struct btrfs_root *root); 110int btrfs_clean_old_snapshots(struct btrfs_root *root);
102int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 111int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 112 struct btrfs_root *root);
113int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
114 struct btrfs_root *root,
115 int wait_for_unblock);
104int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 116int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 117 struct btrfs_root *root);
118int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
119 struct btrfs_root *root);
106void btrfs_throttle(struct btrfs_root *root); 120void btrfs_throttle(struct btrfs_root *root);
107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 121int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 122 struct btrfs_root *root);
@@ -112,5 +126,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark); 126 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root, 127int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark); 128 struct extent_io_tree *dirty_pages, int mark);
129int btrfs_transaction_blocked(struct btrfs_fs_info *info);
115int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 130int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
116#endif 131#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..992ab425599d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -36,7 +36,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
36 int ret = 0; 36 int ret = 0;
37 int wret; 37 int wret;
38 int level; 38 int level;
39 int orig_level;
40 int is_extent = 0; 39 int is_extent = 0;
41 int next_key_ret = 0; 40 int next_key_ret = 0;
42 u64 last_ret = 0; 41 u64 last_ret = 0;
@@ -64,7 +63,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
64 return -ENOMEM; 63 return -ENOMEM;
65 64
66 level = btrfs_header_level(root->node); 65 level = btrfs_header_level(root->node);
67 orig_level = level;
68 66
69 if (level == 0) 67 if (level == 0)
70 goto out; 68 goto out;
@@ -117,13 +115,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
117 path->nodes[1], 0, 115 path->nodes[1], 0,
118 cache_only, &last_ret, 116 cache_only, &last_ret,
119 &root->defrag_progress); 117 &root->defrag_progress);
120 WARN_ON(ret && ret != -EAGAIN); 118 if (ret) {
119 WARN_ON(ret == -EAGAIN);
120 goto out;
121 }
121 if (next_key_ret == 0) { 122 if (next_key_ret == 0) {
122 memcpy(&root->defrag_progress, &key, sizeof(key)); 123 memcpy(&root->defrag_progress, &key, sizeof(key));
123 ret = -EAGAIN; 124 ret = -EAGAIN;
124 } 125 }
125
126 btrfs_release_path(root, path);
127out: 126out:
128 if (path) 127 if (path)
129 btrfs_free_path(path); 128 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a9434b622ec..054744ac5719 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -17,6 +17,7 @@
17 */ 17 */
18 18
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/slab.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "transaction.h" 22#include "transaction.h"
22#include "disk-io.h" 23#include "disk-io.h"
@@ -134,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
134 struct btrfs_root *root) 135 struct btrfs_root *root)
135{ 136{
136 int ret; 137 int ret;
138 int err = 0;
137 139
138 mutex_lock(&root->log_mutex); 140 mutex_lock(&root->log_mutex);
139 if (root->log_root) { 141 if (root->log_root) {
@@ -154,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
154 mutex_lock(&root->fs_info->tree_log_mutex); 156 mutex_lock(&root->fs_info->tree_log_mutex);
155 if (!root->fs_info->log_root_tree) { 157 if (!root->fs_info->log_root_tree) {
156 ret = btrfs_init_log_root_tree(trans, root->fs_info); 158 ret = btrfs_init_log_root_tree(trans, root->fs_info);
157 BUG_ON(ret); 159 if (ret)
160 err = ret;
158 } 161 }
159 if (!root->log_root) { 162 if (err == 0 && !root->log_root) {
160 ret = btrfs_add_log_tree(trans, root); 163 ret = btrfs_add_log_tree(trans, root);
161 BUG_ON(ret); 164 if (ret)
165 err = ret;
162 } 166 }
163 mutex_unlock(&root->fs_info->tree_log_mutex); 167 mutex_unlock(&root->fs_info->tree_log_mutex);
164 root->log_batch++; 168 root->log_batch++;
165 atomic_inc(&root->log_writers); 169 atomic_inc(&root->log_writers);
166 mutex_unlock(&root->log_mutex); 170 mutex_unlock(&root->log_mutex);
167 return 0; 171 return err;
168} 172}
169 173
170/* 174/*
@@ -375,7 +379,7 @@ insert:
375 BUG_ON(ret); 379 BUG_ON(ret);
376 } 380 }
377 } else if (ret) { 381 } else if (ret) {
378 BUG(); 382 return ret;
379 } 383 }
380 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
381 path->slots[0]); 385 path->slots[0]);
@@ -445,7 +449,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
445 key.objectid = objectid; 449 key.objectid = objectid;
446 key.type = BTRFS_INODE_ITEM_KEY; 450 key.type = BTRFS_INODE_ITEM_KEY;
447 key.offset = 0; 451 key.offset = 0;
448 inode = btrfs_iget(root->fs_info->sb, &key, root); 452 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
449 if (IS_ERR(inode)) { 453 if (IS_ERR(inode)) {
450 inode = NULL; 454 inode = NULL;
451 } else if (is_bad_inode(inode)) { 455 } else if (is_bad_inode(inode)) {
@@ -782,7 +786,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
782{ 786{
783 struct inode *dir; 787 struct inode *dir;
784 int ret; 788 int ret;
785 struct btrfs_key location;
786 struct btrfs_inode_ref *ref; 789 struct btrfs_inode_ref *ref;
787 struct btrfs_dir_item *di; 790 struct btrfs_dir_item *di;
788 struct inode *inode; 791 struct inode *inode;
@@ -791,10 +794,6 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
791 unsigned long ref_ptr; 794 unsigned long ref_ptr;
792 unsigned long ref_end; 795 unsigned long ref_end;
793 796
794 location.objectid = key->objectid;
795 location.type = BTRFS_INODE_ITEM_KEY;
796 location.offset = 0;
797
798 /* 797 /*
799 * it is possible that we didn't log all the parent directories 798 * it is possible that we didn't log all the parent directories
800 * for a given inode. If we don't find the dir, just don't 799 * for a given inode. If we don't find the dir, just don't
@@ -1579,7 +1578,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1579 struct btrfs_path *path; 1578 struct btrfs_path *path;
1580 struct btrfs_root *root = wc->replay_dest; 1579 struct btrfs_root *root = wc->replay_dest;
1581 struct btrfs_key key; 1580 struct btrfs_key key;
1582 u32 item_size;
1583 int level; 1581 int level;
1584 int i; 1582 int i;
1585 int ret; 1583 int ret;
@@ -1597,7 +1595,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1597 nritems = btrfs_header_nritems(eb); 1595 nritems = btrfs_header_nritems(eb);
1598 for (i = 0; i < nritems; i++) { 1596 for (i = 0; i < nritems; i++) {
1599 btrfs_item_key_to_cpu(eb, &key, i); 1597 btrfs_item_key_to_cpu(eb, &key, i);
1600 item_size = btrfs_item_size_nr(eb, i);
1601 1598
1602 /* inode keys are done during the first stage */ 1599 /* inode keys are done during the first stage */
1603 if (key.type == BTRFS_INODE_ITEM_KEY && 1600 if (key.type == BTRFS_INODE_ITEM_KEY &&
@@ -1664,7 +1661,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1664 struct walk_control *wc) 1661 struct walk_control *wc)
1665{ 1662{
1666 u64 root_owner; 1663 u64 root_owner;
1667 u64 root_gen;
1668 u64 bytenr; 1664 u64 bytenr;
1669 u64 ptr_gen; 1665 u64 ptr_gen;
1670 struct extent_buffer *next; 1666 struct extent_buffer *next;
@@ -1694,13 +1690,12 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1694 1690
1695 parent = path->nodes[*level]; 1691 parent = path->nodes[*level];
1696 root_owner = btrfs_header_owner(parent); 1692 root_owner = btrfs_header_owner(parent);
1697 root_gen = btrfs_header_generation(parent);
1698 1693
1699 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1694 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1700 1695
1701 wc->process_func(root, next, wc, ptr_gen);
1702
1703 if (*level == 1) { 1696 if (*level == 1) {
1697 wc->process_func(root, next, wc, ptr_gen);
1698
1704 path->slots[*level]++; 1699 path->slots[*level]++;
1705 if (wc->free) { 1700 if (wc->free) {
1706 btrfs_read_buffer(next, ptr_gen); 1701 btrfs_read_buffer(next, ptr_gen);
@@ -1733,35 +1728,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1733 WARN_ON(*level < 0); 1728 WARN_ON(*level < 0);
1734 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1729 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1735 1730
1736 if (path->nodes[*level] == root->node) 1731 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1737 parent = path->nodes[*level];
1738 else
1739 parent = path->nodes[*level + 1];
1740
1741 bytenr = path->nodes[*level]->start;
1742
1743 blocksize = btrfs_level_size(root, *level);
1744 root_owner = btrfs_header_owner(parent);
1745 root_gen = btrfs_header_generation(parent);
1746
1747 wc->process_func(root, path->nodes[*level], wc,
1748 btrfs_header_generation(path->nodes[*level]));
1749
1750 if (wc->free) {
1751 next = path->nodes[*level];
1752 btrfs_tree_lock(next);
1753 clean_tree_block(trans, root, next);
1754 btrfs_set_lock_blocking(next);
1755 btrfs_wait_tree_block_writeback(next);
1756 btrfs_tree_unlock(next);
1757
1758 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1759 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1760 BUG_ON(ret);
1761 }
1762 free_extent_buffer(path->nodes[*level]);
1763 path->nodes[*level] = NULL;
1764 *level += 1;
1765 1732
1766 cond_resched(); 1733 cond_resched();
1767 return 0; 1734 return 0;
@@ -1773,16 +1740,13 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1773 struct walk_control *wc) 1740 struct walk_control *wc)
1774{ 1741{
1775 u64 root_owner; 1742 u64 root_owner;
1776 u64 root_gen;
1777 int i; 1743 int i;
1778 int slot; 1744 int slot;
1779 int ret; 1745 int ret;
1780 1746
1781 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1747 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1782 slot = path->slots[i]; 1748 slot = path->slots[i];
1783 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1749 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1784 struct extent_buffer *node;
1785 node = path->nodes[i];
1786 path->slots[i]++; 1750 path->slots[i]++;
1787 *level = i; 1751 *level = i;
1788 WARN_ON(*level == 0); 1752 WARN_ON(*level == 0);
@@ -1795,7 +1759,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1795 parent = path->nodes[*level + 1]; 1759 parent = path->nodes[*level + 1];
1796 1760
1797 root_owner = btrfs_header_owner(parent); 1761 root_owner = btrfs_header_owner(parent);
1798 root_gen = btrfs_header_generation(parent);
1799 wc->process_func(root, path->nodes[*level], wc, 1762 wc->process_func(root, path->nodes[*level], wc,
1800 btrfs_header_generation(path->nodes[*level])); 1763 btrfs_header_generation(path->nodes[*level]));
1801 if (wc->free) { 1764 if (wc->free) {
@@ -2046,7 +2009,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2046 mutex_unlock(&log_root_tree->log_mutex); 2009 mutex_unlock(&log_root_tree->log_mutex);
2047 2010
2048 ret = update_log_root(trans, log); 2011 ret = update_log_root(trans, log);
2049 BUG_ON(ret);
2050 2012
2051 mutex_lock(&log_root_tree->log_mutex); 2013 mutex_lock(&log_root_tree->log_mutex);
2052 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2014 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2055,6 +2017,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2055 wake_up(&log_root_tree->log_writer_wait); 2017 wake_up(&log_root_tree->log_writer_wait);
2056 } 2018 }
2057 2019
2020 if (ret) {
2021 BUG_ON(ret != -ENOSPC);
2022 root->fs_info->last_trans_log_full_commit = trans->transid;
2023 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2024 mutex_unlock(&log_root_tree->log_mutex);
2025 ret = -EAGAIN;
2026 goto out;
2027 }
2028
2058 index2 = log_root_tree->log_transid % 2; 2029 index2 = log_root_tree->log_transid % 2;
2059 if (atomic_read(&log_root_tree->log_commit[index2])) { 2030 if (atomic_read(&log_root_tree->log_commit[index2])) {
2060 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2031 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2128,15 +2099,10 @@ out:
2128 return 0; 2099 return 0;
2129} 2100}
2130 2101
2131/* 2102static void free_log_tree(struct btrfs_trans_handle *trans,
2132 * free all the extents used by the tree log. This should be called 2103 struct btrfs_root *log)
2133 * at commit time of the full transaction
2134 */
2135int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2136{ 2104{
2137 int ret; 2105 int ret;
2138 struct btrfs_root *log;
2139 struct key;
2140 u64 start; 2106 u64 start;
2141 u64 end; 2107 u64 end;
2142 struct walk_control wc = { 2108 struct walk_control wc = {
@@ -2144,10 +2110,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2144 .process_func = process_one_buffer 2110 .process_func = process_one_buffer
2145 }; 2111 };
2146 2112
2147 if (!root->log_root || root->fs_info->log_root_recovering)
2148 return 0;
2149
2150 log = root->log_root;
2151 ret = walk_log_tree(trans, log, &wc); 2113 ret = walk_log_tree(trans, log, &wc);
2152 BUG_ON(ret); 2114 BUG_ON(ret);
2153 2115
@@ -2161,14 +2123,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2161 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2123 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2162 } 2124 }
2163 2125
2164 if (log->log_transid > 0) {
2165 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2166 &log->root_key);
2167 BUG_ON(ret);
2168 }
2169 root->log_root = NULL;
2170 free_extent_buffer(log->node); 2126 free_extent_buffer(log->node);
2171 kfree(log); 2127 kfree(log);
2128}
2129
2130/*
2131 * free all the extents used by the tree log. This should be called
2132 * at commit time of the full transaction
2133 */
2134int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2135{
2136 if (root->log_root) {
2137 free_log_tree(trans, root->log_root);
2138 root->log_root = NULL;
2139 }
2140 return 0;
2141}
2142
2143int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2144 struct btrfs_fs_info *fs_info)
2145{
2146 if (fs_info->log_root_tree) {
2147 free_log_tree(trans, fs_info->log_root_tree);
2148 fs_info->log_root_tree = NULL;
2149 }
2172 return 0; 2150 return 0;
2173} 2151}
2174 2152
@@ -2202,6 +2180,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2202 struct btrfs_dir_item *di; 2180 struct btrfs_dir_item *di;
2203 struct btrfs_path *path; 2181 struct btrfs_path *path;
2204 int ret; 2182 int ret;
2183 int err = 0;
2205 int bytes_del = 0; 2184 int bytes_del = 0;
2206 2185
2207 if (BTRFS_I(dir)->logged_trans < trans->transid) 2186 if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2217,7 +2196,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2217 path = btrfs_alloc_path(); 2196 path = btrfs_alloc_path();
2218 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2197 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2219 name, name_len, -1); 2198 name, name_len, -1);
2220 if (di && !IS_ERR(di)) { 2199 if (IS_ERR(di)) {
2200 err = PTR_ERR(di);
2201 goto fail;
2202 }
2203 if (di) {
2221 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2204 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2222 bytes_del += name_len; 2205 bytes_del += name_len;
2223 BUG_ON(ret); 2206 BUG_ON(ret);
@@ -2225,7 +2208,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2225 btrfs_release_path(log, path); 2208 btrfs_release_path(log, path);
2226 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2209 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2227 index, name, name_len, -1); 2210 index, name, name_len, -1);
2228 if (di && !IS_ERR(di)) { 2211 if (IS_ERR(di)) {
2212 err = PTR_ERR(di);
2213 goto fail;
2214 }
2215 if (di) {
2229 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2216 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2230 bytes_del += name_len; 2217 bytes_del += name_len;
2231 BUG_ON(ret); 2218 BUG_ON(ret);
@@ -2243,6 +2230,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2243 btrfs_release_path(log, path); 2230 btrfs_release_path(log, path);
2244 2231
2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2232 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2233 if (ret < 0) {
2234 err = ret;
2235 goto fail;
2236 }
2246 if (ret == 0) { 2237 if (ret == 0) {
2247 struct btrfs_inode_item *item; 2238 struct btrfs_inode_item *item;
2248 u64 i_size; 2239 u64 i_size;
@@ -2260,12 +2251,16 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2260 ret = 0; 2251 ret = 0;
2261 btrfs_release_path(log, path); 2252 btrfs_release_path(log, path);
2262 } 2253 }
2263 2254fail:
2264 btrfs_free_path(path); 2255 btrfs_free_path(path);
2265 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2256 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2257 if (ret == -ENOSPC) {
2258 root->fs_info->last_trans_log_full_commit = trans->transid;
2259 ret = 0;
2260 }
2266 btrfs_end_log_trans(root); 2261 btrfs_end_log_trans(root);
2267 2262
2268 return 0; 2263 return err;
2269} 2264}
2270 2265
2271/* see comments for btrfs_del_dir_entries_in_log */ 2266/* see comments for btrfs_del_dir_entries_in_log */
@@ -2290,6 +2285,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2290 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2285 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2291 dirid, &index); 2286 dirid, &index);
2292 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2287 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2288 if (ret == -ENOSPC) {
2289 root->fs_info->last_trans_log_full_commit = trans->transid;
2290 ret = 0;
2291 }
2293 btrfs_end_log_trans(root); 2292 btrfs_end_log_trans(root);
2294 2293
2295 return ret; 2294 return ret;
@@ -2317,7 +2316,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2317 else 2316 else
2318 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2317 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2319 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2318 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2320 BUG_ON(ret); 2319 if (ret)
2320 return ret;
2321 2321
2322 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2322 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2323 struct btrfs_dir_log_item); 2323 struct btrfs_dir_log_item);
@@ -2342,6 +2342,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2342 struct btrfs_key max_key; 2342 struct btrfs_key max_key;
2343 struct btrfs_root *log = root->log_root; 2343 struct btrfs_root *log = root->log_root;
2344 struct extent_buffer *src; 2344 struct extent_buffer *src;
2345 int err = 0;
2345 int ret; 2346 int ret;
2346 int i; 2347 int i;
2347 int nritems; 2348 int nritems;
@@ -2404,6 +2405,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2404 ret = overwrite_item(trans, log, dst_path, 2405 ret = overwrite_item(trans, log, dst_path,
2405 path->nodes[0], path->slots[0], 2406 path->nodes[0], path->slots[0],
2406 &tmp); 2407 &tmp);
2408 if (ret) {
2409 err = ret;
2410 goto done;
2411 }
2407 } 2412 }
2408 } 2413 }
2409 btrfs_release_path(root, path); 2414 btrfs_release_path(root, path);
@@ -2431,7 +2436,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2431 goto done; 2436 goto done;
2432 ret = overwrite_item(trans, log, dst_path, src, i, 2437 ret = overwrite_item(trans, log, dst_path, src, i,
2433 &min_key); 2438 &min_key);
2434 BUG_ON(ret); 2439 if (ret) {
2440 err = ret;
2441 goto done;
2442 }
2435 } 2443 }
2436 path->slots[0] = nritems; 2444 path->slots[0] = nritems;
2437 2445
@@ -2453,22 +2461,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2453 ret = overwrite_item(trans, log, dst_path, 2461 ret = overwrite_item(trans, log, dst_path,
2454 path->nodes[0], path->slots[0], 2462 path->nodes[0], path->slots[0],
2455 &tmp); 2463 &tmp);
2456 2464 if (ret)
2457 BUG_ON(ret); 2465 err = ret;
2458 last_offset = tmp.offset; 2466 else
2467 last_offset = tmp.offset;
2459 goto done; 2468 goto done;
2460 } 2469 }
2461 } 2470 }
2462done: 2471done:
2463 *last_offset_ret = last_offset;
2464 btrfs_release_path(root, path); 2472 btrfs_release_path(root, path);
2465 btrfs_release_path(log, dst_path); 2473 btrfs_release_path(log, dst_path);
2466 2474
2467 /* insert the log range keys to indicate where the log is valid */ 2475 if (err == 0) {
2468 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2476 *last_offset_ret = last_offset;
2469 first_offset, last_offset); 2477 /*
2470 BUG_ON(ret); 2478 * insert the log range keys to indicate where the log
2471 return 0; 2479 * is valid
2480 */
2481 ret = insert_dir_log_key(trans, log, path, key_type,
2482 inode->i_ino, first_offset,
2483 last_offset);
2484 if (ret)
2485 err = ret;
2486 }
2487 return err;
2472} 2488}
2473 2489
2474/* 2490/*
@@ -2500,7 +2516,8 @@ again:
2500 ret = log_dir_items(trans, root, inode, path, 2516 ret = log_dir_items(trans, root, inode, path,
2501 dst_path, key_type, min_key, 2517 dst_path, key_type, min_key,
2502 &max_key); 2518 &max_key);
2503 BUG_ON(ret); 2519 if (ret)
2520 return ret;
2504 if (max_key == (u64)-1) 2521 if (max_key == (u64)-1)
2505 break; 2522 break;
2506 min_key = max_key + 1; 2523 min_key = max_key + 1;
@@ -2534,8 +2551,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2534 2551
2535 while (1) { 2552 while (1) {
2536 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2553 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2537 2554 BUG_ON(ret == 0);
2538 if (ret != 1) 2555 if (ret < 0)
2539 break; 2556 break;
2540 2557
2541 if (path->slots[0] == 0) 2558 if (path->slots[0] == 0)
@@ -2553,7 +2570,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2553 btrfs_release_path(log, path); 2570 btrfs_release_path(log, path);
2554 } 2571 }
2555 btrfs_release_path(log, path); 2572 btrfs_release_path(log, path);
2556 return 0; 2573 return ret;
2557} 2574}
2558 2575
2559static noinline int copy_items(struct btrfs_trans_handle *trans, 2576static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2586,7 +2603,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2586 } 2603 }
2587 ret = btrfs_insert_empty_items(trans, log, dst_path, 2604 ret = btrfs_insert_empty_items(trans, log, dst_path,
2588 ins_keys, ins_sizes, nr); 2605 ins_keys, ins_sizes, nr);
2589 BUG_ON(ret); 2606 if (ret) {
2607 kfree(ins_data);
2608 return ret;
2609 }
2590 2610
2591 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2611 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2592 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2612 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2659,16 +2679,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2659 * we have to do this after the loop above to avoid changing the 2679 * we have to do this after the loop above to avoid changing the
2660 * log tree while trying to change the log tree. 2680 * log tree while trying to change the log tree.
2661 */ 2681 */
2682 ret = 0;
2662 while (!list_empty(&ordered_sums)) { 2683 while (!list_empty(&ordered_sums)) {
2663 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2684 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2664 struct btrfs_ordered_sum, 2685 struct btrfs_ordered_sum,
2665 list); 2686 list);
2666 ret = btrfs_csum_file_blocks(trans, log, sums); 2687 if (!ret)
2667 BUG_ON(ret); 2688 ret = btrfs_csum_file_blocks(trans, log, sums);
2668 list_del(&sums->list); 2689 list_del(&sums->list);
2669 kfree(sums); 2690 kfree(sums);
2670 } 2691 }
2671 return 0; 2692 return ret;
2672} 2693}
2673 2694
2674/* log a single inode in the tree log. 2695/* log a single inode in the tree log.
@@ -2695,7 +2716,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2695 struct btrfs_key max_key; 2716 struct btrfs_key max_key;
2696 struct btrfs_root *log = root->log_root; 2717 struct btrfs_root *log = root->log_root;
2697 struct extent_buffer *src = NULL; 2718 struct extent_buffer *src = NULL;
2698 u32 size; 2719 int err = 0;
2699 int ret; 2720 int ret;
2700 int nritems; 2721 int nritems;
2701 int ins_start_slot = 0; 2722 int ins_start_slot = 0;
@@ -2738,7 +2759,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2738 } else { 2759 } else {
2739 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2760 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2740 } 2761 }
2741 BUG_ON(ret); 2762 if (ret) {
2763 err = ret;
2764 goto out_unlock;
2765 }
2742 path->keep_locks = 1; 2766 path->keep_locks = 1;
2743 2767
2744 while (1) { 2768 while (1) {
@@ -2755,7 +2779,6 @@ again:
2755 break; 2779 break;
2756 2780
2757 src = path->nodes[0]; 2781 src = path->nodes[0];
2758 size = btrfs_item_size_nr(src, path->slots[0]);
2759 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2782 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2760 ins_nr++; 2783 ins_nr++;
2761 goto next_slot; 2784 goto next_slot;
@@ -2767,7 +2790,10 @@ again:
2767 2790
2768 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2791 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2769 ins_nr, inode_only); 2792 ins_nr, inode_only);
2770 BUG_ON(ret); 2793 if (ret) {
2794 err = ret;
2795 goto out_unlock;
2796 }
2771 ins_nr = 1; 2797 ins_nr = 1;
2772 ins_start_slot = path->slots[0]; 2798 ins_start_slot = path->slots[0];
2773next_slot: 2799next_slot:
@@ -2783,7 +2809,10 @@ next_slot:
2783 ret = copy_items(trans, log, dst_path, src, 2809 ret = copy_items(trans, log, dst_path, src,
2784 ins_start_slot, 2810 ins_start_slot,
2785 ins_nr, inode_only); 2811 ins_nr, inode_only);
2786 BUG_ON(ret); 2812 if (ret) {
2813 err = ret;
2814 goto out_unlock;
2815 }
2787 ins_nr = 0; 2816 ins_nr = 0;
2788 } 2817 }
2789 btrfs_release_path(root, path); 2818 btrfs_release_path(root, path);
@@ -2801,7 +2830,10 @@ next_slot:
2801 ret = copy_items(trans, log, dst_path, src, 2830 ret = copy_items(trans, log, dst_path, src,
2802 ins_start_slot, 2831 ins_start_slot,
2803 ins_nr, inode_only); 2832 ins_nr, inode_only);
2804 BUG_ON(ret); 2833 if (ret) {
2834 err = ret;
2835 goto out_unlock;
2836 }
2805 ins_nr = 0; 2837 ins_nr = 0;
2806 } 2838 }
2807 WARN_ON(ins_nr); 2839 WARN_ON(ins_nr);
@@ -2809,14 +2841,18 @@ next_slot:
2809 btrfs_release_path(root, path); 2841 btrfs_release_path(root, path);
2810 btrfs_release_path(log, dst_path); 2842 btrfs_release_path(log, dst_path);
2811 ret = log_directory_changes(trans, root, inode, path, dst_path); 2843 ret = log_directory_changes(trans, root, inode, path, dst_path);
2812 BUG_ON(ret); 2844 if (ret) {
2845 err = ret;
2846 goto out_unlock;
2847 }
2813 } 2848 }
2814 BTRFS_I(inode)->logged_trans = trans->transid; 2849 BTRFS_I(inode)->logged_trans = trans->transid;
2850out_unlock:
2815 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2851 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2816 2852
2817 btrfs_free_path(path); 2853 btrfs_free_path(path);
2818 btrfs_free_path(dst_path); 2854 btrfs_free_path(dst_path);
2819 return 0; 2855 return err;
2820} 2856}
2821 2857
2822/* 2858/*
@@ -2833,6 +2869,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2833{ 2869{
2834 int ret = 0; 2870 int ret = 0;
2835 struct btrfs_root *root; 2871 struct btrfs_root *root;
2872 struct dentry *old_parent = NULL;
2836 2873
2837 /* 2874 /*
2838 * for regular files, if its inode is already on disk, we don't 2875 * for regular files, if its inode is already on disk, we don't
@@ -2874,10 +2911,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2874 if (IS_ROOT(parent)) 2911 if (IS_ROOT(parent))
2875 break; 2912 break;
2876 2913
2877 parent = parent->d_parent; 2914 parent = dget_parent(parent);
2915 dput(old_parent);
2916 old_parent = parent;
2878 inode = parent->d_inode; 2917 inode = parent->d_inode;
2879 2918
2880 } 2919 }
2920 dput(old_parent);
2881out: 2921out:
2882 return ret; 2922 return ret;
2883} 2923}
@@ -2909,6 +2949,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2909{ 2949{
2910 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 2950 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2911 struct super_block *sb; 2951 struct super_block *sb;
2952 struct dentry *old_parent = NULL;
2912 int ret = 0; 2953 int ret = 0;
2913 u64 last_committed = root->fs_info->last_trans_committed; 2954 u64 last_committed = root->fs_info->last_trans_committed;
2914 2955
@@ -2941,10 +2982,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2941 goto end_no_trans; 2982 goto end_no_trans;
2942 } 2983 }
2943 2984
2944 start_log_trans(trans, root); 2985 ret = start_log_trans(trans, root);
2986 if (ret)
2987 goto end_trans;
2945 2988
2946 ret = btrfs_log_inode(trans, root, inode, inode_only); 2989 ret = btrfs_log_inode(trans, root, inode, inode_only);
2947 BUG_ON(ret); 2990 if (ret)
2991 goto end_trans;
2948 2992
2949 /* 2993 /*
2950 * for regular files, if its inode is already on disk, we don't 2994 * for regular files, if its inode is already on disk, we don't
@@ -2954,8 +2998,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2954 */ 2998 */
2955 if (S_ISREG(inode->i_mode) && 2999 if (S_ISREG(inode->i_mode) &&
2956 BTRFS_I(inode)->generation <= last_committed && 3000 BTRFS_I(inode)->generation <= last_committed &&
2957 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3001 BTRFS_I(inode)->last_unlink_trans <= last_committed) {
2958 goto no_parent; 3002 ret = 0;
3003 goto end_trans;
3004 }
2959 3005
2960 inode_only = LOG_INODE_EXISTS; 3006 inode_only = LOG_INODE_EXISTS;
2961 while (1) { 3007 while (1) {
@@ -2969,15 +3015,24 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2969 if (BTRFS_I(inode)->generation > 3015 if (BTRFS_I(inode)->generation >
2970 root->fs_info->last_trans_committed) { 3016 root->fs_info->last_trans_committed) {
2971 ret = btrfs_log_inode(trans, root, inode, inode_only); 3017 ret = btrfs_log_inode(trans, root, inode, inode_only);
2972 BUG_ON(ret); 3018 if (ret)
3019 goto end_trans;
2973 } 3020 }
2974 if (IS_ROOT(parent)) 3021 if (IS_ROOT(parent))
2975 break; 3022 break;
2976 3023
2977 parent = parent->d_parent; 3024 parent = dget_parent(parent);
3025 dput(old_parent);
3026 old_parent = parent;
2978 } 3027 }
2979no_parent:
2980 ret = 0; 3028 ret = 0;
3029end_trans:
3030 dput(old_parent);
3031 if (ret < 0) {
3032 BUG_ON(ret != -ENOSPC);
3033 root->fs_info->last_trans_log_full_commit = trans->transid;
3034 ret = 1;
3035 }
2981 btrfs_end_log_trans(root); 3036 btrfs_end_log_trans(root);
2982end_no_trans: 3037end_no_trans:
2983 return ret; 3038 return ret;
@@ -2992,8 +3047,13 @@ end_no_trans:
2992int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 3047int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2993 struct btrfs_root *root, struct dentry *dentry) 3048 struct btrfs_root *root, struct dentry *dentry)
2994{ 3049{
2995 return btrfs_log_inode_parent(trans, root, dentry->d_inode, 3050 struct dentry *parent = dget_parent(dentry);
2996 dentry->d_parent, 0); 3051 int ret;
3052
3053 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3054 dput(parent);
3055
3056 return ret;
2997} 3057}
2998 3058
2999/* 3059/*
@@ -3019,7 +3079,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3019 path = btrfs_alloc_path(); 3079 path = btrfs_alloc_path();
3020 BUG_ON(!path); 3080 BUG_ON(!path);
3021 3081
3022 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3082 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3023 3083
3024 wc.trans = trans; 3084 wc.trans = trans;
3025 wc.pin = 1; 3085 wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 26 struct btrfs_root *root);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info);
28int btrfs_recover_log_trees(struct btrfs_root *tree_root); 30int btrfs_recover_log_trees(struct btrfs_root *tree_root);
29int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, struct dentry *dentry); 32 struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 41ecbb2347f2..d158530233b7 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -17,10 +17,12 @@
17 */ 17 */
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/bio.h> 19#include <linux/bio.h>
20#include <linux/slab.h>
20#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
21#include <linux/blkdev.h> 22#include <linux/blkdev.h>
22#include <linux/random.h> 23#include <linux/random.h>
23#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h>
24#include <asm/div64.h> 26#include <asm/div64.h>
25#include "compat.h" 27#include "compat.h"
26#include "ctree.h" 28#include "ctree.h"
@@ -256,13 +258,13 @@ loop_lock:
256 wake_up(&fs_info->async_submit_wait); 258 wake_up(&fs_info->async_submit_wait);
257 259
258 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 260 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
259 submit_bio(cur->bi_rw, cur);
260 num_run++;
261 batch_run++;
262 261
263 if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) 262 if (cur->bi_rw & REQ_SYNC)
264 num_sync_run++; 263 num_sync_run++;
265 264
265 submit_bio(cur->bi_rw, cur);
266 num_run++;
267 batch_run++;
266 if (need_resched()) { 268 if (need_resched()) {
267 if (num_sync_run) { 269 if (num_sync_run) {
268 blk_run_backing_dev(bdi, NULL); 270 blk_run_backing_dev(bdi, NULL);
@@ -325,16 +327,6 @@ loop_lock:
325 num_sync_run = 0; 327 num_sync_run = 0;
326 blk_run_backing_dev(bdi, NULL); 328 blk_run_backing_dev(bdi, NULL);
327 } 329 }
328
329 cond_resched();
330 if (again)
331 goto loop;
332
333 spin_lock(&device->io_lock);
334 if (device->pending_bios.head || device->pending_sync_bios.head)
335 goto loop_lock;
336 spin_unlock(&device->io_lock);
337
338 /* 330 /*
339 * IO has already been through a long path to get here. Checksumming, 331 * IO has already been through a long path to get here. Checksumming,
340 * async helper threads, perhaps compression. We've done a pretty 332 * async helper threads, perhaps compression. We've done a pretty
@@ -346,6 +338,16 @@ loop_lock:
346 * cared about found its way down here. 338 * cared about found its way down here.
347 */ 339 */
348 blk_run_backing_dev(bdi, NULL); 340 blk_run_backing_dev(bdi, NULL);
341
342 cond_resched();
343 if (again)
344 goto loop;
345
346 spin_lock(&device->io_lock);
347 if (device->pending_bios.head || device->pending_sync_bios.head)
348 goto loop_lock;
349 spin_unlock(&device->io_lock);
350
349done: 351done:
350 return 0; 352 return 0;
351} 353}
@@ -365,6 +367,7 @@ static noinline int device_list_add(const char *path,
365 struct btrfs_device *device; 367 struct btrfs_device *device;
366 struct btrfs_fs_devices *fs_devices; 368 struct btrfs_fs_devices *fs_devices;
367 u64 found_transid = btrfs_super_generation(disk_super); 369 u64 found_transid = btrfs_super_generation(disk_super);
370 char *name;
368 371
369 fs_devices = find_fsid(disk_super->fsid); 372 fs_devices = find_fsid(disk_super->fsid);
370 if (!fs_devices) { 373 if (!fs_devices) {
@@ -396,7 +399,6 @@ static noinline int device_list_add(const char *path,
396 device->work.func = pending_bios_fn; 399 device->work.func = pending_bios_fn;
397 memcpy(device->uuid, disk_super->dev_item.uuid, 400 memcpy(device->uuid, disk_super->dev_item.uuid,
398 BTRFS_UUID_SIZE); 401 BTRFS_UUID_SIZE);
399 device->barriers = 1;
400 spin_lock_init(&device->io_lock); 402 spin_lock_init(&device->io_lock);
401 device->name = kstrdup(path, GFP_NOFS); 403 device->name = kstrdup(path, GFP_NOFS);
402 if (!device->name) { 404 if (!device->name) {
@@ -411,6 +413,16 @@ static noinline int device_list_add(const char *path,
411 413
412 device->fs_devices = fs_devices; 414 device->fs_devices = fs_devices;
413 fs_devices->num_devices++; 415 fs_devices->num_devices++;
416 } else if (!device->name || strcmp(device->name, path)) {
417 name = kstrdup(path, GFP_NOFS);
418 if (!name)
419 return -ENOMEM;
420 kfree(device->name);
421 device->name = name;
422 if (device->missing) {
423 fs_devices->missing_devices--;
424 device->missing = 0;
425 }
414 } 426 }
415 427
416 if (found_transid > fs_devices->latest_trans) { 428 if (found_transid > fs_devices->latest_trans) {
@@ -454,7 +466,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
454 device->devid = orig_dev->devid; 466 device->devid = orig_dev->devid;
455 device->work.func = pending_bios_fn; 467 device->work.func = pending_bios_fn;
456 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 468 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
457 device->barriers = 1;
458 spin_lock_init(&device->io_lock); 469 spin_lock_init(&device->io_lock);
459 INIT_LIST_HEAD(&device->dev_list); 470 INIT_LIST_HEAD(&device->dev_list);
460 INIT_LIST_HEAD(&device->dev_alloc_list); 471 INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -483,7 +494,7 @@ again:
483 continue; 494 continue;
484 495
485 if (device->bdev) { 496 if (device->bdev) {
486 close_bdev_exclusive(device->bdev, device->mode); 497 blkdev_put(device->bdev, device->mode);
487 device->bdev = NULL; 498 device->bdev = NULL;
488 fs_devices->open_devices--; 499 fs_devices->open_devices--;
489 } 500 }
@@ -517,7 +528,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
517 528
518 list_for_each_entry(device, &fs_devices->devices, dev_list) { 529 list_for_each_entry(device, &fs_devices->devices, dev_list) {
519 if (device->bdev) { 530 if (device->bdev) {
520 close_bdev_exclusive(device->bdev, device->mode); 531 blkdev_put(device->bdev, device->mode);
521 fs_devices->open_devices--; 532 fs_devices->open_devices--;
522 } 533 }
523 if (device->writeable) { 534 if (device->writeable) {
@@ -574,13 +585,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
574 int seeding = 1; 585 int seeding = 1;
575 int ret = 0; 586 int ret = 0;
576 587
588 flags |= FMODE_EXCL;
589
577 list_for_each_entry(device, head, dev_list) { 590 list_for_each_entry(device, head, dev_list) {
578 if (device->bdev) 591 if (device->bdev)
579 continue; 592 continue;
580 if (!device->name) 593 if (!device->name)
581 continue; 594 continue;
582 595
583 bdev = open_bdev_exclusive(device->name, flags, holder); 596 bdev = blkdev_get_by_path(device->name, flags, holder);
584 if (IS_ERR(bdev)) { 597 if (IS_ERR(bdev)) {
585 printk(KERN_INFO "open %s failed\n", device->name); 598 printk(KERN_INFO "open %s failed\n", device->name);
586 goto error; 599 goto error;
@@ -588,11 +601,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
588 set_blocksize(bdev, 4096); 601 set_blocksize(bdev, 4096);
589 602
590 bh = btrfs_read_dev_super(bdev); 603 bh = btrfs_read_dev_super(bdev);
591 if (!bh) 604 if (!bh) {
605 ret = -EINVAL;
592 goto error_close; 606 goto error_close;
607 }
593 608
594 disk_super = (struct btrfs_super_block *)bh->b_data; 609 disk_super = (struct btrfs_super_block *)bh->b_data;
595 devid = le64_to_cpu(disk_super->dev_item.devid); 610 devid = btrfs_stack_device_id(&disk_super->dev_item);
596 if (devid != device->devid) 611 if (devid != device->devid)
597 goto error_brelse; 612 goto error_brelse;
598 613
@@ -632,7 +647,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
632error_brelse: 647error_brelse:
633 brelse(bh); 648 brelse(bh);
634error_close: 649error_close:
635 close_bdev_exclusive(bdev, FMODE_READ); 650 blkdev_put(bdev, flags);
636error: 651error:
637 continue; 652 continue;
638 } 653 }
@@ -678,7 +693,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
678 693
679 mutex_lock(&uuid_mutex); 694 mutex_lock(&uuid_mutex);
680 695
681 bdev = open_bdev_exclusive(path, flags, holder); 696 flags |= FMODE_EXCL;
697 bdev = blkdev_get_by_path(path, flags, holder);
682 698
683 if (IS_ERR(bdev)) { 699 if (IS_ERR(bdev)) {
684 ret = PTR_ERR(bdev); 700 ret = PTR_ERR(bdev);
@@ -690,11 +706,11 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
690 goto error_close; 706 goto error_close;
691 bh = btrfs_read_dev_super(bdev); 707 bh = btrfs_read_dev_super(bdev);
692 if (!bh) { 708 if (!bh) {
693 ret = -EIO; 709 ret = -EINVAL;
694 goto error_close; 710 goto error_close;
695 } 711 }
696 disk_super = (struct btrfs_super_block *)bh->b_data; 712 disk_super = (struct btrfs_super_block *)bh->b_data;
697 devid = le64_to_cpu(disk_super->dev_item.devid); 713 devid = btrfs_stack_device_id(&disk_super->dev_item);
698 transid = btrfs_super_generation(disk_super); 714 transid = btrfs_super_generation(disk_super);
699 if (disk_super->label[0]) 715 if (disk_super->label[0])
700 printk(KERN_INFO "device label %s ", disk_super->label); 716 printk(KERN_INFO "device label %s ", disk_super->label);
@@ -710,65 +726,173 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
710 726
711 brelse(bh); 727 brelse(bh);
712error_close: 728error_close:
713 close_bdev_exclusive(bdev, flags); 729 blkdev_put(bdev, flags);
714error: 730error:
715 mutex_unlock(&uuid_mutex); 731 mutex_unlock(&uuid_mutex);
716 return ret; 732 return ret;
717} 733}
718 734
735/* helper to account the used device space in the range */
736int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
737 u64 end, u64 *length)
738{
739 struct btrfs_key key;
740 struct btrfs_root *root = device->dev_root;
741 struct btrfs_dev_extent *dev_extent;
742 struct btrfs_path *path;
743 u64 extent_end;
744 int ret;
745 int slot;
746 struct extent_buffer *l;
747
748 *length = 0;
749
750 if (start >= device->total_bytes)
751 return 0;
752
753 path = btrfs_alloc_path();
754 if (!path)
755 return -ENOMEM;
756 path->reada = 2;
757
758 key.objectid = device->devid;
759 key.offset = start;
760 key.type = BTRFS_DEV_EXTENT_KEY;
761
762 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
763 if (ret < 0)
764 goto out;
765 if (ret > 0) {
766 ret = btrfs_previous_item(root, path, key.objectid, key.type);
767 if (ret < 0)
768 goto out;
769 }
770
771 while (1) {
772 l = path->nodes[0];
773 slot = path->slots[0];
774 if (slot >= btrfs_header_nritems(l)) {
775 ret = btrfs_next_leaf(root, path);
776 if (ret == 0)
777 continue;
778 if (ret < 0)
779 goto out;
780
781 break;
782 }
783 btrfs_item_key_to_cpu(l, &key, slot);
784
785 if (key.objectid < device->devid)
786 goto next;
787
788 if (key.objectid > device->devid)
789 break;
790
791 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
792 goto next;
793
794 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
795 extent_end = key.offset + btrfs_dev_extent_length(l,
796 dev_extent);
797 if (key.offset <= start && extent_end > end) {
798 *length = end - start + 1;
799 break;
800 } else if (key.offset <= start && extent_end > start)
801 *length += extent_end - start;
802 else if (key.offset > start && extent_end <= end)
803 *length += extent_end - key.offset;
804 else if (key.offset > start && key.offset <= end) {
805 *length += end - key.offset + 1;
806 break;
807 } else if (key.offset > end)
808 break;
809
810next:
811 path->slots[0]++;
812 }
813 ret = 0;
814out:
815 btrfs_free_path(path);
816 return ret;
817}
818
719/* 819/*
820 * find_free_dev_extent - find free space in the specified device
821 * @trans: transaction handler
822 * @device: the device which we search the free space in
823 * @num_bytes: the size of the free space that we need
824 * @start: store the start of the free space.
825 * @len: the size of the free space. that we find, or the size of the max
826 * free space if we don't find suitable free space
827 *
720 * this uses a pretty simple search, the expectation is that it is 828 * this uses a pretty simple search, the expectation is that it is
721 * called very infrequently and that a given device has a small number 829 * called very infrequently and that a given device has a small number
722 * of extents 830 * of extents
831 *
832 * @start is used to store the start of the free space if we find. But if we
833 * don't find suitable free space, it will be used to store the start position
834 * of the max free space.
835 *
836 * @len is used to store the size of the free space that we find.
837 * But if we don't find suitable free space, it is used to store the size of
838 * the max free space.
723 */ 839 */
724int find_free_dev_extent(struct btrfs_trans_handle *trans, 840int find_free_dev_extent(struct btrfs_trans_handle *trans,
725 struct btrfs_device *device, u64 num_bytes, 841 struct btrfs_device *device, u64 num_bytes,
726 u64 *start, u64 *max_avail) 842 u64 *start, u64 *len)
727{ 843{
728 struct btrfs_key key; 844 struct btrfs_key key;
729 struct btrfs_root *root = device->dev_root; 845 struct btrfs_root *root = device->dev_root;
730 struct btrfs_dev_extent *dev_extent = NULL; 846 struct btrfs_dev_extent *dev_extent;
731 struct btrfs_path *path; 847 struct btrfs_path *path;
732 u64 hole_size = 0; 848 u64 hole_size;
733 u64 last_byte = 0; 849 u64 max_hole_start;
734 u64 search_start = 0; 850 u64 max_hole_size;
851 u64 extent_end;
852 u64 search_start;
735 u64 search_end = device->total_bytes; 853 u64 search_end = device->total_bytes;
736 int ret; 854 int ret;
737 int slot = 0; 855 int slot;
738 int start_found;
739 struct extent_buffer *l; 856 struct extent_buffer *l;
740 857
741 path = btrfs_alloc_path();
742 if (!path)
743 return -ENOMEM;
744 path->reada = 2;
745 start_found = 0;
746
747 /* FIXME use last free of some kind */ 858 /* FIXME use last free of some kind */
748 859
749 /* we don't want to overwrite the superblock on the drive, 860 /* we don't want to overwrite the superblock on the drive,
750 * so we make sure to start at an offset of at least 1MB 861 * so we make sure to start at an offset of at least 1MB
751 */ 862 */
752 search_start = max((u64)1024 * 1024, search_start); 863 search_start = 1024 * 1024;
753 864
754 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 865 if (root->fs_info->alloc_start + num_bytes <= search_end)
755 search_start = max(root->fs_info->alloc_start, search_start); 866 search_start = max(root->fs_info->alloc_start, search_start);
756 867
868 max_hole_start = search_start;
869 max_hole_size = 0;
870
871 if (search_start >= search_end) {
872 ret = -ENOSPC;
873 goto error;
874 }
875
876 path = btrfs_alloc_path();
877 if (!path) {
878 ret = -ENOMEM;
879 goto error;
880 }
881 path->reada = 2;
882
757 key.objectid = device->devid; 883 key.objectid = device->devid;
758 key.offset = search_start; 884 key.offset = search_start;
759 key.type = BTRFS_DEV_EXTENT_KEY; 885 key.type = BTRFS_DEV_EXTENT_KEY;
886
760 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 887 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
761 if (ret < 0) 888 if (ret < 0)
762 goto error; 889 goto out;
763 if (ret > 0) { 890 if (ret > 0) {
764 ret = btrfs_previous_item(root, path, key.objectid, key.type); 891 ret = btrfs_previous_item(root, path, key.objectid, key.type);
765 if (ret < 0) 892 if (ret < 0)
766 goto error; 893 goto out;
767 if (ret > 0)
768 start_found = 1;
769 } 894 }
770 l = path->nodes[0]; 895
771 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
772 while (1) { 896 while (1) {
773 l = path->nodes[0]; 897 l = path->nodes[0];
774 slot = path->slots[0]; 898 slot = path->slots[0];
@@ -777,24 +901,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
777 if (ret == 0) 901 if (ret == 0)
778 continue; 902 continue;
779 if (ret < 0) 903 if (ret < 0)
780 goto error; 904 goto out;
781no_more_items: 905
782 if (!start_found) { 906 break;
783 if (search_start >= search_end) {
784 ret = -ENOSPC;
785 goto error;
786 }
787 *start = search_start;
788 start_found = 1;
789 goto check_pending;
790 }
791 *start = last_byte > search_start ?
792 last_byte : search_start;
793 if (search_end <= *start) {
794 ret = -ENOSPC;
795 goto error;
796 }
797 goto check_pending;
798 } 907 }
799 btrfs_item_key_to_cpu(l, &key, slot); 908 btrfs_item_key_to_cpu(l, &key, slot);
800 909
@@ -802,48 +911,62 @@ no_more_items:
802 goto next; 911 goto next;
803 912
804 if (key.objectid > device->devid) 913 if (key.objectid > device->devid)
805 goto no_more_items; 914 break;
806 915
807 if (key.offset >= search_start && key.offset > last_byte && 916 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
808 start_found) { 917 goto next;
809 if (last_byte < search_start)
810 last_byte = search_start;
811 hole_size = key.offset - last_byte;
812 918
813 if (hole_size > *max_avail) 919 if (key.offset > search_start) {
814 *max_avail = hole_size; 920 hole_size = key.offset - search_start;
815 921
816 if (key.offset > last_byte && 922 if (hole_size > max_hole_size) {
817 hole_size >= num_bytes) { 923 max_hole_start = search_start;
818 *start = last_byte; 924 max_hole_size = hole_size;
819 goto check_pending; 925 }
926
927 /*
928 * If this free space is greater than which we need,
929 * it must be the max free space that we have found
930 * until now, so max_hole_start must point to the start
931 * of this free space and the length of this free space
932 * is stored in max_hole_size. Thus, we return
933 * max_hole_start and max_hole_size and go back to the
934 * caller.
935 */
936 if (hole_size >= num_bytes) {
937 ret = 0;
938 goto out;
820 } 939 }
821 } 940 }
822 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
823 goto next;
824 941
825 start_found = 1;
826 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 942 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
827 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 943 extent_end = key.offset + btrfs_dev_extent_length(l,
944 dev_extent);
945 if (extent_end > search_start)
946 search_start = extent_end;
828next: 947next:
829 path->slots[0]++; 948 path->slots[0]++;
830 cond_resched(); 949 cond_resched();
831 } 950 }
832check_pending:
833 /* we have to make sure we didn't find an extent that has already
834 * been allocated by the map tree or the original allocation
835 */
836 BUG_ON(*start < search_start);
837 951
838 if (*start + num_bytes > search_end) { 952 hole_size = search_end- search_start;
839 ret = -ENOSPC; 953 if (hole_size > max_hole_size) {
840 goto error; 954 max_hole_start = search_start;
955 max_hole_size = hole_size;
841 } 956 }
842 /* check for pending inserts here */
843 ret = 0;
844 957
845error: 958 /* See above. */
959 if (hole_size < num_bytes)
960 ret = -ENOSPC;
961 else
962 ret = 0;
963
964out:
846 btrfs_free_path(path); 965 btrfs_free_path(path);
966error:
967 *start = max_hole_start;
968 if (len)
969 *len = max_hole_size;
847 return ret; 970 return ret;
848} 971}
849 972
@@ -1089,7 +1212,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1089 if (!path) 1212 if (!path)
1090 return -ENOMEM; 1213 return -ENOMEM;
1091 1214
1092 trans = btrfs_start_transaction(root, 1); 1215 trans = btrfs_start_transaction(root, 0);
1093 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1216 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1094 key.type = BTRFS_DEV_ITEM_KEY; 1217 key.type = BTRFS_DEV_ITEM_KEY;
1095 key.offset = device->devid; 1218 key.offset = device->devid;
@@ -1173,8 +1296,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1173 goto out; 1296 goto out;
1174 } 1297 }
1175 } else { 1298 } else {
1176 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1299 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1177 root->fs_info->bdev_holder); 1300 root->fs_info->bdev_holder);
1178 if (IS_ERR(bdev)) { 1301 if (IS_ERR(bdev)) {
1179 ret = PTR_ERR(bdev); 1302 ret = PTR_ERR(bdev);
1180 goto out; 1303 goto out;
@@ -1183,11 +1306,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1183 set_blocksize(bdev, 4096); 1306 set_blocksize(bdev, 4096);
1184 bh = btrfs_read_dev_super(bdev); 1307 bh = btrfs_read_dev_super(bdev);
1185 if (!bh) { 1308 if (!bh) {
1186 ret = -EIO; 1309 ret = -EINVAL;
1187 goto error_close; 1310 goto error_close;
1188 } 1311 }
1189 disk_super = (struct btrfs_super_block *)bh->b_data; 1312 disk_super = (struct btrfs_super_block *)bh->b_data;
1190 devid = le64_to_cpu(disk_super->dev_item.devid); 1313 devid = btrfs_stack_device_id(&disk_super->dev_item);
1191 dev_uuid = disk_super->dev_item.uuid; 1314 dev_uuid = disk_super->dev_item.uuid;
1192 device = btrfs_find_device(root, devid, dev_uuid, 1315 device = btrfs_find_device(root, devid, dev_uuid,
1193 disk_super->fsid); 1316 disk_super->fsid);
@@ -1230,6 +1353,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1230 1353
1231 device->fs_devices->num_devices--; 1354 device->fs_devices->num_devices--;
1232 1355
1356 if (device->missing)
1357 root->fs_info->fs_devices->missing_devices--;
1358
1233 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1359 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1234 struct btrfs_device, dev_list); 1360 struct btrfs_device, dev_list);
1235 if (device->bdev == root->fs_info->sb->s_bdev) 1361 if (device->bdev == root->fs_info->sb->s_bdev)
@@ -1238,7 +1364,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1238 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1364 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1239 1365
1240 if (device->bdev) { 1366 if (device->bdev) {
1241 close_bdev_exclusive(device->bdev, device->mode); 1367 blkdev_put(device->bdev, device->mode);
1242 device->bdev = NULL; 1368 device->bdev = NULL;
1243 device->fs_devices->open_devices--; 1369 device->fs_devices->open_devices--;
1244 } 1370 }
@@ -1281,7 +1407,7 @@ error_brelse:
1281 brelse(bh); 1407 brelse(bh);
1282error_close: 1408error_close:
1283 if (bdev) 1409 if (bdev)
1284 close_bdev_exclusive(bdev, FMODE_READ); 1410 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1285out: 1411out:
1286 mutex_unlock(&root->fs_info->volume_mutex); 1412 mutex_unlock(&root->fs_info->volume_mutex);
1287 mutex_unlock(&uuid_mutex); 1413 mutex_unlock(&uuid_mutex);
@@ -1433,7 +1559,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1433 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1559 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1434 return -EINVAL; 1560 return -EINVAL;
1435 1561
1436 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1562 bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1563 root->fs_info->bdev_holder);
1437 if (IS_ERR(bdev)) 1564 if (IS_ERR(bdev))
1438 return PTR_ERR(bdev); 1565 return PTR_ERR(bdev);
1439 1566
@@ -1478,10 +1605,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1478 goto error; 1605 goto error;
1479 } 1606 }
1480 1607
1481 trans = btrfs_start_transaction(root, 1); 1608 trans = btrfs_start_transaction(root, 0);
1482 lock_chunks(root); 1609 lock_chunks(root);
1483 1610
1484 device->barriers = 1;
1485 device->writeable = 1; 1611 device->writeable = 1;
1486 device->work.func = pending_bios_fn; 1612 device->work.func = pending_bios_fn;
1487 generate_random_uuid(device->uuid); 1613 generate_random_uuid(device->uuid);
@@ -1560,7 +1686,7 @@ out:
1560 mutex_unlock(&root->fs_info->volume_mutex); 1686 mutex_unlock(&root->fs_info->volume_mutex);
1561 return ret; 1687 return ret;
1562error: 1688error:
1563 close_bdev_exclusive(bdev, 0); 1689 blkdev_put(bdev, FMODE_EXCL);
1564 if (seeding_dev) { 1690 if (seeding_dev) {
1565 mutex_unlock(&uuid_mutex); 1691 mutex_unlock(&uuid_mutex);
1566 up_write(&sb->s_umount); 1692 up_write(&sb->s_umount);
@@ -1743,9 +1869,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1743 1869
1744 /* step one, relocate all the extents inside this chunk */ 1870 /* step one, relocate all the extents inside this chunk */
1745 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1871 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1746 BUG_ON(ret); 1872 if (ret)
1873 return ret;
1747 1874
1748 trans = btrfs_start_transaction(root, 1); 1875 trans = btrfs_start_transaction(root, 0);
1749 BUG_ON(!trans); 1876 BUG_ON(!trans);
1750 1877
1751 lock_chunks(root); 1878 lock_chunks(root);
@@ -1892,7 +2019,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
1892 u64 size_to_free; 2019 u64 size_to_free;
1893 struct btrfs_path *path; 2020 struct btrfs_path *path;
1894 struct btrfs_key key; 2021 struct btrfs_key key;
1895 struct btrfs_chunk *chunk;
1896 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 2022 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1897 struct btrfs_trans_handle *trans; 2023 struct btrfs_trans_handle *trans;
1898 struct btrfs_key found_key; 2024 struct btrfs_key found_key;
@@ -1900,6 +2026,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
1900 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2026 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1901 return -EROFS; 2027 return -EROFS;
1902 2028
2029 if (!capable(CAP_SYS_ADMIN))
2030 return -EPERM;
2031
1903 mutex_lock(&dev_root->fs_info->volume_mutex); 2032 mutex_lock(&dev_root->fs_info->volume_mutex);
1904 dev_root = dev_root->fs_info->dev_root; 2033 dev_root = dev_root->fs_info->dev_root;
1905 2034
@@ -1917,7 +2046,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1917 break; 2046 break;
1918 BUG_ON(ret); 2047 BUG_ON(ret);
1919 2048
1920 trans = btrfs_start_transaction(dev_root, 1); 2049 trans = btrfs_start_transaction(dev_root, 0);
1921 BUG_ON(!trans); 2050 BUG_ON(!trans);
1922 2051
1923 ret = btrfs_grow_device(trans, device, old_size); 2052 ret = btrfs_grow_device(trans, device, old_size);
@@ -1956,9 +2085,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
1956 if (found_key.objectid != key.objectid) 2085 if (found_key.objectid != key.objectid)
1957 break; 2086 break;
1958 2087
1959 chunk = btrfs_item_ptr(path->nodes[0],
1960 path->slots[0],
1961 struct btrfs_chunk);
1962 /* chunk zero is special */ 2088 /* chunk zero is special */
1963 if (found_key.offset == 0) 2089 if (found_key.offset == 0)
1964 break; 2090 break;
@@ -2086,11 +2212,7 @@ again:
2086 } 2212 }
2087 2213
2088 /* Shrinking succeeded, else we would be at "done". */ 2214 /* Shrinking succeeded, else we would be at "done". */
2089 trans = btrfs_start_transaction(root, 1); 2215 trans = btrfs_start_transaction(root, 0);
2090 if (!trans) {
2091 ret = -ENOMEM;
2092 goto done;
2093 }
2094 lock_chunks(root); 2216 lock_chunks(root);
2095 2217
2096 device->disk_total_bytes = new_size; 2218 device->disk_total_bytes = new_size;
@@ -2145,66 +2267,67 @@ static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size,
2145 return calc_size * num_stripes; 2267 return calc_size * num_stripes;
2146} 2268}
2147 2269
2148static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2270/* Used to sort the devices by max_avail(descending sort) */
2149 struct btrfs_root *extent_root, 2271int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2)
2150 struct map_lookup **map_ret,
2151 u64 *num_bytes, u64 *stripe_size,
2152 u64 start, u64 type)
2153{ 2272{
2154 struct btrfs_fs_info *info = extent_root->fs_info; 2273 if (((struct btrfs_device_info *)dev_info1)->max_avail >
2155 struct btrfs_device *device = NULL; 2274 ((struct btrfs_device_info *)dev_info2)->max_avail)
2156 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2275 return -1;
2157 struct list_head *cur; 2276 else if (((struct btrfs_device_info *)dev_info1)->max_avail <
2158 struct map_lookup *map = NULL; 2277 ((struct btrfs_device_info *)dev_info2)->max_avail)
2159 struct extent_map_tree *em_tree; 2278 return 1;
2160 struct extent_map *em; 2279 else
2161 struct list_head private_devs; 2280 return 0;
2162 int min_stripe_size = 1 * 1024 * 1024; 2281}
2163 u64 calc_size = 1024 * 1024 * 1024;
2164 u64 max_chunk_size = calc_size;
2165 u64 min_free;
2166 u64 avail;
2167 u64 max_avail = 0;
2168 u64 dev_offset;
2169 int num_stripes = 1;
2170 int min_stripes = 1;
2171 int sub_stripes = 0;
2172 int looped = 0;
2173 int ret;
2174 int index;
2175 int stripe_len = 64 * 1024;
2176 2282
2177 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2283static int __btrfs_calc_nstripes(struct btrfs_fs_devices *fs_devices, u64 type,
2178 (type & BTRFS_BLOCK_GROUP_DUP)) { 2284 int *num_stripes, int *min_stripes,
2179 WARN_ON(1); 2285 int *sub_stripes)
2180 type &= ~BTRFS_BLOCK_GROUP_DUP; 2286{
2181 } 2287 *num_stripes = 1;
2182 if (list_empty(&fs_devices->alloc_list)) 2288 *min_stripes = 1;
2183 return -ENOSPC; 2289 *sub_stripes = 0;
2184 2290
2185 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2291 if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2186 num_stripes = fs_devices->rw_devices; 2292 *num_stripes = fs_devices->rw_devices;
2187 min_stripes = 2; 2293 *min_stripes = 2;
2188 } 2294 }
2189 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2295 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2190 num_stripes = 2; 2296 *num_stripes = 2;
2191 min_stripes = 2; 2297 *min_stripes = 2;
2192 } 2298 }
2193 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2299 if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2194 num_stripes = min_t(u64, 2, fs_devices->rw_devices); 2300 if (fs_devices->rw_devices < 2)
2195 if (num_stripes < 2)
2196 return -ENOSPC; 2301 return -ENOSPC;
2197 min_stripes = 2; 2302 *num_stripes = 2;
2303 *min_stripes = 2;
2198 } 2304 }
2199 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2305 if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2200 num_stripes = fs_devices->rw_devices; 2306 *num_stripes = fs_devices->rw_devices;
2201 if (num_stripes < 4) 2307 if (*num_stripes < 4)
2202 return -ENOSPC; 2308 return -ENOSPC;
2203 num_stripes &= ~(u32)1; 2309 *num_stripes &= ~(u32)1;
2204 sub_stripes = 2; 2310 *sub_stripes = 2;
2205 min_stripes = 4; 2311 *min_stripes = 4;
2206 } 2312 }
2207 2313
2314 return 0;
2315}
2316
2317static u64 __btrfs_calc_stripe_size(struct btrfs_fs_devices *fs_devices,
2318 u64 proposed_size, u64 type,
2319 int num_stripes, int small_stripe)
2320{
2321 int min_stripe_size = 1 * 1024 * 1024;
2322 u64 calc_size = proposed_size;
2323 u64 max_chunk_size = calc_size;
2324 int ncopies = 1;
2325
2326 if (type & (BTRFS_BLOCK_GROUP_RAID1 |
2327 BTRFS_BLOCK_GROUP_DUP |
2328 BTRFS_BLOCK_GROUP_RAID10))
2329 ncopies = 2;
2330
2208 if (type & BTRFS_BLOCK_GROUP_DATA) { 2331 if (type & BTRFS_BLOCK_GROUP_DATA) {
2209 max_chunk_size = 10 * calc_size; 2332 max_chunk_size = 10 * calc_size;
2210 min_stripe_size = 64 * 1024 * 1024; 2333 min_stripe_size = 64 * 1024 * 1024;
@@ -2221,43 +2344,209 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2221 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2344 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2222 max_chunk_size); 2345 max_chunk_size);
2223 2346
2224again: 2347 if (calc_size * num_stripes > max_chunk_size * ncopies) {
2225 max_avail = 0; 2348 calc_size = max_chunk_size * ncopies;
2226 if (!map || map->num_stripes != num_stripes) { 2349 do_div(calc_size, num_stripes);
2227 kfree(map); 2350 do_div(calc_size, BTRFS_STRIPE_LEN);
2228 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); 2351 calc_size *= BTRFS_STRIPE_LEN;
2229 if (!map) 2352 }
2230 return -ENOMEM; 2353
2354 /* we don't want tiny stripes */
2355 if (!small_stripe)
2356 calc_size = max_t(u64, min_stripe_size, calc_size);
2357
2358 /*
2359 * we're about to do_div by the BTRFS_STRIPE_LEN so lets make sure
2360 * we end up with something bigger than a stripe
2361 */
2362 calc_size = max_t(u64, calc_size, BTRFS_STRIPE_LEN);
2363
2364 do_div(calc_size, BTRFS_STRIPE_LEN);
2365 calc_size *= BTRFS_STRIPE_LEN;
2366
2367 return calc_size;
2368}
2369
2370static struct map_lookup *__shrink_map_lookup_stripes(struct map_lookup *map,
2371 int num_stripes)
2372{
2373 struct map_lookup *new;
2374 size_t len = map_lookup_size(num_stripes);
2375
2376 BUG_ON(map->num_stripes < num_stripes);
2377
2378 if (map->num_stripes == num_stripes)
2379 return map;
2380
2381 new = kmalloc(len, GFP_NOFS);
2382 if (!new) {
2383 /* just change map->num_stripes */
2231 map->num_stripes = num_stripes; 2384 map->num_stripes = num_stripes;
2385 return map;
2232 } 2386 }
2233 2387
2234 if (calc_size * num_stripes > max_chunk_size) { 2388 memcpy(new, map, len);
2235 calc_size = max_chunk_size; 2389 new->num_stripes = num_stripes;
2236 do_div(calc_size, num_stripes); 2390 kfree(map);
2237 do_div(calc_size, stripe_len); 2391 return new;
2238 calc_size *= stripe_len; 2392}
2393
2394/*
2395 * helper to allocate device space from btrfs_device_info, in which we stored
2396 * max free space information of every device. It is used when we can not
2397 * allocate chunks by default size.
2398 *
2399 * By this helper, we can allocate a new chunk as larger as possible.
2400 */
2401static int __btrfs_alloc_tiny_space(struct btrfs_trans_handle *trans,
2402 struct btrfs_fs_devices *fs_devices,
2403 struct btrfs_device_info *devices,
2404 int nr_device, u64 type,
2405 struct map_lookup **map_lookup,
2406 int min_stripes, u64 *stripe_size)
2407{
2408 int i, index, sort_again = 0;
2409 int min_devices = min_stripes;
2410 u64 max_avail, min_free;
2411 struct map_lookup *map = *map_lookup;
2412 int ret;
2413
2414 if (nr_device < min_stripes)
2415 return -ENOSPC;
2416
2417 btrfs_descending_sort_devices(devices, nr_device);
2418
2419 max_avail = devices[0].max_avail;
2420 if (!max_avail)
2421 return -ENOSPC;
2422
2423 for (i = 0; i < nr_device; i++) {
2424 /*
2425 * if dev_offset = 0, it means the free space of this device
2426 * is less than what we need, and we didn't search max avail
2427 * extent on this device, so do it now.
2428 */
2429 if (!devices[i].dev_offset) {
2430 ret = find_free_dev_extent(trans, devices[i].dev,
2431 max_avail,
2432 &devices[i].dev_offset,
2433 &devices[i].max_avail);
2434 if (ret != 0 && ret != -ENOSPC)
2435 return ret;
2436 sort_again = 1;
2437 }
2239 } 2438 }
2240 /* we don't want tiny stripes */
2241 calc_size = max_t(u64, min_stripe_size, calc_size);
2242 2439
2243 do_div(calc_size, stripe_len); 2440 /* we update the max avail free extent of each devices, sort again */
2244 calc_size *= stripe_len; 2441 if (sort_again)
2442 btrfs_descending_sort_devices(devices, nr_device);
2443
2444 if (type & BTRFS_BLOCK_GROUP_DUP)
2445 min_devices = 1;
2446
2447 if (!devices[min_devices - 1].max_avail)
2448 return -ENOSPC;
2449
2450 max_avail = devices[min_devices - 1].max_avail;
2451 if (type & BTRFS_BLOCK_GROUP_DUP)
2452 do_div(max_avail, 2);
2453
2454 max_avail = __btrfs_calc_stripe_size(fs_devices, max_avail, type,
2455 min_stripes, 1);
2456 if (type & BTRFS_BLOCK_GROUP_DUP)
2457 min_free = max_avail * 2;
2458 else
2459 min_free = max_avail;
2460
2461 if (min_free > devices[min_devices - 1].max_avail)
2462 return -ENOSPC;
2463
2464 map = __shrink_map_lookup_stripes(map, min_stripes);
2465 *stripe_size = max_avail;
2466
2467 index = 0;
2468 for (i = 0; i < min_stripes; i++) {
2469 map->stripes[i].dev = devices[index].dev;
2470 map->stripes[i].physical = devices[index].dev_offset;
2471 if (type & BTRFS_BLOCK_GROUP_DUP) {
2472 i++;
2473 map->stripes[i].dev = devices[index].dev;
2474 map->stripes[i].physical = devices[index].dev_offset +
2475 max_avail;
2476 }
2477 index++;
2478 }
2479 *map_lookup = map;
2480
2481 return 0;
2482}
2483
2484static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2485 struct btrfs_root *extent_root,
2486 struct map_lookup **map_ret,
2487 u64 *num_bytes, u64 *stripe_size,
2488 u64 start, u64 type)
2489{
2490 struct btrfs_fs_info *info = extent_root->fs_info;
2491 struct btrfs_device *device = NULL;
2492 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2493 struct list_head *cur;
2494 struct map_lookup *map;
2495 struct extent_map_tree *em_tree;
2496 struct extent_map *em;
2497 struct btrfs_device_info *devices_info;
2498 struct list_head private_devs;
2499 u64 calc_size = 1024 * 1024 * 1024;
2500 u64 min_free;
2501 u64 avail;
2502 u64 dev_offset;
2503 int num_stripes;
2504 int min_stripes;
2505 int sub_stripes;
2506 int min_devices; /* the min number of devices we need */
2507 int i;
2508 int ret;
2509 int index;
2510
2511 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2512 (type & BTRFS_BLOCK_GROUP_DUP)) {
2513 WARN_ON(1);
2514 type &= ~BTRFS_BLOCK_GROUP_DUP;
2515 }
2516 if (list_empty(&fs_devices->alloc_list))
2517 return -ENOSPC;
2518
2519 ret = __btrfs_calc_nstripes(fs_devices, type, &num_stripes,
2520 &min_stripes, &sub_stripes);
2521 if (ret)
2522 return ret;
2523
2524 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2525 GFP_NOFS);
2526 if (!devices_info)
2527 return -ENOMEM;
2528
2529 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2530 if (!map) {
2531 ret = -ENOMEM;
2532 goto error;
2533 }
2534 map->num_stripes = num_stripes;
2245 2535
2246 cur = fs_devices->alloc_list.next; 2536 cur = fs_devices->alloc_list.next;
2247 index = 0; 2537 index = 0;
2538 i = 0;
2248 2539
2249 if (type & BTRFS_BLOCK_GROUP_DUP) 2540 calc_size = __btrfs_calc_stripe_size(fs_devices, calc_size, type,
2541 num_stripes, 0);
2542
2543 if (type & BTRFS_BLOCK_GROUP_DUP) {
2250 min_free = calc_size * 2; 2544 min_free = calc_size * 2;
2251 else 2545 min_devices = 1;
2546 } else {
2252 min_free = calc_size; 2547 min_free = calc_size;
2253 2548 min_devices = min_stripes;
2254 /* 2549 }
2255 * we add 1MB because we never use the first 1MB of the device, unless
2256 * we've looped, then we are likely allocating the maximum amount of
2257 * space left already
2258 */
2259 if (!looped)
2260 min_free += 1024 * 1024;
2261 2550
2262 INIT_LIST_HEAD(&private_devs); 2551 INIT_LIST_HEAD(&private_devs);
2263 while (index < num_stripes) { 2552 while (index < num_stripes) {
@@ -2270,27 +2559,39 @@ again:
2270 cur = cur->next; 2559 cur = cur->next;
2271 2560
2272 if (device->in_fs_metadata && avail >= min_free) { 2561 if (device->in_fs_metadata && avail >= min_free) {
2273 ret = find_free_dev_extent(trans, device, 2562 ret = find_free_dev_extent(trans, device, min_free,
2274 min_free, &dev_offset, 2563 &devices_info[i].dev_offset,
2275 &max_avail); 2564 &devices_info[i].max_avail);
2276 if (ret == 0) { 2565 if (ret == 0) {
2277 list_move_tail(&device->dev_alloc_list, 2566 list_move_tail(&device->dev_alloc_list,
2278 &private_devs); 2567 &private_devs);
2279 map->stripes[index].dev = device; 2568 map->stripes[index].dev = device;
2280 map->stripes[index].physical = dev_offset; 2569 map->stripes[index].physical =
2570 devices_info[i].dev_offset;
2281 index++; 2571 index++;
2282 if (type & BTRFS_BLOCK_GROUP_DUP) { 2572 if (type & BTRFS_BLOCK_GROUP_DUP) {
2283 map->stripes[index].dev = device; 2573 map->stripes[index].dev = device;
2284 map->stripes[index].physical = 2574 map->stripes[index].physical =
2285 dev_offset + calc_size; 2575 devices_info[i].dev_offset +
2576 calc_size;
2286 index++; 2577 index++;
2287 } 2578 }
2288 } 2579 } else if (ret != -ENOSPC)
2289 } else if (device->in_fs_metadata && avail > max_avail) 2580 goto error;
2290 max_avail = avail; 2581
2582 devices_info[i].dev = device;
2583 i++;
2584 } else if (device->in_fs_metadata &&
2585 avail >= BTRFS_STRIPE_LEN) {
2586 devices_info[i].dev = device;
2587 devices_info[i].max_avail = avail;
2588 i++;
2589 }
2590
2291 if (cur == &fs_devices->alloc_list) 2591 if (cur == &fs_devices->alloc_list)
2292 break; 2592 break;
2293 } 2593 }
2594
2294 list_splice(&private_devs, &fs_devices->alloc_list); 2595 list_splice(&private_devs, &fs_devices->alloc_list);
2295 if (index < num_stripes) { 2596 if (index < num_stripes) {
2296 if (index >= min_stripes) { 2597 if (index >= min_stripes) {
@@ -2299,34 +2600,36 @@ again:
2299 num_stripes /= sub_stripes; 2600 num_stripes /= sub_stripes;
2300 num_stripes *= sub_stripes; 2601 num_stripes *= sub_stripes;
2301 } 2602 }
2302 looped = 1; 2603
2303 goto again; 2604 map = __shrink_map_lookup_stripes(map, num_stripes);
2304 } 2605 } else if (i >= min_devices) {
2305 if (!looped && max_avail > 0) { 2606 ret = __btrfs_alloc_tiny_space(trans, fs_devices,
2306 looped = 1; 2607 devices_info, i, type,
2307 calc_size = max_avail; 2608 &map, min_stripes,
2308 goto again; 2609 &calc_size);
2610 if (ret)
2611 goto error;
2612 } else {
2613 ret = -ENOSPC;
2614 goto error;
2309 } 2615 }
2310 kfree(map);
2311 return -ENOSPC;
2312 } 2616 }
2313 map->sector_size = extent_root->sectorsize; 2617 map->sector_size = extent_root->sectorsize;
2314 map->stripe_len = stripe_len; 2618 map->stripe_len = BTRFS_STRIPE_LEN;
2315 map->io_align = stripe_len; 2619 map->io_align = BTRFS_STRIPE_LEN;
2316 map->io_width = stripe_len; 2620 map->io_width = BTRFS_STRIPE_LEN;
2317 map->type = type; 2621 map->type = type;
2318 map->num_stripes = num_stripes;
2319 map->sub_stripes = sub_stripes; 2622 map->sub_stripes = sub_stripes;
2320 2623
2321 *map_ret = map; 2624 *map_ret = map;
2322 *stripe_size = calc_size; 2625 *stripe_size = calc_size;
2323 *num_bytes = chunk_bytes_by_type(type, calc_size, 2626 *num_bytes = chunk_bytes_by_type(type, calc_size,
2324 num_stripes, sub_stripes); 2627 map->num_stripes, sub_stripes);
2325 2628
2326 em = alloc_extent_map(GFP_NOFS); 2629 em = alloc_extent_map(GFP_NOFS);
2327 if (!em) { 2630 if (!em) {
2328 kfree(map); 2631 ret = -ENOMEM;
2329 return -ENOMEM; 2632 goto error;
2330 } 2633 }
2331 em->bdev = (struct block_device *)map; 2634 em->bdev = (struct block_device *)map;
2332 em->start = start; 2635 em->start = start;
@@ -2359,7 +2662,13 @@ again:
2359 index++; 2662 index++;
2360 } 2663 }
2361 2664
2665 kfree(devices_info);
2362 return 0; 2666 return 0;
2667
2668error:
2669 kfree(map);
2670 kfree(devices_info);
2671 return ret;
2363} 2672}
2364 2673
2365static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2674static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
@@ -2638,7 +2947,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2638 int max_errors = 0; 2947 int max_errors = 0;
2639 struct btrfs_multi_bio *multi = NULL; 2948 struct btrfs_multi_bio *multi = NULL;
2640 2949
2641 if (multi_ret && !(rw & (1 << BIO_RW))) 2950 if (multi_ret && !(rw & REQ_WRITE))
2642 stripes_allocated = 1; 2951 stripes_allocated = 1;
2643again: 2952again:
2644 if (multi_ret) { 2953 if (multi_ret) {
@@ -2674,7 +2983,7 @@ again:
2674 mirror_num = 0; 2983 mirror_num = 0;
2675 2984
2676 /* if our multi bio struct is too small, back off and try again */ 2985 /* if our multi bio struct is too small, back off and try again */
2677 if (rw & (1 << BIO_RW)) { 2986 if (rw & REQ_WRITE) {
2678 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2987 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2679 BTRFS_BLOCK_GROUP_DUP)) { 2988 BTRFS_BLOCK_GROUP_DUP)) {
2680 stripes_required = map->num_stripes; 2989 stripes_required = map->num_stripes;
@@ -2684,7 +2993,7 @@ again:
2684 max_errors = 1; 2993 max_errors = 1;
2685 } 2994 }
2686 } 2995 }
2687 if (multi_ret && (rw & (1 << BIO_RW)) && 2996 if (multi_ret && (rw & REQ_WRITE) &&
2688 stripes_allocated < stripes_required) { 2997 stripes_allocated < stripes_required) {
2689 stripes_allocated = map->num_stripes; 2998 stripes_allocated = map->num_stripes;
2690 free_extent_map(em); 2999 free_extent_map(em);
@@ -2720,7 +3029,7 @@ again:
2720 num_stripes = 1; 3029 num_stripes = 1;
2721 stripe_index = 0; 3030 stripe_index = 0;
2722 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 3031 if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2723 if (unplug_page || (rw & (1 << BIO_RW))) 3032 if (unplug_page || (rw & REQ_WRITE))
2724 num_stripes = map->num_stripes; 3033 num_stripes = map->num_stripes;
2725 else if (mirror_num) 3034 else if (mirror_num)
2726 stripe_index = mirror_num - 1; 3035 stripe_index = mirror_num - 1;
@@ -2731,7 +3040,7 @@ again:
2731 } 3040 }
2732 3041
2733 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3042 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2734 if (rw & (1 << BIO_RW)) 3043 if (rw & REQ_WRITE)
2735 num_stripes = map->num_stripes; 3044 num_stripes = map->num_stripes;
2736 else if (mirror_num) 3045 else if (mirror_num)
2737 stripe_index = mirror_num - 1; 3046 stripe_index = mirror_num - 1;
@@ -2742,7 +3051,7 @@ again:
2742 stripe_index = do_div(stripe_nr, factor); 3051 stripe_index = do_div(stripe_nr, factor);
2743 stripe_index *= map->sub_stripes; 3052 stripe_index *= map->sub_stripes;
2744 3053
2745 if (unplug_page || (rw & (1 << BIO_RW))) 3054 if (unplug_page || (rw & REQ_WRITE))
2746 num_stripes = map->sub_stripes; 3055 num_stripes = map->sub_stripes;
2747 else if (mirror_num) 3056 else if (mirror_num)
2748 stripe_index += mirror_num - 1; 3057 stripe_index += mirror_num - 1;
@@ -2932,7 +3241,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2932 struct btrfs_pending_bios *pending_bios; 3241 struct btrfs_pending_bios *pending_bios;
2933 3242
2934 /* don't bother with additional async steps for reads, right now */ 3243 /* don't bother with additional async steps for reads, right now */
2935 if (!(rw & (1 << BIO_RW))) { 3244 if (!(rw & REQ_WRITE)) {
2936 bio_get(bio); 3245 bio_get(bio);
2937 submit_bio(rw, bio); 3246 submit_bio(rw, bio);
2938 bio_put(bio); 3247 bio_put(bio);
@@ -2951,7 +3260,7 @@ static noinline int schedule_bio(struct btrfs_root *root,
2951 bio->bi_rw |= rw; 3260 bio->bi_rw |= rw;
2952 3261
2953 spin_lock(&device->io_lock); 3262 spin_lock(&device->io_lock);
2954 if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) 3263 if (bio->bi_rw & REQ_SYNC)
2955 pending_bios = &device->pending_sync_bios; 3264 pending_bios = &device->pending_sync_bios;
2956 else 3265 else
2957 pending_bios = &device->pending_bios; 3266 pending_bios = &device->pending_bios;
@@ -3021,8 +3330,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3021 } 3330 }
3022 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3331 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
3023 dev = multi->stripes[dev_nr].dev; 3332 dev = multi->stripes[dev_nr].dev;
3024 BUG_ON(rw == WRITE && !dev->writeable); 3333 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3025 if (dev && dev->bdev) {
3026 bio->bi_bdev = dev->bdev; 3334 bio->bi_bdev = dev->bdev;
3027 if (async_submit) 3335 if (async_submit)
3028 schedule_bio(root, dev, rw, bio); 3336 schedule_bio(root, dev, rw, bio);
@@ -3071,12 +3379,13 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3071 return NULL; 3379 return NULL;
3072 list_add(&device->dev_list, 3380 list_add(&device->dev_list,
3073 &fs_devices->devices); 3381 &fs_devices->devices);
3074 device->barriers = 1;
3075 device->dev_root = root->fs_info->dev_root; 3382 device->dev_root = root->fs_info->dev_root;
3076 device->devid = devid; 3383 device->devid = devid;
3077 device->work.func = pending_bios_fn; 3384 device->work.func = pending_bios_fn;
3078 device->fs_devices = fs_devices; 3385 device->fs_devices = fs_devices;
3386 device->missing = 1;
3079 fs_devices->num_devices++; 3387 fs_devices->num_devices++;
3388 fs_devices->missing_devices++;
3080 spin_lock_init(&device->io_lock); 3389 spin_lock_init(&device->io_lock);
3081 INIT_LIST_HEAD(&device->dev_alloc_list); 3390 INIT_LIST_HEAD(&device->dev_alloc_list);
3082 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3391 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3274,6 +3583,15 @@ static int read_one_dev(struct btrfs_root *root,
3274 device = add_missing_dev(root, devid, dev_uuid); 3583 device = add_missing_dev(root, devid, dev_uuid);
3275 if (!device) 3584 if (!device)
3276 return -ENOMEM; 3585 return -ENOMEM;
3586 } else if (!device->missing) {
3587 /*
3588 * this happens when a device that was properly setup
3589 * in the device info lists suddenly goes bad.
3590 * device->bdev is NULL, and so we have to set
3591 * device->missing to one here
3592 */
3593 root->fs_info->fs_devices->missing_devices++;
3594 device->missing = 1;
3277 } 3595 }
3278 } 3596 }
3279 3597
@@ -3382,6 +3700,8 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
3382 key.type = 0; 3700 key.type = 0;
3383again: 3701again:
3384 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3702 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3703 if (ret < 0)
3704 goto error;
3385 while (1) { 3705 while (1) {
3386 leaf = path->nodes[0]; 3706 leaf = path->nodes[0];
3387 slot = path->slots[0]; 3707 slot = path->slots[0];
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2ea..7fb59d45fe8c 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
20#define __BTRFS_VOLUMES_ 20#define __BTRFS_VOLUMES_
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h>
23#include "async-thread.h" 24#include "async-thread.h"
24 25
26#define BTRFS_STRIPE_LEN (64 * 1024)
27
25struct buffer_head; 28struct buffer_head;
26struct btrfs_pending_bios { 29struct btrfs_pending_bios {
27 struct bio *head; 30 struct bio *head;
@@ -42,15 +45,15 @@ struct btrfs_device {
42 int running_pending; 45 int running_pending;
43 u64 generation; 46 u64 generation;
44 47
45 int barriers;
46 int writeable; 48 int writeable;
47 int in_fs_metadata; 49 int in_fs_metadata;
50 int missing;
48 51
49 spinlock_t io_lock; 52 spinlock_t io_lock;
50 53
51 struct block_device *bdev; 54 struct block_device *bdev;
52 55
53 /* the mode sent to open_bdev_exclusive */ 56 /* the mode sent to blkdev_get */
54 fmode_t mode; 57 fmode_t mode;
55 58
56 char *name; 59 char *name;
@@ -94,6 +97,7 @@ struct btrfs_fs_devices {
94 u64 num_devices; 97 u64 num_devices;
95 u64 open_devices; 98 u64 open_devices;
96 u64 rw_devices; 99 u64 rw_devices;
100 u64 missing_devices;
97 u64 total_rw_bytes; 101 u64 total_rw_bytes;
98 struct block_device *latest_bdev; 102 struct block_device *latest_bdev;
99 103
@@ -135,6 +139,30 @@ struct btrfs_multi_bio {
135 struct btrfs_bio_stripe stripes[]; 139 struct btrfs_bio_stripe stripes[];
136}; 140};
137 141
142struct btrfs_device_info {
143 struct btrfs_device *dev;
144 u64 dev_offset;
145 u64 max_avail;
146};
147
148/* Used to sort the devices by max_avail(descending sort) */
149int btrfs_cmp_device_free_bytes(const void *dev_info1, const void *dev_info2);
150
151/*
152 * sort the devices by max_avail, in which max free extent size of each device
153 * is stored.(Descending Sort)
154 */
155static inline void btrfs_descending_sort_devices(
156 struct btrfs_device_info *devices,
157 size_t nr_devices)
158{
159 sort(devices, nr_devices, sizeof(struct btrfs_device_info),
160 btrfs_cmp_device_free_bytes, NULL);
161}
162
163int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
164 u64 end, u64 *length);
165
138#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 166#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
139 (sizeof(struct btrfs_bio_stripe) * (n))) 167 (sizeof(struct btrfs_bio_stripe) * (n)))
140 168
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 193b58f7d3f3..a5776531dc2b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
154 if (trans) 154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags); 155 return do_setxattr(trans, inode, name, value, size, flags);
156 156
157 ret = btrfs_reserve_metadata_space(root, 2); 157 trans = btrfs_start_transaction(root, 2);
158 if (ret) 158 if (IS_ERR(trans))
159 return ret; 159 return PTR_ERR(trans);
160 160
161 trans = btrfs_start_transaction(root, 1);
162 if (!trans) {
163 ret = -ENOMEM;
164 goto out;
165 }
166 btrfs_set_trans_block_group(trans, inode); 161 btrfs_set_trans_block_group(trans, inode);
167 162
168 ret = do_setxattr(trans, inode, name, value, size, flags); 163 ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
174 BUG_ON(ret); 169 BUG_ON(ret);
175out: 170out:
176 btrfs_end_transaction_throttle(trans, root); 171 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
178 return ret; 172 return ret;
179} 173}
180 174
@@ -184,7 +178,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
184 struct inode *inode = dentry->d_inode; 178 struct inode *inode = dentry->d_inode;
185 struct btrfs_root *root = BTRFS_I(inode)->root; 179 struct btrfs_root *root = BTRFS_I(inode)->root;
186 struct btrfs_path *path; 180 struct btrfs_path *path;
187 struct btrfs_item *item;
188 struct extent_buffer *leaf; 181 struct extent_buffer *leaf;
189 struct btrfs_dir_item *di; 182 struct btrfs_dir_item *di;
190 int ret = 0, slot, advance; 183 int ret = 0, slot, advance;
@@ -240,7 +233,6 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
240 } 233 }
241 advance = 1; 234 advance = 1;
242 235
243 item = btrfs_item_nr(leaf, slot);
244 btrfs_item_key_to_cpu(leaf, &found_key, slot); 236 btrfs_item_key_to_cpu(leaf, &found_key, slot);
245 237
246 /* check to make sure this item is what we want */ 238 /* check to make sure this item is what we want */
@@ -282,7 +274,7 @@ err:
282 * List of handlers for synthetic system.* attributes. All real ondisk 274 * List of handlers for synthetic system.* attributes. All real ondisk
283 * attributes are handled directly. 275 * attributes are handled directly.
284 */ 276 */
285struct xattr_handler *btrfs_xattr_handlers[] = { 277const struct xattr_handler *btrfs_xattr_handlers[] = {
286#ifdef CONFIG_BTRFS_FS_POSIX_ACL 278#ifdef CONFIG_BTRFS_FS_POSIX_ACL
287 &btrfs_xattr_acl_access_handler, 279 &btrfs_xattr_acl_access_handler,
288 &btrfs_xattr_acl_default_handler, 280 &btrfs_xattr_acl_default_handler,
@@ -324,6 +316,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
324int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, 316int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
325 size_t size, int flags) 317 size_t size, int flags)
326{ 318{
319 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
320
321 /*
322 * The permission on security.* and system.* is not checked
323 * in permission().
324 */
325 if (btrfs_root_readonly(root))
326 return -EROFS;
327
327 /* 328 /*
328 * If this is a request for a synthetic attribute in the system.* 329 * If this is a request for a synthetic attribute in the system.*
329 * namespace use the generic infrastructure to resolve a handler 330 * namespace use the generic infrastructure to resolve a handler
@@ -344,6 +345,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
344 345
345int btrfs_removexattr(struct dentry *dentry, const char *name) 346int btrfs_removexattr(struct dentry *dentry, const char *name)
346{ 347{
348 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
349
350 /*
351 * The permission on security.* and system.* is not checked
352 * in permission().
353 */
354 if (btrfs_root_readonly(root))
355 return -EROFS;
356
347 /* 357 /*
348 * If this is a request for a synthetic attribute in the system.* 358 * If this is a request for a synthetic attribute in the system.*
349 * namespace use the generic infrastructure to resolve a handler 359 * namespace use the generic infrastructure to resolve a handler
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 721efa0346e0..7a43fd640bbb 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -21,9 +21,9 @@
21 21
22#include <linux/xattr.h> 22#include <linux/xattr.h>
23 23
24extern struct xattr_handler btrfs_xattr_acl_access_handler; 24extern const struct xattr_handler btrfs_xattr_acl_access_handler;
25extern struct xattr_handler btrfs_xattr_acl_default_handler; 25extern const struct xattr_handler btrfs_xattr_acl_default_handler;
26extern struct xattr_handler *btrfs_xattr_handlers[]; 26extern const struct xattr_handler *btrfs_xattr_handlers[];
27 27
28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, 28extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
29 void *buffer, size_t size); 29 void *buffer, size_t size);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 3e2b90eaa239..f5ec2d44150d 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
32#include <linux/bio.h> 32#include <linux/bio.h>
33#include "compression.h" 33#include "compression.h"
34 34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace { 35struct workspace {
45 z_stream inf_strm; 36 z_stream inf_strm;
46 z_stream def_strm; 37 z_stream def_strm;
@@ -48,169 +39,62 @@ struct workspace {
48 struct list_head list; 39 struct list_head list;
49}; 40};
50 41
51static LIST_HEAD(idle_workspace); 42static void zlib_free_workspace(struct list_head *ws)
52static DEFINE_SPINLOCK(workspace_lock); 43{
53static unsigned long num_workspace; 44 struct workspace *workspace = list_entry(ws, struct workspace, list);
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56 45
57/* 46 vfree(workspace->def_strm.workspace);
58 * this finds an available zlib workspace or allocates a new one 47 vfree(workspace->inf_strm.workspace);
59 * NULL or an ERR_PTR is returned if things go bad. 48 kfree(workspace->buf);
60 */ 49 kfree(workspace);
61static struct workspace *find_zlib_workspace(void) 50}
51
52static struct list_head *zlib_alloc_workspace(void)
62{ 53{
63 struct workspace *workspace; 54 struct workspace *workspace;
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76 55
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS); 56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) { 57 if (!workspace)
90 ret = -ENOMEM; 58 return ERR_PTR(-ENOMEM);
91 goto fail;
92 }
93 59
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); 61 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); 62 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) { 63 if (!workspace->def_strm.workspace ||
106 ret = -ENOMEM; 64 !workspace->inf_strm.workspace || !workspace->buf)
107 goto fail_kmalloc; 65 goto fail;
108 }
109 return workspace;
110
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf);
141 kfree(workspace);
142 66
143 atomic_dec(&alloc_workspace); 67 INIT_LIST_HEAD(&workspace->list);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147}
148 68
149/* 69 return &workspace->list;
150 * cleanup function for module exit 70fail:
151 */ 71 zlib_free_workspace(&workspace->list);
152static void free_workspaces(void) 72 return ERR_PTR(-ENOMEM);
153{
154 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) {
156 workspace = list_entry(idle_workspace.next, struct workspace,
157 list);
158 list_del(&workspace->list);
159 vfree(workspace->def_strm.workspace);
160 vfree(workspace->inf_strm.workspace);
161 kfree(workspace->buf);
162 kfree(workspace);
163 atomic_dec(&alloc_workspace);
164 }
165} 73}
166 74
167/* 75static int zlib_compress_pages(struct list_head *ws,
168 * given an address space and start/len, compress the bytes. 76 struct address_space *mapping,
169 * 77 u64 start, unsigned long len,
170 * pages are allocated to hold the compressed result and stored 78 struct page **pages,
171 * in 'pages' 79 unsigned long nr_dest_pages,
172 * 80 unsigned long *out_pages,
173 * out_pages is used to return the number of pages allocated. There 81 unsigned long *total_in,
174 * may be pages allocated even if we return an error 82 unsigned long *total_out,
175 * 83 unsigned long max_out)
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller then len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{ 84{
85 struct workspace *workspace = list_entry(ws, struct workspace, list);
195 int ret; 86 int ret;
196 struct workspace *workspace;
197 char *data_in; 87 char *data_in;
198 char *cpage_out; 88 char *cpage_out;
199 int nr_pages = 0; 89 int nr_pages = 0;
200 struct page *in_page = NULL; 90 struct page *in_page = NULL;
201 struct page *out_page = NULL; 91 struct page *out_page = NULL;
202 int out_written = 0;
203 int in_read = 0;
204 unsigned long bytes_left; 92 unsigned long bytes_left;
205 93
206 *out_pages = 0; 94 *out_pages = 0;
207 *total_out = 0; 95 *total_out = 0;
208 *total_in = 0; 96 *total_in = 0;
209 97
210 workspace = find_zlib_workspace();
211 if (IS_ERR(workspace))
212 return -1;
213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 98 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
215 printk(KERN_WARNING "deflateInit failed\n"); 99 printk(KERN_WARNING "deflateInit failed\n");
216 ret = -1; 100 ret = -1;
@@ -224,6 +108,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
224 data_in = kmap(in_page); 108 data_in = kmap(in_page);
225 109
226 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 110 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
111 if (out_page == NULL) {
112 ret = -1;
113 goto out;
114 }
227 cpage_out = kmap(out_page); 115 cpage_out = kmap(out_page);
228 pages[0] = out_page; 116 pages[0] = out_page;
229 nr_pages = 1; 117 nr_pages = 1;
@@ -233,9 +121,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
233 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 121 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
234 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); 122 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
235 123
236 out_written = 0;
237 in_read = 0;
238
239 while (workspace->def_strm.total_in < len) { 124 while (workspace->def_strm.total_in < len) {
240 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 125 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
241 if (ret != Z_OK) { 126 if (ret != Z_OK) {
@@ -265,6 +150,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
265 goto out; 150 goto out;
266 } 151 }
267 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 152 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
153 if (out_page == NULL) {
154 ret = -1;
155 goto out;
156 }
268 cpage_out = kmap(out_page); 157 cpage_out = kmap(out_page);
269 pages[nr_pages] = out_page; 158 pages[nr_pages] = out_page;
270 nr_pages++; 159 nr_pages++;
@@ -319,55 +208,26 @@ out:
319 kunmap(in_page); 208 kunmap(in_page);
320 page_cache_release(in_page); 209 page_cache_release(in_page);
321 } 210 }
322 free_workspace(workspace);
323 return ret; 211 return ret;
324} 212}
325 213
326/* 214static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
327 * pages_in is an array of pages with compressed data. 215 u64 disk_start,
328 * 216 struct bio_vec *bvec,
329 * disk_start is the starting logical offset of this array in the file 217 int vcnt,
330 * 218 size_t srclen)
331 * bvec is a bio_vec of pages from the file that we want to decompress into
332 *
333 * vcnt is the count of pages in the biovec
334 *
335 * srclen is the number of bytes in pages_in
336 *
337 * The basic idea is that we have a bio that was created by readpages.
338 * The pages in the bio are for the uncompressed data, and they may not
339 * be contiguous. They all correspond to the range of bytes covered by
340 * the compressed extent.
341 */
342int btrfs_zlib_decompress_biovec(struct page **pages_in,
343 u64 disk_start,
344 struct bio_vec *bvec,
345 int vcnt,
346 size_t srclen)
347{ 219{
348 int ret = 0; 220 struct workspace *workspace = list_entry(ws, struct workspace, list);
221 int ret = 0, ret2;
349 int wbits = MAX_WBITS; 222 int wbits = MAX_WBITS;
350 struct workspace *workspace;
351 char *data_in; 223 char *data_in;
352 size_t total_out = 0; 224 size_t total_out = 0;
353 unsigned long page_bytes_left;
354 unsigned long page_in_index = 0; 225 unsigned long page_in_index = 0;
355 unsigned long page_out_index = 0; 226 unsigned long page_out_index = 0;
356 struct page *page_out;
357 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 227 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
358 PAGE_CACHE_SIZE; 228 PAGE_CACHE_SIZE;
359 unsigned long buf_start; 229 unsigned long buf_start;
360 unsigned long buf_offset;
361 unsigned long bytes;
362 unsigned long working_bytes;
363 unsigned long pg_offset; 230 unsigned long pg_offset;
364 unsigned long start_byte;
365 unsigned long current_buf_start;
366 char *kaddr;
367
368 workspace = find_zlib_workspace();
369 if (IS_ERR(workspace))
370 return -ENOMEM;
371 231
372 data_in = kmap(pages_in[page_in_index]); 232 data_in = kmap(pages_in[page_in_index]);
373 workspace->inf_strm.next_in = data_in; 233 workspace->inf_strm.next_in = data_in;
@@ -377,8 +237,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
377 workspace->inf_strm.total_out = 0; 237 workspace->inf_strm.total_out = 0;
378 workspace->inf_strm.next_out = workspace->buf; 238 workspace->inf_strm.next_out = workspace->buf;
379 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 239 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
380 page_out = bvec[page_out_index].bv_page;
381 page_bytes_left = PAGE_CACHE_SIZE;
382 pg_offset = 0; 240 pg_offset = 0;
383 241
384 /* If it's deflate, and it's got no preset dictionary, then 242 /* If it's deflate, and it's got no preset dictionary, then
@@ -394,107 +252,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
394 252
395 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 253 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
396 printk(KERN_WARNING "inflateInit failed\n"); 254 printk(KERN_WARNING "inflateInit failed\n");
397 ret = -1; 255 return -1;
398 goto out;
399 } 256 }
400 while (workspace->inf_strm.total_in < srclen) { 257 while (workspace->inf_strm.total_in < srclen) {
401 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 258 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
402 if (ret != Z_OK && ret != Z_STREAM_END) 259 if (ret != Z_OK && ret != Z_STREAM_END)
403 break; 260 break;
404 /*
405 * buf start is the byte offset we're of the start of
406 * our workspace buffer
407 */
408 buf_start = total_out;
409 261
410 /* total_out is the last byte of the workspace buffer */ 262 buf_start = total_out;
411 total_out = workspace->inf_strm.total_out; 263 total_out = workspace->inf_strm.total_out;
412 264
413 working_bytes = total_out - buf_start; 265 /* we didn't make progress in this inflate call, we're done */
414 266 if (buf_start == total_out)
415 /*
416 * start byte is the first byte of the page we're currently
417 * copying into relative to the start of the compressed data.
418 */
419 start_byte = page_offset(page_out) - disk_start;
420
421 if (working_bytes == 0) {
422 /* we didn't make progress in this inflate
423 * call, we're done
424 */
425 if (ret != Z_STREAM_END)
426 ret = -1;
427 break; 267 break;
428 }
429 268
430 /* we haven't yet hit data corresponding to this page */ 269 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
431 if (total_out <= start_byte) 270 total_out, disk_start,
432 goto next; 271 bvec, vcnt,
433 272 &page_out_index, &pg_offset);
434 /* 273 if (ret2 == 0) {
435 * the start of the data we care about is offset into 274 ret = 0;
436 * the middle of our working buffer 275 goto done;
437 */
438 if (total_out > start_byte && buf_start < start_byte) {
439 buf_offset = start_byte - buf_start;
440 working_bytes -= buf_offset;
441 } else {
442 buf_offset = 0;
443 }
444 current_buf_start = buf_start;
445
446 /* copy bytes from the working buffer into the pages */
447 while (working_bytes > 0) {
448 bytes = min(PAGE_CACHE_SIZE - pg_offset,
449 PAGE_CACHE_SIZE - buf_offset);
450 bytes = min(bytes, working_bytes);
451 kaddr = kmap_atomic(page_out, KM_USER0);
452 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
453 bytes);
454 kunmap_atomic(kaddr, KM_USER0);
455 flush_dcache_page(page_out);
456
457 pg_offset += bytes;
458 page_bytes_left -= bytes;
459 buf_offset += bytes;
460 working_bytes -= bytes;
461 current_buf_start += bytes;
462
463 /* check if we need to pick another page */
464 if (page_bytes_left == 0) {
465 page_out_index++;
466 if (page_out_index >= vcnt) {
467 ret = 0;
468 goto done;
469 }
470
471 page_out = bvec[page_out_index].bv_page;
472 pg_offset = 0;
473 page_bytes_left = PAGE_CACHE_SIZE;
474 start_byte = page_offset(page_out) - disk_start;
475
476 /*
477 * make sure our new page is covered by this
478 * working buffer
479 */
480 if (total_out <= start_byte)
481 goto next;
482
483 /* the next page in the biovec might not
484 * be adjacent to the last page, but it
485 * might still be found inside this working
486 * buffer. bump our offset pointer
487 */
488 if (total_out > start_byte &&
489 current_buf_start < start_byte) {
490 buf_offset = start_byte - buf_start;
491 working_bytes = total_out - start_byte;
492 current_buf_start = buf_start +
493 buf_offset;
494 }
495 }
496 } 276 }
497next: 277
498 workspace->inf_strm.next_out = workspace->buf; 278 workspace->inf_strm.next_out = workspace->buf;
499 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 279 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
500 280
@@ -521,35 +301,21 @@ done:
521 zlib_inflateEnd(&workspace->inf_strm); 301 zlib_inflateEnd(&workspace->inf_strm);
522 if (data_in) 302 if (data_in)
523 kunmap(pages_in[page_in_index]); 303 kunmap(pages_in[page_in_index]);
524out:
525 free_workspace(workspace);
526 return ret; 304 return ret;
527} 305}
528 306
529/* 307static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
530 * a less complex decompression routine. Our compressed data fits in a 308 struct page *dest_page,
531 * single page, and we want to read a single page out of it. 309 unsigned long start_byte,
532 * start_byte tells us the offset into the compressed data we're interested in 310 size_t srclen, size_t destlen)
533 */
534int btrfs_zlib_decompress(unsigned char *data_in,
535 struct page *dest_page,
536 unsigned long start_byte,
537 size_t srclen, size_t destlen)
538{ 311{
312 struct workspace *workspace = list_entry(ws, struct workspace, list);
539 int ret = 0; 313 int ret = 0;
540 int wbits = MAX_WBITS; 314 int wbits = MAX_WBITS;
541 struct workspace *workspace;
542 unsigned long bytes_left = destlen; 315 unsigned long bytes_left = destlen;
543 unsigned long total_out = 0; 316 unsigned long total_out = 0;
544 char *kaddr; 317 char *kaddr;
545 318
546 if (destlen > PAGE_CACHE_SIZE)
547 return -ENOMEM;
548
549 workspace = find_zlib_workspace();
550 if (IS_ERR(workspace))
551 return -ENOMEM;
552
553 workspace->inf_strm.next_in = data_in; 319 workspace->inf_strm.next_in = data_in;
554 workspace->inf_strm.avail_in = srclen; 320 workspace->inf_strm.avail_in = srclen;
555 workspace->inf_strm.total_in = 0; 321 workspace->inf_strm.total_in = 0;
@@ -570,8 +336,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
570 336
571 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 337 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
572 printk(KERN_WARNING "inflateInit failed\n"); 338 printk(KERN_WARNING "inflateInit failed\n");
573 ret = -1; 339 return -1;
574 goto out;
575 } 340 }
576 341
577 while (bytes_left > 0) { 342 while (bytes_left > 0) {
@@ -621,12 +386,13 @@ next:
621 ret = 0; 386 ret = 0;
622 387
623 zlib_inflateEnd(&workspace->inf_strm); 388 zlib_inflateEnd(&workspace->inf_strm);
624out:
625 free_workspace(workspace);
626 return ret; 389 return ret;
627} 390}
628 391
629void btrfs_zlib_exit(void) 392struct btrfs_compress_op btrfs_zlib_compress = {
630{ 393 .alloc_workspace = zlib_alloc_workspace,
631 free_workspaces(); 394 .free_workspace = zlib_free_workspace,
632} 395 .compress_pages = zlib_compress_pages,
396 .decompress_biovec = zlib_decompress_biovec,
397 .decompress = zlib_decompress,
398};