Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Kconfig            |    2
-rw-r--r--  fs/btrfs/Makefile           |    4
-rw-r--r--  fs/btrfs/acl.c              |   47
-rw-r--r--  fs/btrfs/btrfs_inode.h      |   23
-rw-r--r--  fs/btrfs/compression.c      |  419
-rw-r--r--  fs/btrfs/compression.h      |   72
-rw-r--r--  fs/btrfs/ctree.c            |  305
-rw-r--r--  fs/btrfs/ctree.h            |  474
-rw-r--r--  fs/btrfs/delayed-inode.c    | 1773
-rw-r--r--  fs/btrfs/delayed-inode.h    |  145
-rw-r--r--  fs/btrfs/delayed-ref.c      |  120
-rw-r--r--  fs/btrfs/delayed-ref.h      |    6
-rw-r--r--  fs/btrfs/dir-item.c         |   76
-rw-r--r--  fs/btrfs/disk-io.c          |  958
-rw-r--r--  fs/btrfs/disk-io.h          |   20
-rw-r--r--  fs/btrfs/export.c           |  107
-rw-r--r--  fs/btrfs/extent-tree.c      | 3159
-rw-r--r--  fs/btrfs/extent_io.c        |  806
-rw-r--r--  fs/btrfs/extent_io.h        |   67
-rw-r--r--  fs/btrfs/extent_map.c       |   20
-rw-r--r--  fs/btrfs/extent_map.h       |    7
-rw-r--r--  fs/btrfs/file-item.c        |   46
-rw-r--r--  fs/btrfs/file.c             |  937
-rw-r--r--  fs/btrfs/free-space-cache.c | 2158
-rw-r--r--  fs/btrfs/free-space-cache.h |   68
-rw-r--r--  fs/btrfs/inode-item.c       |    2
-rw-r--r--  fs/btrfs/inode-map.c        |  477
-rw-r--r--  fs/btrfs/inode-map.h        |   13
-rw-r--r--  fs/btrfs/inode.c            | 2207
-rw-r--r--  fs/btrfs/ioctl.c            | 1340
-rw-r--r--  fs/btrfs/ioctl.h            |  120
-rw-r--r--  fs/btrfs/locking.c          |   25
-rw-r--r--  fs/btrfs/locking.h          |    2
-rw-r--r--  fs/btrfs/lzo.c              |  427
-rw-r--r--  fs/btrfs/ordered-data.c     |   97
-rw-r--r--  fs/btrfs/ordered-data.h     |   11
-rw-r--r--  fs/btrfs/orphan.c           |    6
-rw-r--r--  fs/btrfs/print-tree.c       |    1
-rw-r--r--  fs/btrfs/ref-cache.c        |  164
-rw-r--r--  fs/btrfs/ref-cache.h        |   24
-rw-r--r--  fs/btrfs/relocation.c       |  275
-rw-r--r--  fs/btrfs/root-tree.c        |   87
-rw-r--r--  fs/btrfs/scrub.c            | 1395
-rw-r--r--  fs/btrfs/super.c            |  504
-rw-r--r--  fs/btrfs/sysfs.c            |  223
-rw-r--r--  fs/btrfs/transaction.c      |  748
-rw-r--r--  fs/btrfs/transaction.h      |   41
-rw-r--r--  fs/btrfs/tree-defrag.c      |    4
-rw-r--r--  fs/btrfs/tree-log.c         |  341
-rw-r--r--  fs/btrfs/tree-log.h         |    1
-rw-r--r--  fs/btrfs/version.sh         |   43
-rw-r--r--  fs/btrfs/volumes.c          | 1094
-rw-r--r--  fs/btrfs/volumes.h          |   43
-rw-r--r--  fs/btrfs/xattr.c            |   75
-rw-r--r--  fs/btrfs/xattr.h            |    3
-rw-r--r--  fs/btrfs/zlib.c             |  377
56 files changed, 15026 insertions, 6963 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e570..ecb9fd3be143 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
         select LIBCRC32C
         select ZLIB_INFLATE
         select ZLIB_DEFLATE
+        select LZO_COMPRESS
+        select LZO_DECOMPRESS
         help
           Btrfs is a new filesystem with extents, writable snapshotting,
           support for multiple devices and many more features.
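
Selecting LZO_COMPRESS and LZO_DECOMPRESS pulls the kernel's shared LZO
library into any build that enables btrfs; the new fs/btrfs/lzo.c backend
(see the diffstat above) then becomes selectable per mount, e.g. with
"mount -o compress=lzo <dev> <mountpoint>".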
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32fd..9b72dcf1cd25 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
            transaction.o inode.o file.o tree-defrag.o \
            extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
            extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-           export.o tree-log.o acl.o free-space-cache.o zlib.o \
-           compression.o delayed-ref.o relocation.o
+           export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
+           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2222d161c7b6..f66fc9959733 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
        char *value = NULL;
        struct posix_acl *acl;

+       if (!IS_POSIXACL(inode))
+               return NULL;
+
        acl = get_cached_acl(inode, type);
        if (acl != ACL_NOT_CACHED)
                return acl;
@@ -60,8 +63,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
        size = __btrfs_getxattr(inode, name, value, size);
        if (size > 0) {
                acl = posix_acl_from_xattr(value, size);
-               if (IS_ERR(acl))
+               if (IS_ERR(acl)) {
+                       kfree(value);
                        return acl;
+               }
                set_cached_acl(inode, type, acl);
        }
        kfree(value);
@@ -82,6 +87,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
        struct posix_acl *acl;
        int ret = 0;

+       if (!IS_POSIXACL(dentry->d_inode))
+               return -EOPNOTSUPP;
+
        acl = btrfs_get_acl(dentry->d_inode, type);

        if (IS_ERR(acl))
@@ -162,7 +170,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
        int ret;
        struct posix_acl *acl = NULL;

-       if (!is_owner_or_cap(dentry->d_inode))
+       if (!inode_owner_or_capable(dentry->d_inode))
                return -EPERM;

        if (!IS_POSIXACL(dentry->d_inode))
@@ -170,33 +178,40 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,

        if (value) {
                acl = posix_acl_from_xattr(value, size);
-               if (acl == NULL) {
-                       value = NULL;
-                       size = 0;
-               } else if (IS_ERR(acl)) {
+               if (IS_ERR(acl))
                        return PTR_ERR(acl);
+
+               if (acl) {
+                       ret = posix_acl_valid(acl);
+                       if (ret)
+                               goto out;
                }
        }

        ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
-
+out:
        posix_acl_release(acl);

        return ret;
 }

-int btrfs_check_acl(struct inode *inode, int mask)
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-       struct posix_acl *acl;
        int error = -EAGAIN;

-       acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+       if (flags & IPERM_FLAG_RCU) {
+               if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                       error = -ECHILD;

-       if (IS_ERR(acl))
-               return PTR_ERR(acl);
-       if (acl) {
-               error = posix_acl_permission(inode, acl, mask);
-               posix_acl_release(acl);
+       } else {
+               struct posix_acl *acl;
+               acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+               if (IS_ERR(acl))
+                       return PTR_ERR(acl);
+               if (acl) {
+                       error = posix_acl_permission(inode, acl, mask);
+                       posix_acl_release(acl);
+               }
        }

        return error;
@@ -273,7 +288,7 @@ int btrfs_acl_chmod(struct inode *inode)
                return 0;

        acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
-       if (IS_ERR(acl) || !acl)
+       if (IS_ERR_OR_NULL(acl))
                return PTR_ERR(acl);

        clone = posix_acl_clone(acl, GFP_KERNEL);
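
Two of the acl.c hunks above are straight bug fixes rather than interface
churn: btrfs_get_acl() leaked the xattr scratch buffer whenever
posix_acl_from_xattr() returned an ERR_PTR, and btrfs_xattr_acl_set()
applied user-supplied ACLs without running posix_acl_valid() over them
first. A minimal sketch of the corrected lookup error path, with the name
selection and size probing elided (the buffer size is assumed known here
purely for illustration):

        value = kmalloc(size, GFP_NOFS);        /* size assumed known */
        if (!value)
                return ERR_PTR(-ENOMEM);
        size = __btrfs_getxattr(inode, name, value, size);
        if (size > 0) {
                acl = posix_acl_from_xattr(value, size);
                if (IS_ERR(acl)) {
                        kfree(value);   /* the leak the hunk closes */
                        return acl;
                }
                set_cached_acl(inode, type, acl);
        }
        kfree(value);
        return acl;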
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ad63f17eca0..52d7eca8c7bf 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -22,6 +22,7 @@
 #include "extent_map.h"
 #include "extent_io.h"
 #include "ordered-data.h"
+#include "delayed-inode.h"

 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -120,9 +121,6 @@ struct btrfs_inode {
         */
        u64 index_cnt;

-       /* the start of block group preferred for allocations. */
-       u64 block_group;
-
        /* the fsync log has some corner cases that mean we have to check
         * directories to see if any unlinks have been done before
         * the directory was logged. See tree-log.c for all the
@@ -136,9 +134,8 @@ struct btrfs_inode {
         * items we think we'll end up using, and reserved_extents is the number
         * of extent items we've reserved metadata for.
         */
-       spinlock_t accounting_lock;
        atomic_t outstanding_extents;
-       int reserved_extents;
+       atomic_t reserved_extents;

        /*
         * ordered_data_close is set by truncate when a file that used
@@ -153,20 +150,34 @@ struct btrfs_inode {
        unsigned ordered_data_close:1;
        unsigned orphan_meta_reserved:1;
        unsigned dummy_inode:1;
+       unsigned in_defrag:1;

        /*
         * always compress this one file
         */
-       unsigned force_compress:1;
+       unsigned force_compress:4;
+
+       struct btrfs_delayed_node *delayed_node;

        struct inode vfs_inode;
 };

+extern unsigned char btrfs_filetype_table[];
+
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 {
        return container_of(inode, struct btrfs_inode, vfs_inode);
 }

+static inline u64 btrfs_ino(struct inode *inode)
+{
+       u64 ino = BTRFS_I(inode)->location.objectid;
+
+       if (ino <= BTRFS_FIRST_FREE_OBJECTID)
+               ino = inode->i_ino;
+       return ino;
+}
+
 static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 {
        i_size_write(inode, size);
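
The btrfs_ino() helper added above exists because, with the delayed-inode
and cached free-inode-number work in this series, the number the VFS hands
out in i_ino is no longer guaranteed to match the objectid under which the
inode's items are keyed on disk; location.objectid is authoritative for
anything past BTRFS_FIRST_FREE_OBJECTID. Callers that build keys or print
diagnostics are converted from inode->i_ino to the helper, roughly like
this (usage sketch only):

        struct btrfs_key key;

        key.objectid = btrfs_ino(inode);        /* not inode->i_ino */
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;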
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 396039b3a8a2..bfe42b03eaf9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -62,6 +62,9 @@ struct compressed_bio {
        /* number of bytes on disk */
        unsigned long compressed_len;

+       /* the compression algorithm for this bio */
+       int compress_type;
+
        /* number of compressed pages in the array */
        unsigned long nr_pages;

@@ -91,23 +94,10 @@ static inline int compressed_bio_size(struct btrfs_root *root,
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
                                        u64 first_byte, gfp_t gfp_flags)
 {
-       struct bio *bio;
        int nr_vecs;

        nr_vecs = bio_get_nr_vecs(bdev);
-       bio = bio_alloc(gfp_flags, nr_vecs);
-
-       if (bio == NULL && (current->flags & PF_MEMALLOC)) {
-               while (!bio && (nr_vecs /= 2))
-                       bio = bio_alloc(gfp_flags, nr_vecs);
-       }
-
-       if (bio) {
-               bio->bi_size = 0;
-               bio->bi_bdev = bdev;
-               bio->bi_sector = first_byte >> 9;
-       }
-       return bio;
+       return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
 }

 static int check_compressed_csum(struct inode *inode,
@@ -135,9 +125,10 @@ static int check_compressed_csum(struct inode *inode,
        kunmap_atomic(kaddr, KM_USER0);

        if (csum != *cb_sum) {
-               printk(KERN_INFO "btrfs csum failed ino %lu "
+               printk(KERN_INFO "btrfs csum failed ino %llu "
                       "extent %llu csum %u "
-                      "wanted %u mirror %d\n", inode->i_ino,
+                      "wanted %u mirror %d\n",
+                      (unsigned long long)btrfs_ino(inode),
                       (unsigned long long)disk_start,
                       csum, *cb_sum, cb->mirror_num);
                ret = -EIO;
@@ -163,7 +154,6 @@ fail:
  */
 static void end_compressed_bio_read(struct bio *bio, int err)
 {
-       struct extent_io_tree *tree;
        struct compressed_bio *cb = bio->bi_private;
        struct inode *inode;
        struct page *page;
@@ -187,12 +177,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
        /* ok, we're the last bio for this extent, lets start
         * the decompression.
         */
-       tree = &BTRFS_I(inode)->io_tree;
-       ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+       ret = btrfs_decompress_biovec(cb->compress_type,
+                                     cb->compressed_pages,
                                      cb->start,
                                      cb->orig_bio->bi_io_vec,
                                      cb->orig_bio->bi_vcnt,
                                      cb->compressed_len);
 csum_failed:
        if (ret)
                cb->errors = 1;
@@ -343,7 +333,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
        struct compressed_bio *cb;
        unsigned long bytes_left;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       int page_index = 0;
+       int pg_index = 0;
        struct page *page;
        u64 first_byte = disk_start;
        struct block_device *bdev;
@@ -351,6 +341,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,

        WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
        cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+       if (!cb)
+               return -ENOMEM;
        atomic_set(&cb->pending_bios, 0);
        cb->errors = 0;
        cb->inode = inode;
@@ -365,14 +357,18 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
        bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;

        bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+       if(!bio) {
+               kfree(cb);
+               return -ENOMEM;
+       }
        bio->bi_private = cb;
        bio->bi_end_io = end_compressed_bio_write;
        atomic_inc(&cb->pending_bios);

        /* create and submit bios for the compressed pages */
        bytes_left = compressed_len;
-       for (page_index = 0; page_index < cb->nr_pages; page_index++) {
-               page = compressed_pages[page_index];
+       for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
+               page = compressed_pages[pg_index];
                page->mapping = inode->i_mapping;
                if (bio->bi_size)
                        ret = io_tree->ops->merge_bio_hook(page, 0,
@@ -437,7 +433,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                                     struct compressed_bio *cb)
 {
        unsigned long end_index;
-       unsigned long page_index;
+       unsigned long pg_index;
        u64 last_offset;
        u64 isize = i_size_read(inode);
        int ret;
@@ -461,13 +457,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
        end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;

        while (last_offset < compressed_end) {
-               page_index = last_offset >> PAGE_CACHE_SHIFT;
+               pg_index = last_offset >> PAGE_CACHE_SHIFT;

-               if (page_index > end_index)
+               if (pg_index > end_index)
                        break;

                rcu_read_lock();
-               page = radix_tree_lookup(&mapping->page_tree, page_index);
+               page = radix_tree_lookup(&mapping->page_tree, pg_index);
                rcu_read_unlock();
                if (page) {
                        misses++;
@@ -481,7 +477,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                if (!page)
                        break;

-               if (add_to_page_cache_lru(page, mapping, page_index,
+               if (add_to_page_cache_lru(page, mapping, pg_index,
                                          GFP_NOFS)) {
                        page_cache_release(page);
                        goto next;
@@ -565,7 +561,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
        unsigned long compressed_len;
        unsigned long nr_pages;
-       unsigned long page_index;
+       unsigned long pg_index;
        struct page *page;
        struct block_device *bdev;
        struct bio *comp_bio;
@@ -573,7 +569,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        u64 em_len;
        u64 em_start;
        struct extent_map *em;
-       int ret;
+       int ret = -ENOMEM;
        u32 *sums;

        tree = &BTRFS_I(inode)->io_tree;
@@ -588,6 +584,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,

        compressed_len = em->block_len;
        cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+       if (!cb)
+               goto out;
+
        atomic_set(&cb->pending_bios, 0);
        cb->errors = 0;
        cb->inode = inode;
@@ -603,17 +602,23 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,

        cb->len = uncompressed_len;
        cb->compressed_len = compressed_len;
+       cb->compress_type = extent_compress_type(bio_flags);
        cb->orig_bio = bio;

        nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
                                 PAGE_CACHE_SIZE;
-       cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+       cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
                                       GFP_NOFS);
+       if (!cb->compressed_pages)
+               goto fail1;
+
        bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;

-       for (page_index = 0; page_index < nr_pages; page_index++) {
-               cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+       for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+               cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
                                                              __GFP_HIGHMEM);
+               if (!cb->compressed_pages[pg_index])
+                       goto fail2;
        }
        cb->nr_pages = nr_pages;

@@ -624,12 +629,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        cb->len = uncompressed_len;

        comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+       if (!comp_bio)
+               goto fail2;
        comp_bio->bi_private = cb;
        comp_bio->bi_end_io = end_compressed_bio_read;
        atomic_inc(&cb->pending_bios);

-       for (page_index = 0; page_index < nr_pages; page_index++) {
-               page = cb->compressed_pages[page_index];
+       for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+               page = cb->compressed_pages[pg_index];
                page->mapping = inode->i_mapping;
                page->index = em_start >> PAGE_CACHE_SHIFT;

@@ -657,8 +664,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                        atomic_inc(&cb->pending_bios);

                        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-                               btrfs_lookup_bio_sums(root, inode, comp_bio,
-                                                     sums);
+                               ret = btrfs_lookup_bio_sums(root, inode,
+                                                           comp_bio, sums);
+                               BUG_ON(ret);
                        }
                        sums += (comp_bio->bi_size + root->sectorsize - 1) /
                                root->sectorsize;
@@ -683,12 +691,339 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
        BUG_ON(ret);

-       if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
-               btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+       if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+               ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+               BUG_ON(ret);
+       }

        ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
        BUG_ON(ret);

        bio_put(comp_bio);
        return 0;
+
+fail2:
+       for (pg_index = 0; pg_index < nr_pages; pg_index++)
+               free_page((unsigned long)cb->compressed_pages[pg_index]);
+
+       kfree(cb->compressed_pages);
+fail1:
+       kfree(cb);
+out:
+       free_extent_map(em);
+       return ret;
+}
+
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+
+struct btrfs_compress_op *btrfs_compress_op[] = {
+       &btrfs_zlib_compress,
+       &btrfs_lzo_compress,
+};
+
+int __init btrfs_init_compress(void)
+{
+       int i;
+
+       for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+               INIT_LIST_HEAD(&comp_idle_workspace[i]);
+               spin_lock_init(&comp_workspace_lock[i]);
+               atomic_set(&comp_alloc_workspace[i], 0);
+               init_waitqueue_head(&comp_workspace_wait[i]);
+       }
+       return 0;
+}
+
+/*
+ * this finds an available workspace or allocates a new one
+ * ERR_PTR is returned if things go bad.
+ */
+static struct list_head *find_workspace(int type)
+{
+       struct list_head *workspace;
+       int cpus = num_online_cpus();
+       int idx = type - 1;
+
+       struct list_head *idle_workspace        = &comp_idle_workspace[idx];
+       spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
+       atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
+       wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
+       int *num_workspace                      = &comp_num_workspace[idx];
+again:
+       spin_lock(workspace_lock);
+       if (!list_empty(idle_workspace)) {
+               workspace = idle_workspace->next;
+               list_del(workspace);
+               (*num_workspace)--;
+               spin_unlock(workspace_lock);
+               return workspace;
+
+       }
+       if (atomic_read(alloc_workspace) > cpus) {
+               DEFINE_WAIT(wait);
+
+               spin_unlock(workspace_lock);
+               prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+               if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+                       schedule();
+               finish_wait(workspace_wait, &wait);
+               goto again;
+       }
+       atomic_inc(alloc_workspace);
+       spin_unlock(workspace_lock);
+
+       workspace = btrfs_compress_op[idx]->alloc_workspace();
+       if (IS_ERR(workspace)) {
+               atomic_dec(alloc_workspace);
+               wake_up(workspace_wait);
+       }
+       return workspace;
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+       int idx = type - 1;
+       struct list_head *idle_workspace        = &comp_idle_workspace[idx];
+       spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
+       atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
+       wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
+       int *num_workspace                      = &comp_num_workspace[idx];
+
+       spin_lock(workspace_lock);
+       if (*num_workspace < num_online_cpus()) {
+               list_add_tail(workspace, idle_workspace);
+               (*num_workspace)++;
+               spin_unlock(workspace_lock);
+               goto wake;
+       }
+       spin_unlock(workspace_lock);
+
+       btrfs_compress_op[idx]->free_workspace(workspace);
+       atomic_dec(alloc_workspace);
+wake:
+       if (waitqueue_active(workspace_wait))
+               wake_up(workspace_wait);
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+       struct list_head *workspace;
+       int i;
+
+       for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+               while (!list_empty(&comp_idle_workspace[i])) {
+                       workspace = comp_idle_workspace[i].next;
+                       list_del(workspace);
+                       btrfs_compress_op[i]->free_workspace(workspace);
+                       atomic_dec(&comp_alloc_workspace[i]);
+               }
+       }
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated. There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read. It
+ * may be smaller then len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+                        u64 start, unsigned long len,
+                        struct page **pages,
+                        unsigned long nr_dest_pages,
+                        unsigned long *out_pages,
+                        unsigned long *total_in,
+                        unsigned long *total_out,
+                        unsigned long max_out)
+{
+       struct list_head *workspace;
+       int ret;
+
+       workspace = find_workspace(type);
+       if (IS_ERR(workspace))
+               return -1;
+
+       ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+                                                     start, len, pages,
+                                                     nr_dest_pages, out_pages,
+                                                     total_in, total_out,
+                                                     max_out);
+       free_workspace(type, workspace);
+       return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous. They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+                           struct bio_vec *bvec, int vcnt, size_t srclen)
+{
+       struct list_head *workspace;
+       int ret;
+
+       workspace = find_workspace(type);
+       if (IS_ERR(workspace))
+               return -ENOMEM;
+
+       ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+                                                        disk_start,
+                                                        bvec, vcnt, srclen);
+       free_workspace(type, workspace);
+       return ret;
+}
+
+/*
+ * a less complex decompression routine. Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+                    unsigned long start_byte, size_t srclen, size_t destlen)
+{
+       struct list_head *workspace;
+       int ret;
+
+       workspace = find_workspace(type);
+       if (IS_ERR(workspace))
+               return -ENOMEM;
+
+       ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+                                                 dest_page, start_byte,
+                                                 srclen, destlen);
+
+       free_workspace(type, workspace);
+       return ret;
+}
+
+void btrfs_exit_compress(void)
+{
+       free_workspaces();
+}
+
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf_start is the byte offset we're of the start of our workspace buffer.
+ *
+ * total_out is the last byte of the buffer
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+                             unsigned long total_out, u64 disk_start,
+                             struct bio_vec *bvec, int vcnt,
+                             unsigned long *pg_index,
+                             unsigned long *pg_offset)
+{
+       unsigned long buf_offset;
+       unsigned long current_buf_start;
+       unsigned long start_byte;
+       unsigned long working_bytes = total_out - buf_start;
+       unsigned long bytes;
+       char *kaddr;
+       struct page *page_out = bvec[*pg_index].bv_page;
+
+       /*
+        * start byte is the first byte of the page we're currently
+        * copying into relative to the start of the compressed data.
+        */
+       start_byte = page_offset(page_out) - disk_start;
+
+       /* we haven't yet hit data corresponding to this page */
+       if (total_out <= start_byte)
+               return 1;
+
+       /*
+        * the start of the data we care about is offset into
+        * the middle of our working buffer
+        */
+       if (total_out > start_byte && buf_start < start_byte) {
+               buf_offset = start_byte - buf_start;
+               working_bytes -= buf_offset;
+       } else {
+               buf_offset = 0;
+       }
+       current_buf_start = buf_start;
+
+       /* copy bytes from the working buffer into the pages */
+       while (working_bytes > 0) {
+               bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+                           PAGE_CACHE_SIZE - buf_offset);
+               bytes = min(bytes, working_bytes);
+               kaddr = kmap_atomic(page_out, KM_USER0);
+               memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+               kunmap_atomic(kaddr, KM_USER0);
+               flush_dcache_page(page_out);
+
+               *pg_offset += bytes;
+               buf_offset += bytes;
+               working_bytes -= bytes;
+               current_buf_start += bytes;
+
+               /* check if we need to pick another page */
+               if (*pg_offset == PAGE_CACHE_SIZE) {
+                       (*pg_index)++;
+                       if (*pg_index >= vcnt)
+                               return 0;
+
+                       page_out = bvec[*pg_index].bv_page;
+                       *pg_offset = 0;
+                       start_byte = page_offset(page_out) - disk_start;
+
+                       /*
+                        * make sure our new page is covered by this
+                        * working buffer
+                        */
+                       if (total_out <= start_byte)
+                               return 1;
+
+                       /*
+                        * the next page in the biovec might not be adjacent
+                        * to the last page, but it might still be found
+                        * inside this working buffer. bump our offset pointer
+                        */
+                       if (total_out > start_byte &&
+                           current_buf_start < start_byte) {
+                               buf_offset = start_byte - buf_start;
+                               working_bytes = total_out - start_byte;
+                               current_buf_start = buf_start + buf_offset;
+                       }
+               }
+       }
+
+       return 1;
 }
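
The bulk of the new compression.c code is a small pool allocator: one idle
list, lock, allocation counter and wait queue per compression type, with at
most num_online_cpus() workspaces ever live for each type. find_workspace()
hands out an idle workspace or allocates a fresh one through the per-type
ops table, and callers past the CPU cap sleep until free_workspace() either
re-idles a workspace or frees it and wakes a waiter. Consumers never touch
the pool directly; they go through the type-dispatched wrappers, roughly
like this (a hedged sketch, with the destination array sized arbitrarily):

        struct page *dest[16];  /* assumed cap; real callers size this */
        unsigned long out_pages = 0, total_in = 0, total_out = 0;
        int ret;

        ret = btrfs_compress_pages(BTRFS_COMPRESS_ZLIB, inode->i_mapping,
                                   start, len, dest, 16, &out_pages,
                                   &total_in, &total_out,
                                   16 * PAGE_CACHE_SIZE);
        /* even on failure, out_pages pages may have been allocated
         * and still need to be freed by the caller */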
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa715..a12059f4f0fd 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,27 @@
 #ifndef __BTRFS_COMPRESSION_
 #define __BTRFS_COMPRESSION_

-int btrfs_zlib_decompress(unsigned char *data_in,
-                         struct page *dest_page,
-                         unsigned long start_byte,
-                         size_t srclen, size_t destlen);
-int btrfs_zlib_compress_pages(struct address_space *mapping,
-                             u64 start, unsigned long len,
-                             struct page **pages,
-                             unsigned long nr_dest_pages,
-                             unsigned long *out_pages,
-                             unsigned long *total_in,
-                             unsigned long *total_out,
-                             unsigned long max_out);
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-                                u64 disk_start,
-                                struct bio_vec *bvec,
-                                int vcnt,
-                                size_t srclen);
-void btrfs_zlib_exit(void);
+int btrfs_init_compress(void);
+void btrfs_exit_compress(void);
+
+int btrfs_compress_pages(int type, struct address_space *mapping,
+                        u64 start, unsigned long len,
+                        struct page **pages,
+                        unsigned long nr_dest_pages,
+                        unsigned long *out_pages,
+                        unsigned long *total_in,
+                        unsigned long *total_out,
+                        unsigned long max_out);
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+                           struct bio_vec *bvec, int vcnt, size_t srclen);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+                    unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+                             unsigned long total_out, u64 disk_start,
+                             struct bio_vec *bvec, int vcnt,
+                             unsigned long *pg_index,
+                             unsigned long *pg_offset);
+
 int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                                  unsigned long len, u64 disk_start,
                                  unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                                  unsigned long nr_pages);
 int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags);
+
+struct btrfs_compress_op {
+       struct list_head *(*alloc_workspace)(void);
+
+       void (*free_workspace)(struct list_head *workspace);
+
+       int (*compress_pages)(struct list_head *workspace,
+                             struct address_space *mapping,
+                             u64 start, unsigned long len,
+                             struct page **pages,
+                             unsigned long nr_dest_pages,
+                             unsigned long *out_pages,
+                             unsigned long *total_in,
+                             unsigned long *total_out,
+                             unsigned long max_out);
+
+       int (*decompress_biovec)(struct list_head *workspace,
+                                struct page **pages_in,
+                                u64 disk_start,
+                                struct bio_vec *bvec,
+                                int vcnt,
+                                size_t srclen);
+
+       int (*decompress)(struct list_head *workspace,
+                         unsigned char *data_in,
+                         struct page *dest_page,
+                         unsigned long start_byte,
+                         size_t srclen, size_t destlen);
+};
+
+extern struct btrfs_compress_op btrfs_zlib_compress;
+extern struct btrfs_compress_op btrfs_lzo_compress;
+
 #endif
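
compression.h now treats compression as a small strategy interface: each
backend exports a btrfs_compress_op with its own workspace lifecycle plus
the three data operations, and compression.c dispatches through
btrfs_compress_op[type - 1]. A hypothetical third backend would only need
to fill in the same slots; nothing below exists in this patch, it is purely
illustrative:

        /* hypothetical backend, for illustration only */
        static struct list_head *dummy_alloc_workspace(void)
        {
                struct list_head *ws = kzalloc(sizeof(*ws), GFP_NOFS);

                if (!ws)
                        return ERR_PTR(-ENOMEM);
                INIT_LIST_HEAD(ws);
                return ws;
        }

        static void dummy_free_workspace(struct list_head *ws)
        {
                kfree(ws);
        }

        struct btrfs_compress_op btrfs_dummy_compress = {
                .alloc_workspace        = dummy_alloc_workspace,
                .free_workspace         = dummy_free_workspace,
                /* .compress_pages, .decompress_biovec and .decompress
                 * follow the prototypes declared in the struct above */
        };

The real backends (zlib.c and the new lzo.c) embed the list_head at the top
of a larger workspace struct that carries their scratch buffers, so the
pool code can shuffle opaque list heads while each backend recovers its own
state with container_of().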
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c3df14ce2cc2..2e667868e0d2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,18 +38,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *src_buf);
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                   struct btrfs_path *path, int level, int slot);
-static int setup_items_for_insert(struct btrfs_trans_handle *trans,
-                       struct btrfs_root *root, struct btrfs_path *path,
-                       struct btrfs_key *cpu_key, u32 *data_size,
-                       u32 total_data, u32 total_size, int nr);
-

 struct btrfs_path *btrfs_alloc_path(void)
 {
        struct btrfs_path *path;
        path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
-       if (path)
-               path->reada = 1;
        return path;
 }

@@ -105,7 +98,9 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
-       btrfs_release_path(NULL, p);
+       if (!p)
+               return;
+       btrfs_release_path(p);
        kmem_cache_free(btrfs_path_cachep, p);
 }

@@ -115,7 +110,7 @@ void btrfs_free_path(struct btrfs_path *p)
  *
  * It is safe to call this on paths that no locks or extent buffers held.
  */
-noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+noinline void btrfs_release_path(struct btrfs_path *p)
 {
        int i;

@@ -145,10 +140,11 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 {
        struct extent_buffer *eb;
-       spin_lock(&root->node_lock);
-       eb = root->node;
+
+       rcu_read_lock();
+       eb = rcu_dereference(root->node);
        extent_buffer_get(eb);
-       spin_unlock(&root->node_lock);
+       rcu_read_unlock();
        return eb;
 }

@@ -163,14 +159,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
        while (1) {
                eb = btrfs_root_node(root);
                btrfs_tree_lock(eb);
-
-               spin_lock(&root->node_lock);
-               if (eb == root->node) {
-                       spin_unlock(&root->node_lock);
+               if (eb == root->node)
                        break;
-               }
-               spin_unlock(&root->node_lock);
-
                btrfs_tree_unlock(eb);
                free_extent_buffer(eb);
        }
@@ -200,7 +190,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
                      struct extent_buffer **cow_ret, u64 new_root_objectid)
 {
        struct extent_buffer *cow;
-       u32 nritems;
        int ret = 0;
        int level;
        struct btrfs_disk_key disk_key;
@@ -210,7 +199,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
        WARN_ON(root->ref_cows && trans->transid != root->last_trans);

        level = btrfs_header_level(buf);
-       nritems = btrfs_header_nritems(buf);
        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
@@ -458,10 +446,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        else
                parent_start = 0;

-       spin_lock(&root->node_lock);
-       root->node = cow;
        extent_buffer_get(cow);
-       spin_unlock(&root->node_lock);
+       rcu_assign_pointer(root->node, cow);

        btrfs_free_tree_block(trans, root, buf, parent_start,
                              last_ref);
@@ -542,6 +528,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,

        ret = __btrfs_cow_block(trans, root, buf, parent,
                                 parent_slot, cow_ret, search_start, 0);
+
+       trace_btrfs_cow_block(root, buf, *cow_ret);
+
        return ret;
 }

@@ -686,6 +675,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                if (!cur) {
                        cur = read_tree_block(root, blocknr,
                                              blocksize, gen);
+                       if (!cur)
+                               return -EIO;
                } else if (!uptodate) {
                        btrfs_read_buffer(cur, gen);
                }
@@ -732,122 +723,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
        return btrfs_item_offset_nr(leaf, nr - 1);
 }

-/*
- * extra debugging checks to make sure all the items in a key are
- * well formed and in the proper order
- */
-static int check_node(struct btrfs_root *root, struct btrfs_path *path,
-                     int level)
-{
-       struct extent_buffer *parent = NULL;
-       struct extent_buffer *node = path->nodes[level];
-       struct btrfs_disk_key parent_key;
-       struct btrfs_disk_key node_key;
-       int parent_slot;
-       int slot;
-       struct btrfs_key cpukey;
-       u32 nritems = btrfs_header_nritems(node);
-
-       if (path->nodes[level + 1])
-               parent = path->nodes[level + 1];
-
-       slot = path->slots[level];
-       BUG_ON(nritems == 0);
-       if (parent) {
-               parent_slot = path->slots[level + 1];
-               btrfs_node_key(parent, &parent_key, parent_slot);
-               btrfs_node_key(node, &node_key, 0);
-               BUG_ON(memcmp(&parent_key, &node_key,
-                             sizeof(struct btrfs_disk_key)));
-               BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-                      btrfs_header_bytenr(node));
-       }
-       BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
-       if (slot != 0) {
-               btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
-               btrfs_node_key(node, &node_key, slot);
-               BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
-       }
-       if (slot < nritems - 1) {
-               btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
-               btrfs_node_key(node, &node_key, slot);
-               BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
-       }
-       return 0;
-}
-
-/*
- * extra checking to make sure all the items in a leaf are
- * well formed and in the proper order
- */
-static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
-                     int level)
-{
-       struct extent_buffer *leaf = path->nodes[level];
-       struct extent_buffer *parent = NULL;
-       int parent_slot;
-       struct btrfs_key cpukey;
-       struct btrfs_disk_key parent_key;
-       struct btrfs_disk_key leaf_key;
-       int slot = path->slots[0];
-
-       u32 nritems = btrfs_header_nritems(leaf);
-
-       if (path->nodes[level + 1])
-               parent = path->nodes[level + 1];
-
-       if (nritems == 0)
-               return 0;
-
-       if (parent) {
-               parent_slot = path->slots[level + 1];
-               btrfs_node_key(parent, &parent_key, parent_slot);
-               btrfs_item_key(leaf, &leaf_key, 0);
-
-               BUG_ON(memcmp(&parent_key, &leaf_key,
-                      sizeof(struct btrfs_disk_key)));
-               BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-                      btrfs_header_bytenr(leaf));
-       }
-       if (slot != 0 && slot < nritems - 1) {
-               btrfs_item_key(leaf, &leaf_key, slot);
-               btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
-               if (comp_keys(&leaf_key, &cpukey) <= 0) {
-                       btrfs_print_leaf(root, leaf);
-                       printk(KERN_CRIT "slot %d offset bad key\n", slot);
-                       BUG_ON(1);
-               }
-               if (btrfs_item_offset_nr(leaf, slot - 1) !=
-                      btrfs_item_end_nr(leaf, slot)) {
-                       btrfs_print_leaf(root, leaf);
-                       printk(KERN_CRIT "slot %d offset bad\n", slot);
-                       BUG_ON(1);
-               }
-       }
-       if (slot < nritems - 1) {
-               btrfs_item_key(leaf, &leaf_key, slot);
-               btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
-               BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
-               if (btrfs_item_offset_nr(leaf, slot) !=
-                       btrfs_item_end_nr(leaf, slot + 1)) {
-                       btrfs_print_leaf(root, leaf);
-                       printk(KERN_CRIT "slot %d offset bad\n", slot);
-                       BUG_ON(1);
-               }
-       }
-       BUG_ON(btrfs_item_offset_nr(leaf, 0) +
-              btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
-       return 0;
-}
-
-static noinline int check_block(struct btrfs_root *root,
-                               struct btrfs_path *path, int level)
-{
-       return 0;
-       if (level == 0)
-               return check_leaf(root, path, level);
-       return check_node(root, path, level);
-}

 /*
  * search for key in the extent_buffer. The items start at offset p,
@@ -1008,7 +883,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
        int wret;
        int pslot;
        int orig_slot = path->slots[level];
-       int err_on_enospc = 0;
        u64 orig_ptr;

        if (level == 0)
@@ -1047,9 +921,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                        goto enospc;
                }

-               spin_lock(&root->node_lock);
-               root->node = child;
-               spin_unlock(&root->node_lock);
+               rcu_assign_pointer(root->node, child);

                add_root_to_dirty_list(root);
                btrfs_tree_unlock(child);
@@ -1071,8 +943,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
            BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
                return 0;

-       if (btrfs_header_nritems(mid) < 2)
-               err_on_enospc = 1;
+       btrfs_header_nritems(mid);

        left = read_node_slot(root, parent, pslot - 1);
        if (left) {
@@ -1103,8 +974,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                wret = push_node_left(trans, root, left, mid, 1);
                if (wret < 0)
                        ret = wret;
-               if (btrfs_header_nritems(mid) < 2)
-                       err_on_enospc = 1;
+               btrfs_header_nritems(mid);
        }

        /*
@@ -1191,7 +1061,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                }
        }
        /* double check we haven't messed things up */
-       check_block(root, path, level);
        if (orig_ptr !=
            btrfs_node_blockptr(path->nodes[level], path->slots[level]))
                BUG();
@@ -1224,14 +1093,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
        int wret;
        int pslot;
        int orig_slot = path->slots[level];
-       u64 orig_ptr;

        if (level == 0)
                return 1;

        mid = path->nodes[level];
        WARN_ON(btrfs_header_generation(mid) != trans->transid);
-       orig_ptr = btrfs_node_blockptr(mid, orig_slot);

        if (level < BTRFS_MAX_LEVEL - 1)
                parent = path->nodes[level + 1];
@@ -1355,11 +1222,13 @@ static void reada_for_search(struct btrfs_root *root,
        u64 search;
        u64 target;
        u64 nread = 0;
+       u64 gen;
        int direction = path->reada;
        struct extent_buffer *eb;
        u32 nr;
        u32 blocksize;
        u32 nscan = 0;
+       bool map = true;

        if (level != 1)
                return;
@@ -1381,7 +1250,19 @@ static void reada_for_search(struct btrfs_root *root,

        nritems = btrfs_header_nritems(node);
        nr = slot;
+       if (node->map_token || path->skip_locking)
+               map = false;
+
        while (1) {
+               if (map && !node->map_token) {
+                       unsigned long offset = btrfs_node_key_ptr_offset(nr);
+                       map_private_extent_buffer(node, offset,
+                                                 sizeof(struct btrfs_key_ptr),
+                                                 &node->map_token,
+                                                 &node->kaddr,
+                                                 &node->map_start,
+                                                 &node->map_len, KM_USER1);
+               }
                if (direction < 0) {
                        if (nr == 0)
                                break;
@@ -1399,14 +1280,23 @@ static void reada_for_search(struct btrfs_root *root,
                search = btrfs_node_blockptr(node, nr);
                if ((search <= target && target - search <= 65536) ||
                    (search > target && search - target <= 65536)) {
-                       readahead_tree_block(root, search, blocksize,
-                                            btrfs_node_ptr_generation(node, nr));
+                       gen = btrfs_node_ptr_generation(node, nr);
+                       if (map && node->map_token) {
+                               unmap_extent_buffer(node, node->map_token,
+                                                   KM_USER1);
+                               node->map_token = NULL;
+                       }
+                       readahead_tree_block(root, search, blocksize, gen);
                        nread += blocksize;
                }
                nscan++;
                if ((nread > 65536 || nscan > 32))
                        break;
        }
+       if (map && node->map_token) {
+               unmap_extent_buffer(node, node->map_token, KM_USER1);
+               node->map_token = NULL;
+       }
 }

 /*
@@ -1454,7 +1344,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
                ret = -EAGAIN;

        /* release the whole path */
-       btrfs_release_path(root, path);
+       btrfs_release_path(path);

        /* read the blocks */
        if (block1)
@@ -1577,13 +1467,33 @@ read_block_for_search(struct btrfs_trans_handle *trans,
        blocksize = btrfs_level_size(root, level - 1);

        tmp = btrfs_find_tree_block(root, blocknr, blocksize);
-       if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
-               /*
-                * we found an up to date block without sleeping, return
-                * right away
-                */
-               *eb_ret = tmp;
-               return 0;
+       if (tmp) {
+               if (btrfs_buffer_uptodate(tmp, 0)) {
+                       if (btrfs_buffer_uptodate(tmp, gen)) {
+                               /*
+                                * we found an up to date block without
+                                * sleeping, return
+                                * right away
+                                */
+                               *eb_ret = tmp;
+                               return 0;
+                       }
+                       /* the pages were up to date, but we failed
+                        * the generation number check. Do a full
+                        * read for the generation number that is correct.
+                        * We must do this without dropping locks so
+                        * we can trust our generation number
+                        */
+                       free_extent_buffer(tmp);
+                       tmp = read_tree_block(root, blocknr, blocksize, gen);
+                       if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+                               *eb_ret = tmp;
+                               return 0;
+                       }
+                       free_extent_buffer(tmp);
+                       btrfs_release_path(p);
+                       return -EIO;
+               }
        }

        /*
@@ -1596,12 +1506,11 @@ read_block_for_search(struct btrfs_trans_handle *trans,
        btrfs_unlock_up_safe(p, level + 1);
        btrfs_set_path_blocking(p);

-       if (tmp)
-               free_extent_buffer(tmp);
+       free_extent_buffer(tmp);
        if (p->reada)
                reada_for_search(root, p, level, slot, key->objectid);

-       btrfs_release_path(NULL, p);
+       btrfs_release_path(p);

        ret = -EAGAIN;
        tmp = read_tree_block(root, blocknr, blocksize, 0);
@@ -1670,7 +1579,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
        }
        b = p->nodes[level];
        if (!b) {
-               btrfs_release_path(NULL, p);
+               btrfs_release_path(p);
                goto again;
        }
        BUG_ON(btrfs_header_nritems(b) == 1);
@@ -1760,9 +1669,6 @@ again:
        }
 cow_done:
        BUG_ON(!cow && ins_len);
-       if (level != btrfs_header_level(b))
-               WARN_ON(1);
-       level = btrfs_header_level(b);

        p->nodes[level] = b;
        if (!p->skip_locking)
@@ -1784,12 +1690,6 @@ cow_done:
                if (!cow)
                        btrfs_unlock_up_safe(p, level + 1);

-               ret = check_block(root, p, level);
-               if (ret) {
-                       ret = -1;
-                       goto done;
-               }
-
                ret = bin_search(b, key, level, &slot);

                if (level != 0) {
@@ -1866,7 +1766,7 @@ done:
        if (!p->leave_spinning)
                btrfs_set_path_blocking(p);
        if (ret < 0)
-               btrfs_release_path(root, p);
+               btrfs_release_path(p);
        return ret;
 }

@@ -2116,10 +2016,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,

        btrfs_mark_buffer_dirty(c);

-       spin_lock(&root->node_lock);
        old = root->node;
-       root->node = c;
-       spin_unlock(&root->node_lock);
+       rcu_assign_pointer(root->node, c);

        /* the super has an extra ref to root->node */
        free_extent_buffer(old);
@@ -2502,6 +2400,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
        btrfs_assert_tree_locked(path->nodes[1]);

        right = read_node_slot(root, upper, slot + 1);
+       if (right == NULL)
+               return 1;
+
        btrfs_tree_lock(right);
        btrfs_set_lock_blocking(right);

@@ -2548,7 +2449,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 {
        struct btrfs_disk_key disk_key;
        struct extent_buffer *right = path->nodes[0];
-       int slot;
        int i;
        int push_space = 0;
        int push_items = 0;
@@ -2560,8 +2460,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        u32 this_item_size;
        u32 old_left_item_size;

-       slot = path->slots[1];
-
        if (empty)
                nr = min(right_nritems, max_slot);
        else
@@ -2755,6 +2653,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
        btrfs_assert_tree_locked(path->nodes[1]);

        left = read_node_slot(root, path->nodes[1], slot - 1);
+       if (left == NULL)
+               return 1;
+
        btrfs_tree_lock(left);
        btrfs_set_lock_blocking(left);

@@ -3138,7 +3039,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
                                    struct btrfs_file_extent_item);
                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
        }
-       btrfs_release_path(root, path);
+       btrfs_release_path(path);

        path->keep_locks = 1;
        path->search_for_split = 1;
@@ -3328,9 +3229,7 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
                        struct btrfs_path *path,
                        u32 new_size, int from_end)
 {
-       int ret = 0;
        int slot;
-       int slot_orig;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
@@ -3340,7 +3239,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
        unsigned int size_diff;
        int i;

-       slot_orig = path->slots[0];
        leaf = path->nodes[0];
        slot = path->slots[0];

@@ -3428,12 +3326,11 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
        btrfs_set_item_size(leaf, item, new_size);
        btrfs_mark_buffer_dirty(leaf);

-       ret = 0;
        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
-       return ret;
+       return 0;
 }

 /*
@@ -3443,9 +3340,7 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, struct btrfs_path *path,
                      u32 data_size)
 {
-       int ret = 0;
        int slot;
-       int slot_orig;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
@@ -3454,7 +3349,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
        unsigned int old_size;
        int i;

-       slot_orig = path->slots[0];
        leaf = path->nodes[0];

        nritems = btrfs_header_nritems(leaf);
@@ -3510,12 +3404,11 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
        btrfs_set_item_size(leaf, item, old_size + data_size);
        btrfs_mark_buffer_dirty(leaf);

-       ret = 0;
        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
-       return ret;
+       return 0;
 }

 /*
@@ -3675,11 +3568,10 @@ out:
  * to save stack depth by doing the bulk of the work in a function
  * that doesn't call btrfs_search_slot
  */
-static noinline_for_stack int
-setup_items_for_insert(struct btrfs_trans_handle *trans,
-                      struct btrfs_root *root, struct btrfs_path *path,
-                      struct btrfs_key *cpu_key, u32 *data_size,
-                      u32 total_data, u32 total_size, int nr)
+int setup_items_for_insert(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, struct btrfs_path *path,
+                          struct btrfs_key *cpu_key, u32 *data_size,
+                          u32 total_data, u32 total_size, int nr)
 {
        struct btrfs_item *item;
        int i;
@@ -3763,7 +3655,6 @@ setup_items_for_insert(struct btrfs_trans_handle *trans,

        ret = 0;
        if (slot == 0) {
-               struct btrfs_disk_key disk_key;
                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
                ret = fixup_low_keys(trans, root, path, &disk_key, 1);
        }
@@ -3787,7 +3678,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                             struct btrfs_key *cpu_key, u32 *data_size,
3788 int nr) 3679 int nr)
3789{ 3680{
3790 struct extent_buffer *leaf;
3791 int ret = 0; 3681 int ret = 0;
3792 int slot; 3682 int slot;
3793 int i; 3683 int i;
@@ -3804,7 +3694,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3804 if (ret < 0) 3694 if (ret < 0)
3805 goto out; 3695 goto out;
3806 3696
3807 leaf = path->nodes[0];
3808 slot = path->slots[0]; 3697 slot = path->slots[0];
3809 BUG_ON(slot < 0); 3698 BUG_ON(slot < 0);
3810 3699
@@ -3829,7 +3718,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3829 unsigned long ptr; 3718 unsigned long ptr;
3830 3719
3831 path = btrfs_alloc_path(); 3720 path = btrfs_alloc_path();
3832 BUG_ON(!path); 3721 if (!path)
3722 return -ENOMEM;
3833 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 3723 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3834 if (!ret) { 3724 if (!ret) {
3835 leaf = path->nodes[0]; 3725 leaf = path->nodes[0];
@@ -4066,7 +3956,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4066 else 3956 else
4067 return 1; 3957 return 1;
4068 3958
4069 btrfs_release_path(root, path); 3959 btrfs_release_path(path);
4070 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3960 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4071 if (ret < 0) 3961 if (ret < 0)
4072 return ret; 3962 return ret;
@@ -4190,7 +4080,7 @@ find_next_key:
4190 sret = btrfs_find_next_key(root, path, min_key, level, 4080 sret = btrfs_find_next_key(root, path, min_key, level,
4191 cache_only, min_trans); 4081 cache_only, min_trans);
4192 if (sret == 0) { 4082 if (sret == 0) {
4193 btrfs_release_path(root, path); 4083 btrfs_release_path(path);
4194 goto again; 4084 goto again;
4195 } else { 4085 } else {
4196 goto out; 4086 goto out;
@@ -4206,6 +4096,7 @@ find_next_key:
4206 } 4096 }
4207 btrfs_set_path_blocking(path); 4097 btrfs_set_path_blocking(path);
4208 cur = read_node_slot(root, cur, slot); 4098 cur = read_node_slot(root, cur, slot);
4099 BUG_ON(!cur);
4209 4100
4210 btrfs_tree_lock(cur); 4101 btrfs_tree_lock(cur);
4211 4102
@@ -4268,7 +4159,7 @@ next:
4268 btrfs_node_key_to_cpu(c, &cur_key, slot); 4159 btrfs_node_key_to_cpu(c, &cur_key, slot);
4269 4160
4270 orig_lowest = path->lowest_level; 4161 orig_lowest = path->lowest_level;
4271 btrfs_release_path(root, path); 4162 btrfs_release_path(path);
4272 path->lowest_level = level; 4163 path->lowest_level = level;
4273 ret = btrfs_search_slot(NULL, root, &cur_key, path, 4164 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4274 0, 0); 4165 0, 0);
@@ -4345,7 +4236,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4345again: 4236again:
4346 level = 1; 4237 level = 1;
4347 next = NULL; 4238 next = NULL;
4348 btrfs_release_path(root, path); 4239 btrfs_release_path(path);
4349 4240
4350 path->keep_locks = 1; 4241 path->keep_locks = 1;
4351 4242
@@ -4401,7 +4292,7 @@ again:
4401 goto again; 4292 goto again;
4402 4293
4403 if (ret < 0) { 4294 if (ret < 0) {
4404 btrfs_release_path(root, path); 4295 btrfs_release_path(path);
4405 goto done; 4296 goto done;
4406 } 4297 }
4407 4298
@@ -4440,7 +4331,7 @@ again:
4440 goto again; 4331 goto again;
4441 4332
4442 if (ret < 0) { 4333 if (ret < 0) {
4443 btrfs_release_path(root, path); 4334 btrfs_release_path(path);
4444 goto done; 4335 goto done;
4445 } 4336 }
4446 4337
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eaf286abad17..3b859a3e6a0e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -19,18 +19,21 @@
19#ifndef __BTRFS_CTREE__ 19#ifndef __BTRFS_CTREE__
20#define __BTRFS_CTREE__ 20#define __BTRFS_CTREE__
21 21
22#include <linux/version.h>
23#include <linux/mm.h> 22#include <linux/mm.h>
24#include <linux/highmem.h> 23#include <linux/highmem.h>
25#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/rwsem.h>
26#include <linux/completion.h> 26#include <linux/completion.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/kobject.h>
31#include <trace/events/btrfs.h>
30#include <asm/kmap_types.h> 32#include <asm/kmap_types.h>
31#include "extent_io.h" 33#include "extent_io.h"
32#include "extent_map.h" 34#include "extent_map.h"
33#include "async-thread.h" 35#include "async-thread.h"
36#include "ioctl.h"
34 37
35struct btrfs_trans_handle; 38struct btrfs_trans_handle;
36struct btrfs_transaction; 39struct btrfs_transaction;
@@ -39,6 +42,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
39extern struct kmem_cache *btrfs_transaction_cachep; 42extern struct kmem_cache *btrfs_transaction_cachep;
40extern struct kmem_cache *btrfs_bit_radix_cachep; 43extern struct kmem_cache *btrfs_bit_radix_cachep;
41extern struct kmem_cache *btrfs_path_cachep; 44extern struct kmem_cache *btrfs_path_cachep;
45extern struct kmem_cache *btrfs_free_space_cachep;
42struct btrfs_ordered_sum; 46struct btrfs_ordered_sum;
43 47
44#define BTRFS_MAGIC "_BHRfS_M" 48#define BTRFS_MAGIC "_BHRfS_M"
@@ -99,6 +103,15 @@ struct btrfs_ordered_sum;
99 */ 103 */
100#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL 104#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
101 105
106/* For storing free space cache */
107#define BTRFS_FREE_SPACE_OBJECTID -11ULL
108
109/*
110 * The inode number assigned to the special inode for storing
111 * the free ino cache
112 */
113#define BTRFS_FREE_INO_OBJECTID -12ULL
114
102/* dummy objectid represents multiple objectids */ 115/* dummy objectid represents multiple objectids */
103#define BTRFS_MULTIPLE_OBJECTIDS -255ULL 116#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
104 117
@@ -181,7 +194,6 @@ struct btrfs_mapping_tree {
181 struct extent_map_tree map_tree; 194 struct extent_map_tree map_tree;
182}; 195};
183 196
184#define BTRFS_UUID_SIZE 16
185struct btrfs_dev_item { 197struct btrfs_dev_item {
186 /* the internal btrfs device id */ 198 /* the internal btrfs device id */
187 __le64 devid; 199 __le64 devid;
@@ -265,6 +277,22 @@ struct btrfs_chunk {
265 /* additional stripes go here */ 277 /* additional stripes go here */
266} __attribute__ ((__packed__)); 278} __attribute__ ((__packed__));
267 279
280#define BTRFS_FREE_SPACE_EXTENT 1
281#define BTRFS_FREE_SPACE_BITMAP 2
282
283struct btrfs_free_space_entry {
284 __le64 offset;
285 __le64 bytes;
286 u8 type;
287} __attribute__ ((__packed__));
288
289struct btrfs_free_space_header {
290 struct btrfs_disk_key location;
291 __le64 generation;
292 __le64 num_entries;
293 __le64 num_bitmaps;
294} __attribute__ ((__packed__));
295
268static inline unsigned long btrfs_chunk_item_size(int num_stripes) 296static inline unsigned long btrfs_chunk_item_size(int num_stripes)
269{ 297{
270 BUG_ON(num_stripes == 0); 298 BUG_ON(num_stripes == 0);
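The two structures added above define the on-disk layout of the free
space cache: one btrfs_free_space_header at the front of the cache
inode, followed by packed records. A hypothetical helper, just to show
the arithmetic (each packed entry is 8 + 8 + 1 = 17 bytes):

	/* sketch: bytes consumed by the extent/bitmap entry records */
	static inline u64 free_space_entry_bytes(u64 num_entries)
	{
		return num_entries * sizeof(struct btrfs_free_space_entry);
	}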
@@ -272,9 +300,16 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
272 sizeof(struct btrfs_stripe) * (num_stripes - 1); 300 sizeof(struct btrfs_stripe) * (num_stripes - 1);
273} 301}
274 302
275#define BTRFS_FSID_SIZE 16
276#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) 303#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
277#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) 304#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
305
306/*
307 * File system states
308 */
309
310/* Errors detected */
311#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
312
278#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) 313#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
279#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) 314#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
280 315
@@ -365,8 +400,10 @@ struct btrfs_super_block {
365 400
366 char label[BTRFS_LABEL_SIZE]; 401 char label[BTRFS_LABEL_SIZE];
367 402
403 __le64 cache_generation;
404
368 /* future expansion */ 405 /* future expansion */
369 __le64 reserved[32]; 406 __le64 reserved[31];
370 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; 407 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
371} __attribute__ ((__packed__)); 408} __attribute__ ((__packed__));
372 409
@@ -375,13 +412,17 @@ struct btrfs_super_block {
375 * ones specified below then we will fail to mount 412 * ones specified below then we will fail to mount
376 */ 413 */
377#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) 414#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
378#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (2ULL << 0) 415#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
416#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
417#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
379 418
380#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 419#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
381#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 420#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
382#define BTRFS_FEATURE_INCOMPAT_SUPP \ 421#define BTRFS_FEATURE_INCOMPAT_SUPP \
383 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ 422 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
384 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL) 423 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
424 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
425 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
385 426
386/* 427/*
387 * A leaf is full of items. offset and size tell us where to find 428 * A leaf is full of items. offset and size tell us where to find
@@ -474,6 +515,12 @@ struct btrfs_extent_item_v0 {
474/* use full backrefs for extent pointers in the block */ 515/* use full backrefs for extent pointers in the block */
475#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) 516#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
476 517
518/*
519 * this flag is only used internally by scrub and may be changed at any time
520 * it is only declared here to avoid collisions
521 */
522#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48)
523
477struct btrfs_tree_block_info { 524struct btrfs_tree_block_info {
478 struct btrfs_disk_key key; 525 struct btrfs_disk_key key;
479 u8 level; 526 u8 level;
@@ -528,9 +575,11 @@ struct btrfs_timespec {
528} __attribute__ ((__packed__)); 575} __attribute__ ((__packed__));
529 576
530enum btrfs_compression_type { 577enum btrfs_compression_type {
531 BTRFS_COMPRESS_NONE = 0, 578 BTRFS_COMPRESS_NONE = 0,
532 BTRFS_COMPRESS_ZLIB = 1, 579 BTRFS_COMPRESS_ZLIB = 1,
533 BTRFS_COMPRESS_LAST = 2, 580 BTRFS_COMPRESS_LZO = 2,
581 BTRFS_COMPRESS_TYPES = 2,
582 BTRFS_COMPRESS_LAST = 3,
534}; 583};
535 584
536struct btrfs_inode_item { 585struct btrfs_inode_item {
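With LZO added to the enum above, mount-option parsing has two real
compressor names to map. A sketch of that mapping (assuming the
compress_type field added to btrfs_fs_info later in this patch; not
verbatim kernel code):

	if (strcmp(str, "zlib") == 0)
		fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
	else if (strcmp(str, "lzo") == 0)
		/* also needs BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO on disk */
		fs_info->compress_type = BTRFS_COMPRESS_LZO;
	else
		return -EINVAL;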
@@ -574,6 +623,8 @@ struct btrfs_dir_item {
574 u8 type; 623 u8 type;
575} __attribute__ ((__packed__)); 624} __attribute__ ((__packed__));
576 625
626#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
627
577struct btrfs_root_item { 628struct btrfs_root_item {
578 struct btrfs_inode_item inode; 629 struct btrfs_inode_item inode;
579 __le64 generation; 630 __le64 generation;
@@ -675,9 +726,10 @@ struct btrfs_block_group_item {
675struct btrfs_space_info { 726struct btrfs_space_info {
676 u64 flags; 727 u64 flags;
677 728
678 u64 total_bytes; /* total bytes in the space */ 729 u64 total_bytes; /* total bytes in the space,
730 this doesn't take mirrors into account */
679 u64 bytes_used; /* total bytes used, 731 u64 bytes_used; /* total bytes used,
680 this does't take mirrors into account */ 732 this doesn't take mirrors into account */
681 u64 bytes_pinned; /* total bytes pinned, will be freed when the 733 u64 bytes_pinned; /* total bytes pinned, will be freed when the
682 transaction finishes */ 734 transaction finishes */
683 u64 bytes_reserved; /* total bytes the allocator has reserved for 735 u64 bytes_reserved; /* total bytes the allocator has reserved for
@@ -687,11 +739,24 @@ struct btrfs_space_info {
687 u64 bytes_may_use; /* number of bytes that may be used for 739 u64 bytes_may_use; /* number of bytes that may be used for
688 delalloc/allocations */ 740 delalloc/allocations */
689 u64 disk_used; /* total bytes used on disk */ 741 u64 disk_used; /* total bytes used on disk */
742 u64 disk_total; /* total bytes on disk, takes mirrors into
743 account */
744
745 /*
746 * we bump reservation progress every time we decrement
747 * bytes_reserved. This way people waiting for reservations
748 * know something good has happened and they can check
749 * for progress. The number here isn't to be trusted, it
750 * just shows reclaim activity
751 */
752 unsigned long reservation_progress;
690 753
691 int full; /* indicates that we cannot allocate any more 754 unsigned int full:1; /* indicates that we cannot allocate any more
692 chunks for this space */ 755 chunks for this space */
693 int force_alloc; /* set if we need to force a chunk alloc for 756 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
694 this space */ 757
758 unsigned int force_alloc; /* set if we need to force a chunk
759 alloc for this space */
695 760
696 struct list_head list; 761 struct list_head list;
697 762
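The reservation_progress counter documented above gives waiters a cheap
"did anything get freed?" signal. An illustrative waiter loop under
that scheme (try_reserve() is a hypothetical placeholder, not a btrfs
function):

	unsigned long progress = space_info->reservation_progress;

	while (!try_reserve(space_info, num_bytes)) {
		if (space_info->reservation_progress == progress)
			break;	/* no reclaim activity since we last looked */
		progress = space_info->reservation_progress;
	}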
@@ -732,9 +797,6 @@ struct btrfs_free_cluster {
732 /* first extent starting offset */ 797 /* first extent starting offset */
733 u64 window_start; 798 u64 window_start;
734 799
735 /* if this cluster simply points at a bitmap in the block group */
736 bool points_to_bitmap;
737
738 struct btrfs_block_group_cache *block_group; 800 struct btrfs_block_group_cache *block_group;
739 /* 801 /*
740 * when a cluster is allocated from a block group, we put the 802 * when a cluster is allocated from a block group, we put the
@@ -750,6 +812,14 @@ enum btrfs_caching_type {
750 BTRFS_CACHE_FINISHED = 2, 812 BTRFS_CACHE_FINISHED = 2,
751}; 813};
752 814
815enum btrfs_disk_cache_state {
816 BTRFS_DC_WRITTEN = 0,
817 BTRFS_DC_ERROR = 1,
818 BTRFS_DC_CLEAR = 2,
819 BTRFS_DC_SETUP = 3,
820 BTRFS_DC_NEED_WRITE = 4,
821};
822
753struct btrfs_caching_control { 823struct btrfs_caching_control {
754 struct list_head list; 824 struct list_head list;
755 struct mutex mutex; 825 struct mutex mutex;
@@ -763,6 +833,7 @@ struct btrfs_block_group_cache {
763 struct btrfs_key key; 833 struct btrfs_key key;
764 struct btrfs_block_group_item item; 834 struct btrfs_block_group_item item;
765 struct btrfs_fs_info *fs_info; 835 struct btrfs_fs_info *fs_info;
836 struct inode *inode;
766 spinlock_t lock; 837 spinlock_t lock;
767 u64 pinned; 838 u64 pinned;
768 u64 reserved; 839 u64 reserved;
@@ -770,11 +841,11 @@ struct btrfs_block_group_cache {
770 u64 bytes_super; 841 u64 bytes_super;
771 u64 flags; 842 u64 flags;
772 u64 sectorsize; 843 u64 sectorsize;
773 int extents_thresh; 844 unsigned int ro:1;
774 int free_extents; 845 unsigned int dirty:1;
775 int total_bitmaps; 846 unsigned int iref:1;
776 int ro; 847
777 int dirty; 848 int disk_cache_state;
778 849
779 /* cache tracking stuff */ 850 /* cache tracking stuff */
780 int cached; 851 int cached;
@@ -784,9 +855,7 @@ struct btrfs_block_group_cache {
784 struct btrfs_space_info *space_info; 855 struct btrfs_space_info *space_info;
785 856
786 /* free space cache stuff */ 857 /* free space cache stuff */
787 spinlock_t tree_lock; 858 struct btrfs_free_space_ctl *free_space_ctl;
788 struct rb_root free_space_offset;
789 u64 free_space;
790 859
791 /* block group cache stuff */ 860 /* block group cache stuff */
792 struct rb_node cache_node; 861 struct rb_node cache_node;
@@ -806,6 +875,7 @@ struct btrfs_block_group_cache {
806struct reloc_control; 875struct reloc_control;
807struct btrfs_device; 876struct btrfs_device;
808struct btrfs_fs_devices; 877struct btrfs_fs_devices;
878struct btrfs_delayed_root;
809struct btrfs_fs_info { 879struct btrfs_fs_info {
810 u8 fsid[BTRFS_FSID_SIZE]; 880 u8 fsid[BTRFS_FSID_SIZE];
811 u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; 881 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
@@ -832,7 +902,10 @@ struct btrfs_fs_info {
832 /* logical->physical extent mapping */ 902 /* logical->physical extent mapping */
833 struct btrfs_mapping_tree mapping_tree; 903 struct btrfs_mapping_tree mapping_tree;
834 904
835 /* block reservation for extent, checksum and root tree */ 905 /*
906 * block reservation for extent, checksum, root tree and
907 * delayed dir index item
908 */
836 struct btrfs_block_rsv global_block_rsv; 909 struct btrfs_block_rsv global_block_rsv;
837 /* block reservation for delay allocation */ 910 /* block reservation for delay allocation */
838 struct btrfs_block_rsv delalloc_block_rsv; 911 struct btrfs_block_rsv delalloc_block_rsv;
@@ -856,13 +929,14 @@ struct btrfs_fs_info {
856 * is required instead of the faster short fsync log commits 929 * is required instead of the faster short fsync log commits
857 */ 930 */
858 u64 last_trans_log_full_commit; 931 u64 last_trans_log_full_commit;
859 u64 open_ioctl_trans; 932 unsigned long mount_opt:20;
860 unsigned long mount_opt; 933 unsigned long compress_type:4;
861 u64 max_inline; 934 u64 max_inline;
862 u64 alloc_start; 935 u64 alloc_start;
863 struct btrfs_transaction *running_transaction; 936 struct btrfs_transaction *running_transaction;
864 wait_queue_head_t transaction_throttle; 937 wait_queue_head_t transaction_throttle;
865 wait_queue_head_t transaction_wait; 938 wait_queue_head_t transaction_wait;
939 wait_queue_head_t transaction_blocked_wait;
866 wait_queue_head_t async_submit_wait; 940 wait_queue_head_t async_submit_wait;
867 941
868 struct btrfs_super_block super_copy; 942 struct btrfs_super_block super_copy;
@@ -871,7 +945,6 @@ struct btrfs_fs_info {
871 struct super_block *sb; 945 struct super_block *sb;
872 struct inode *btree_inode; 946 struct inode *btree_inode;
873 struct backing_dev_info bdi; 947 struct backing_dev_info bdi;
874 struct mutex trans_mutex;
875 struct mutex tree_log_mutex; 948 struct mutex tree_log_mutex;
876 struct mutex transaction_kthread_mutex; 949 struct mutex transaction_kthread_mutex;
877 struct mutex cleaner_mutex; 950 struct mutex cleaner_mutex;
@@ -892,6 +965,13 @@ struct btrfs_fs_info {
892 struct rw_semaphore subvol_sem; 965 struct rw_semaphore subvol_sem;
893 struct srcu_struct subvol_srcu; 966 struct srcu_struct subvol_srcu;
894 967
968 spinlock_t trans_lock;
969 /*
970 * the reloc mutex goes with the trans lock, it is taken
971 * during commit to protect us from the relocation code
972 */
973 struct mutex reloc_mutex;
974
895 struct list_head trans_list; 975 struct list_head trans_list;
896 struct list_head hashers; 976 struct list_head hashers;
897 struct list_head dead_roots; 977 struct list_head dead_roots;
@@ -904,6 +984,7 @@ struct btrfs_fs_info {
904 atomic_t async_submit_draining; 984 atomic_t async_submit_draining;
905 atomic_t nr_async_bios; 985 atomic_t nr_async_bios;
906 atomic_t async_delalloc_pages; 986 atomic_t async_delalloc_pages;
987 atomic_t open_ioctl_trans;
907 988
908 /* 989 /*
909 * this is used by the balancing code to wait for all the pending 990 * this is used by the balancing code to wait for all the pending
@@ -949,6 +1030,7 @@ struct btrfs_fs_info {
949 struct btrfs_workers endio_meta_workers; 1030 struct btrfs_workers endio_meta_workers;
950 struct btrfs_workers endio_meta_write_workers; 1031 struct btrfs_workers endio_meta_write_workers;
951 struct btrfs_workers endio_write_workers; 1032 struct btrfs_workers endio_write_workers;
1033 struct btrfs_workers endio_freespace_worker;
952 struct btrfs_workers submit_workers; 1034 struct btrfs_workers submit_workers;
953 /* 1035 /*
954 * fixup workers take dirty pages that didn't properly go through 1036 * fixup workers take dirty pages that didn't properly go through
@@ -956,6 +1038,7 @@ struct btrfs_fs_info {
956 * for the sys_munmap function call path 1038 * for the sys_munmap function call path
957 */ 1039 */
958 struct btrfs_workers fixup_workers; 1040 struct btrfs_workers fixup_workers;
1041 struct btrfs_workers delayed_workers;
959 struct task_struct *transaction_kthread; 1042 struct task_struct *transaction_kthread;
960 struct task_struct *cleaner_kthread; 1043 struct task_struct *cleaner_kthread;
961 int thread_pool_size; 1044 int thread_pool_size;
@@ -966,6 +1049,7 @@ struct btrfs_fs_info {
966 int closing; 1049 int closing;
967 int log_root_recovering; 1050 int log_root_recovering;
968 int enospc_unlink; 1051 int enospc_unlink;
1052 int trans_no_join;
969 1053
970 u64 total_pinned; 1054 u64 total_pinned;
971 1055
@@ -987,7 +1071,6 @@ struct btrfs_fs_info {
987 struct reloc_control *reloc_ctl; 1071 struct reloc_control *reloc_ctl;
988 1072
989 spinlock_t delalloc_lock; 1073 spinlock_t delalloc_lock;
990 spinlock_t new_trans_lock;
991 u64 delalloc_bytes; 1074 u64 delalloc_bytes;
992 1075
993 /* data_alloc_cluster is only used in ssd mode */ 1076 /* data_alloc_cluster is only used in ssd mode */
@@ -996,6 +1079,11 @@ struct btrfs_fs_info {
996 /* all metadata allocations go through this cluster */ 1079 /* all metadata allocations go through this cluster */
997 struct btrfs_free_cluster meta_alloc_cluster; 1080 struct btrfs_free_cluster meta_alloc_cluster;
998 1081
1082 /* auto defrag inodes go here */
1083 spinlock_t defrag_inodes_lock;
1084 struct rb_root defrag_inodes;
1085 atomic_t defrag_running;
1086
999 spinlock_t ref_cache_lock; 1087 spinlock_t ref_cache_lock;
1000 u64 total_ref_cache_size; 1088 u64 total_ref_cache_size;
1001 1089
@@ -1010,6 +1098,22 @@ struct btrfs_fs_info {
1010 unsigned metadata_ratio; 1098 unsigned metadata_ratio;
1011 1099
1012 void *bdev_holder; 1100 void *bdev_holder;
1101
1102 /* private scrub information */
1103 struct mutex scrub_lock;
1104 atomic_t scrubs_running;
1105 atomic_t scrub_pause_req;
1106 atomic_t scrubs_paused;
1107 atomic_t scrub_cancel_req;
1108 wait_queue_head_t scrub_pause_wait;
1109 struct rw_semaphore scrub_super_lock;
1110 int scrub_workers_refcnt;
1111 struct btrfs_workers scrub_workers;
1112
1113 /* filesystem state */
1114 u64 fs_state;
1115
1116 struct btrfs_delayed_root *delayed_root;
1013}; 1117};
1014 1118
1015/* 1119/*
@@ -1019,9 +1123,6 @@ struct btrfs_fs_info {
1019struct btrfs_root { 1123struct btrfs_root {
1020 struct extent_buffer *node; 1124 struct extent_buffer *node;
1021 1125
1022 /* the node lock is held while changing the node pointer */
1023 spinlock_t node_lock;
1024
1025 struct extent_buffer *commit_root; 1126 struct extent_buffer *commit_root;
1026 struct btrfs_root *log_root; 1127 struct btrfs_root *log_root;
1027 struct btrfs_root *reloc_root; 1128 struct btrfs_root *reloc_root;
@@ -1038,6 +1139,16 @@ struct btrfs_root {
1038 spinlock_t accounting_lock; 1139 spinlock_t accounting_lock;
1039 struct btrfs_block_rsv *block_rsv; 1140 struct btrfs_block_rsv *block_rsv;
1040 1141
1142 /* free ino cache stuff */
1143 struct mutex fs_commit_mutex;
1144 struct btrfs_free_space_ctl *free_ino_ctl;
1145 enum btrfs_caching_type cached;
1146 spinlock_t cache_lock;
1147 wait_queue_head_t cache_wait;
1148 struct btrfs_free_space_ctl *free_ino_pinned;
1149 u64 cache_progress;
1150 struct inode *cache_inode;
1151
1041 struct mutex log_mutex; 1152 struct mutex log_mutex;
1042 wait_queue_head_t log_writer_wait; 1153 wait_queue_head_t log_writer_wait;
1043 wait_queue_head_t log_commit_wait[2]; 1154 wait_queue_head_t log_commit_wait[2];
@@ -1066,6 +1177,14 @@ struct btrfs_root {
1066 u32 type; 1177 u32 type;
1067 1178
1068 u64 highest_objectid; 1179 u64 highest_objectid;
1180
1181 /* btrfs_record_root_in_trans is a multi-step process,
1182 * and it can race with the balancing code. But the
 1183 * race is very small, and it can only happen the first time the root
1184 * is added to each transaction. So in_trans_setup
1185 * is used to tell us when more checks are required
1186 */
1187 unsigned long in_trans_setup;
1069 int ref_cows; 1188 int ref_cows;
1070 int track_dirty; 1189 int track_dirty;
1071 int in_radix; 1190 int in_radix;
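The in_trans_setup flag above lets btrfs_record_root_in_trans() keep a
cheap fast path. A simplified sketch of the check the comment describes
(not verbatim kernel code):

	if (root->last_trans == trans->transid &&
	    !root->in_trans_setup)
		return 0;	/* already fully recorded in this transaction */
	/* otherwise fall through to the locked, multi-step setup */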
@@ -1075,7 +1194,6 @@ struct btrfs_root {
1075 struct btrfs_key defrag_max; 1194 struct btrfs_key defrag_max;
1076 int defrag_running; 1195 int defrag_running;
1077 char *name; 1196 char *name;
1078 int in_sysfs;
1079 1197
1080 /* the dirty list is only used by non-reference counted roots */ 1198 /* the dirty list is only used by non-reference counted roots */
1081 struct list_head dirty_list; 1199 struct list_head dirty_list;
@@ -1093,12 +1211,49 @@ struct btrfs_root {
1093 struct rb_root inode_tree; 1211 struct rb_root inode_tree;
1094 1212
1095 /* 1213 /*
1214 * radix tree that keeps track of delayed nodes of every inode,
1215 * protected by inode_lock
1216 */
1217 struct radix_tree_root delayed_nodes_tree;
1218 /*
1096 * right now this just gets used so that a root has its own devid 1219 * right now this just gets used so that a root has its own devid
1097 * for stat. It may be used for more later 1220 * for stat. It may be used for more later
1098 */ 1221 */
1099 struct super_block anon_super; 1222 struct super_block anon_super;
1100}; 1223};
1101 1224
1225struct btrfs_ioctl_defrag_range_args {
1226 /* start of the defrag operation */
1227 __u64 start;
1228
1229 /* number of bytes to defrag, use (u64)-1 to say all */
1230 __u64 len;
1231
1232 /*
1233 * flags for the operation, which can include turning
1234 * on compression for this one defrag
1235 */
1236 __u64 flags;
1237
1238 /*
1239 * any extent bigger than this will be considered
 1240 * already defragged. Use 0 to take the kernel default.
1241 * Use 1 to say every single extent must be rewritten
1242 */
1243 __u32 extent_thresh;
1244
1245 /*
1246 * which compression method to use if turning on compression
1247 * for this defrag operation. If unspecified, zlib will
1248 * be used
1249 */
1250 __u32 compress_type;
1251
1252 /* spare for later */
1253 __u32 unused[4];
1254};
1255
1256
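The defrag args structure above is the payload of the range-defrag
ioctl. An illustrative userspace invocation (assuming the
BTRFS_IOC_DEFRAG_RANGE ioctl number and BTRFS_DEFRAG_RANGE_COMPRESS
flag from ioctl.h; error handling elided):

	struct btrfs_ioctl_defrag_range_args range = {
		.start = 0,
		.len = (__u64)-1,			/* whole file */
		.extent_thresh = 0,			/* kernel default */
		.compress_type = BTRFS_COMPRESS_LZO,
		.flags = BTRFS_DEFRAG_RANGE_COMPRESS,	/* recompress as we go */
	};

	ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);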
1102/* 1257/*
1103 * inode items have the data typically returned from stat and store other 1258 * inode items have the data typically returned from stat and store other
1104 * info about object characteristics. There is one for every file and dir in 1259 * info about object characteristics. There is one for every file and dir in
@@ -1180,6 +1335,11 @@ struct btrfs_root {
1180 */ 1335 */
1181#define BTRFS_STRING_ITEM_KEY 253 1336#define BTRFS_STRING_ITEM_KEY 253
1182 1337
1338/*
1339 * Flags for mount options.
1340 *
1341 * Note: don't forget to add new options to btrfs_show_options()
1342 */
1183#define BTRFS_MOUNT_NODATASUM (1 << 0) 1343#define BTRFS_MOUNT_NODATASUM (1 << 0)
1184#define BTRFS_MOUNT_NODATACOW (1 << 1) 1344#define BTRFS_MOUNT_NODATACOW (1 << 1)
1185#define BTRFS_MOUNT_NOBARRIER (1 << 2) 1345#define BTRFS_MOUNT_NOBARRIER (1 << 2)
@@ -1192,6 +1352,12 @@ struct btrfs_root {
1192#define BTRFS_MOUNT_NOSSD (1 << 9) 1352#define BTRFS_MOUNT_NOSSD (1 << 9)
1193#define BTRFS_MOUNT_DISCARD (1 << 10) 1353#define BTRFS_MOUNT_DISCARD (1 << 10)
1194#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) 1354#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
1355#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
1356#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
1357#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
1358#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1359#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1360#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1195 1361
1196#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1362#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1197#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1363#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
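The new mount flags above are manipulated through the token-pasting
macros shown in context. A sketch of their use while parsing mount
options (illustrative only):

	/* "space_cache" seen on the command line */
	btrfs_set_opt(info->mount_opt, SPACE_CACHE);	/* ORs in BTRFS_MOUNT_SPACE_CACHE */

	/* later, once the on-disk cache has been rebuilt */
	btrfs_clear_opt(info->mount_opt, CLEAR_CACHE);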
@@ -1211,6 +1377,9 @@ struct btrfs_root {
1211#define BTRFS_INODE_NODUMP (1 << 8) 1377#define BTRFS_INODE_NODUMP (1 << 8)
1212#define BTRFS_INODE_NOATIME (1 << 9) 1378#define BTRFS_INODE_NOATIME (1 << 9)
1213#define BTRFS_INODE_DIRSYNC (1 << 10) 1379#define BTRFS_INODE_DIRSYNC (1 << 10)
1380#define BTRFS_INODE_COMPRESS (1 << 11)
1381
1382#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
1214 1383
1215/* some macros to generate set/get funcs for the struct fields. This 1384/* some macros to generate set/get funcs for the struct fields. This
1216 * assumes there is a lefoo_to_cpu for every type, so let's make a simple 1385
@@ -1364,26 +1533,12 @@ static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
1364 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); 1533 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
1365} 1534}
1366 1535
1367static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
1368 struct btrfs_chunk *c, int nr,
1369 u64 val)
1370{
1371 btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
1372}
1373
1374static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, 1536static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
1375 struct btrfs_chunk *c, int nr) 1537 struct btrfs_chunk *c, int nr)
1376{ 1538{
1377 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); 1539 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
1378} 1540}
1379 1541
1380static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
1381 struct btrfs_chunk *c, int nr,
1382 u64 val)
1383{
1384 btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
1385}
1386
1387/* struct btrfs_block_group_item */ 1542/* struct btrfs_block_group_item */
1388BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, 1543BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
1389 used, 64); 1544 used, 64);
@@ -1441,14 +1596,6 @@ btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
1441 return (struct btrfs_timespec *)ptr; 1596 return (struct btrfs_timespec *)ptr;
1442} 1597}
1443 1598
1444static inline struct btrfs_timespec *
1445btrfs_inode_otime(struct btrfs_inode_item *inode_item)
1446{
1447 unsigned long ptr = (unsigned long)inode_item;
1448 ptr += offsetof(struct btrfs_inode_item, otime);
1449 return (struct btrfs_timespec *)ptr;
1450}
1451
1452BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); 1599BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
1453BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); 1600BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
1454 1601
@@ -1665,6 +1812,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1665 write_eb_member(eb, item, struct btrfs_dir_item, location, key); 1812 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1666} 1813}
1667 1814
1815BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
1816 num_entries, 64);
1817BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
1818 num_bitmaps, 64);
1819BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
1820 generation, 64);
1821
1822static inline void btrfs_free_space_key(struct extent_buffer *eb,
1823 struct btrfs_free_space_header *h,
1824 struct btrfs_disk_key *key)
1825{
1826 read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
1827}
1828
1829static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
1830 struct btrfs_free_space_header *h,
1831 struct btrfs_disk_key *key)
1832{
1833 write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
1834}
1835
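A sketch of reading a cache file's header through the accessors above,
given a path already positioned on the BTRFS_FREE_SPACE_OBJECTID item
(illustrative, error handling elided):

	struct btrfs_free_space_header *header;
	struct btrfs_disk_key disk_key;
	u64 generation, entries;

	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	generation = btrfs_free_space_generation(leaf, header);
	entries = btrfs_free_space_entries(leaf, header);
	btrfs_free_space_key(leaf, header, &disk_key);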
1668/* struct btrfs_disk_key */ 1836/* struct btrfs_disk_key */
1669BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, 1837BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1670 objectid, 64); 1838 objectid, 64);
@@ -1778,33 +1946,6 @@ static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
1778 return (u8 *)ptr; 1946 return (u8 *)ptr;
1779} 1947}
1780 1948
1781static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
1782{
1783 unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
1784 return (u8 *)ptr;
1785}
1786
1787static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
1788{
1789 unsigned long ptr = offsetof(struct btrfs_header, csum);
1790 return (u8 *)ptr;
1791}
1792
1793static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
1794{
1795 return NULL;
1796}
1797
1798static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
1799{
1800 return NULL;
1801}
1802
1803static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
1804{
1805 return NULL;
1806}
1807
1808static inline int btrfs_is_leaf(struct extent_buffer *eb) 1949static inline int btrfs_is_leaf(struct extent_buffer *eb)
1809{ 1950{
1810 return btrfs_header_level(eb) == 0; 1951 return btrfs_header_level(eb) == 0;
@@ -1829,6 +1970,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1829BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, 1970BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
1830 last_snapshot, 64); 1971 last_snapshot, 64);
1831 1972
1973static inline bool btrfs_root_readonly(struct btrfs_root *root)
1974{
1975 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
1976}
1977
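btrfs_root_readonly() above backs the new BTRFS_ROOT_SUBVOL_RDONLY
flag. The typical guard it enables looks like this (a sketch of the
pattern, not a specific call site):

	if (btrfs_root_readonly(root))
		return -EROFS;	/* refuse writes into a read-only subvolume */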
1832/* struct btrfs_super_block */ 1978/* struct btrfs_super_block */
1833 1979
1834BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 1980BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -1876,6 +2022,8 @@ BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
1876 incompat_flags, 64); 2022 incompat_flags, 64);
1877BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, 2023BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
1878 csum_type, 16); 2024 csum_type, 16);
2025BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
2026 cache_generation, 64);
1879 2027
1880static inline int btrfs_super_csum_size(struct btrfs_super_block *s) 2028static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
1881{ 2029{
@@ -1951,22 +2099,6 @@ static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
1951 return sb->s_fs_info; 2099 return sb->s_fs_info;
1952} 2100}
1953 2101
1954static inline int btrfs_set_root_name(struct btrfs_root *root,
1955 const char *name, int len)
1956{
1957 /* if we already have a name just free it */
1958 kfree(root->name);
1959
1960 root->name = kmalloc(len+1, GFP_KERNEL);
1961 if (!root->name)
1962 return -ENOMEM;
1963
1964 memcpy(root->name, name, len);
1965 root->name[len] = '\0';
1966
1967 return 0;
1968}
1969
1970static inline u32 btrfs_level_size(struct btrfs_root *root, int level) 2102static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
1971{ 2103{
1972 if (level == 0) 2104 if (level == 0)
@@ -1988,7 +2120,20 @@ static inline struct dentry *fdentry(struct file *file)
1988 return file->f_path.dentry; 2120 return file->f_path.dentry;
1989} 2121}
1990 2122
2123static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2124{
2125 return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2126 (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
2127}
2128
1991/* extent-tree.c */ 2129/* extent-tree.c */
2130static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2131 int num_items)
2132{
2133 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2134 3 * num_items;
2135}
2136
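Worked example for btrfs_calc_trans_metadata_size() above, assuming 4K
leaves and nodes and BTRFS_MAX_LEVEL == 8: one item can dirty a full
path of 8 tree blocks, tripled to leave room for splits and CoW:

	/* (4096 + 4096 * 7) * 3 * 1 == 98304 bytes reserved per item */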
1992void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 2137void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1993int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2138int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1994 struct btrfs_root *root, unsigned long count); 2139 struct btrfs_root *root, unsigned long count);
@@ -1998,12 +2143,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
1998 u64 num_bytes, u64 *refs, u64 *flags); 2143 u64 num_bytes, u64 *refs, u64 *flags);
1999int btrfs_pin_extent(struct btrfs_root *root, 2144int btrfs_pin_extent(struct btrfs_root *root,
2000 u64 bytenr, u64 num, int reserved); 2145 u64 bytenr, u64 num, int reserved);
2001int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
2002 struct btrfs_root *root, struct extent_buffer *leaf);
2003int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2146int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2004 struct btrfs_root *root, 2147 struct btrfs_root *root,
2005 u64 objectid, u64 offset, u64 bytenr); 2148 u64 objectid, u64 offset, u64 bytenr);
2006int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
2007struct btrfs_block_group_cache *btrfs_lookup_block_group( 2149struct btrfs_block_group_cache *btrfs_lookup_block_group(
2008 struct btrfs_fs_info *info, 2150 struct btrfs_fs_info *info,
2009 u64 bytenr); 2151 u64 bytenr);
@@ -2051,6 +2193,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2051 u64 root_objectid, u64 owner, u64 offset); 2193 u64 root_objectid, u64 owner, u64 offset);
2052 2194
2053int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2195int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2196int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
2197 u64 num_bytes, int reserve, int sinfo);
2054int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 2198int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2055 struct btrfs_root *root); 2199 struct btrfs_root *root);
2056int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2200int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2073,13 +2217,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2073int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2217int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2074 struct btrfs_root *root, u64 group_start); 2218 struct btrfs_root *root, u64 group_start);
2075u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2219u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2220u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2076void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2221void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2077void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2222void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2078int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 2223int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2079void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 2224void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2080int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, 2225int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2081 struct btrfs_root *root, 2226 struct btrfs_root *root,
2082 int num_items, int *retries); 2227 int num_items);
2083void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 2228void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2084 struct btrfs_root *root); 2229 struct btrfs_root *root);
2085int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 2230int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2100,7 +2245,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
2100int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 2245int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2101 struct btrfs_root *root, 2246 struct btrfs_root *root,
2102 struct btrfs_block_rsv *block_rsv, 2247 struct btrfs_block_rsv *block_rsv,
2103 u64 num_bytes, int *retries); 2248 u64 num_bytes);
2104int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 2249int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
2105 struct btrfs_root *root, 2250 struct btrfs_root *root,
2106 struct btrfs_block_rsv *block_rsv, 2251 struct btrfs_block_rsv *block_rsv,
@@ -2111,10 +2256,24 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2111void btrfs_block_rsv_release(struct btrfs_root *root, 2256void btrfs_block_rsv_release(struct btrfs_root *root,
2112 struct btrfs_block_rsv *block_rsv, 2257 struct btrfs_block_rsv *block_rsv,
2113 u64 num_bytes); 2258 u64 num_bytes);
2259int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2260 struct btrfs_root *root,
2261 struct btrfs_block_rsv *rsv);
2114int btrfs_set_block_group_ro(struct btrfs_root *root, 2262int btrfs_set_block_group_ro(struct btrfs_root *root,
2115 struct btrfs_block_group_cache *cache); 2263 struct btrfs_block_group_cache *cache);
2116int btrfs_set_block_group_rw(struct btrfs_root *root, 2264int btrfs_set_block_group_rw(struct btrfs_root *root,
2117 struct btrfs_block_group_cache *cache); 2265 struct btrfs_block_group_cache *cache);
2266void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
2267u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
2268int btrfs_error_unpin_extent_range(struct btrfs_root *root,
2269 u64 start, u64 end);
2270int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
2271 u64 num_bytes, u64 *actual_bytes);
2272int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
2273 struct btrfs_root *root, u64 type);
2274int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
2275
2276int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
2118/* ctree.c */ 2277/* ctree.c */
2119int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2278int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2120 int level, int *slot); 2279 int level, int *slot);
@@ -2166,10 +2325,12 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
2166 struct btrfs_root *root, struct extent_buffer *parent, 2325 struct btrfs_root *root, struct extent_buffer *parent,
2167 int start_slot, int cache_only, u64 *last_ret, 2326 int start_slot, int cache_only, u64 *last_ret,
2168 struct btrfs_key *progress); 2327 struct btrfs_key *progress);
2169void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); 2328void btrfs_release_path(struct btrfs_path *p);
2170struct btrfs_path *btrfs_alloc_path(void); 2329struct btrfs_path *btrfs_alloc_path(void);
2171void btrfs_free_path(struct btrfs_path *p); 2330void btrfs_free_path(struct btrfs_path *p);
2172void btrfs_set_path_blocking(struct btrfs_path *p); 2331void btrfs_set_path_blocking(struct btrfs_path *p);
2332void btrfs_clear_path_blocking(struct btrfs_path *p,
2333 struct extent_buffer *held);
2173void btrfs_unlock_up_safe(struct btrfs_path *p, int level); 2334void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
2174 2335
2175int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2336int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2181,13 +2342,12 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
2181 return btrfs_del_items(trans, root, path, path->slots[0], 1); 2342 return btrfs_del_items(trans, root, path, path->slots[0], 1);
2182} 2343}
2183 2344
2345int setup_items_for_insert(struct btrfs_trans_handle *trans,
2346 struct btrfs_root *root, struct btrfs_path *path,
2347 struct btrfs_key *cpu_key, u32 *data_size,
2348 u32 total_data, u32 total_size, int nr);
2184int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root 2349int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
2185 *root, struct btrfs_key *key, void *data, u32 data_size); 2350 *root, struct btrfs_key *key, void *data, u32 data_size);
2186int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
2187 struct btrfs_root *root,
2188 struct btrfs_path *path,
2189 struct btrfs_key *cpu_key, u32 *data_size,
2190 int nr);
2191int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 2351int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
2192 struct btrfs_root *root, 2352 struct btrfs_root *root,
2193 struct btrfs_path *path, 2353 struct btrfs_path *path,
@@ -2211,6 +2371,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2211 struct btrfs_root *root, 2371 struct btrfs_root *root,
2212 struct extent_buffer *node, 2372 struct extent_buffer *node,
2213 struct extent_buffer *parent); 2373 struct extent_buffer *parent);
2374static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2375{
2376 /*
2377 * Get synced with close_ctree()
2378 */
2379 smp_mb();
2380 return fs_info->closing;
2381}
2382
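The smp_mb() in btrfs_fs_closing() above orders the read of
fs_info->closing against close_ctree() setting it. Illustrative use in
a long-running loop (the work helpers are hypothetical):

	while (have_more_work(ctl)) {
		if (btrfs_fs_closing(fs_info))
			break;		/* unmount started, bail out early */
		process_one_item(ctl);
	}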
2214/* root-item.c */ 2383/* root-item.c */
2215int btrfs_find_root_ref(struct btrfs_root *tree_root, 2384int btrfs_find_root_ref(struct btrfs_root *tree_root,
2216 struct btrfs_path *path, 2385 struct btrfs_path *path,
@@ -2233,16 +2402,16 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
2233 *item); 2402 *item);
2234int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct 2403int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
2235 btrfs_root_item *item, struct btrfs_key *key); 2404 btrfs_root_item *item, struct btrfs_key *key);
2236int btrfs_search_root(struct btrfs_root *root, u64 search_start,
2237 u64 *found_objectid);
2238int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); 2405int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
2239int btrfs_find_orphan_roots(struct btrfs_root *tree_root); 2406int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
2240int btrfs_set_root_node(struct btrfs_root_item *item, 2407int btrfs_set_root_node(struct btrfs_root_item *item,
2241 struct extent_buffer *node); 2408 struct extent_buffer *node);
2409void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
2410
2242/* dir-item.c */ 2411/* dir-item.c */
2243int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 2412int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
2244 struct btrfs_root *root, const char *name, 2413 struct btrfs_root *root, const char *name,
2245 int name_len, u64 dir, 2414 int name_len, struct inode *dir,
2246 struct btrfs_key *location, u8 type, u64 index); 2415 struct btrfs_key *location, u8 type, u64 index);
2247struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, 2416struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
2248 struct btrfs_root *root, 2417 struct btrfs_root *root,
@@ -2276,6 +2445,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
2276 struct btrfs_path *path, u64 dir, 2445 struct btrfs_path *path, u64 dir,
2277 const char *name, u16 name_len, 2446 const char *name, u16 name_len,
2278 int mod); 2447 int mod);
2448int verify_dir_item(struct btrfs_root *root,
2449 struct extent_buffer *leaf,
2450 struct btrfs_dir_item *dir_item);
2279 2451
2280/* orphan.c */ 2452/* orphan.c */
2281int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, 2453int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -2284,12 +2456,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
2284 struct btrfs_root *root, u64 offset); 2456 struct btrfs_root *root, u64 offset);
2285int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); 2457int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
2286 2458
2287/* inode-map.c */
2288int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
2289 struct btrfs_root *fs_root,
2290 u64 dirid, u64 *objectid);
2291int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
2292
2293/* inode-item.c */ 2459/* inode-item.c */
2294int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, 2460int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
2295 struct btrfs_root *root, 2461 struct btrfs_root *root,
@@ -2334,8 +2500,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
2334 struct btrfs_ordered_sum *sums); 2500 struct btrfs_ordered_sum *sums);
2335int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, 2501int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
2336 struct bio *bio, u64 file_start, int contig); 2502 struct bio *bio, u64 file_start, int contig);
2337int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
2338 u64 start, unsigned long len);
2339struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, 2503struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
2340 struct btrfs_root *root, 2504 struct btrfs_root *root,
2341 struct btrfs_path *path, 2505 struct btrfs_path *path,
@@ -2343,8 +2507,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
2343int btrfs_csum_truncate(struct btrfs_trans_handle *trans, 2507int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
2344 struct btrfs_root *root, struct btrfs_path *path, 2508 struct btrfs_root *root, struct btrfs_path *path,
2345 u64 isize); 2509 u64 isize);
2346int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, 2510int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
2347 u64 end, struct list_head *list); 2511 struct list_head *list, int search_commit);
2348/* inode.c */ 2512/* inode.c */
2349 2513
2350/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ 2514/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
@@ -2373,14 +2537,12 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2373 u32 min_type); 2537 u32 min_type);
2374 2538
2375int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2539int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2376int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
2377int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2540int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2378 struct extent_state **cached_state); 2541 struct extent_state **cached_state);
2379int btrfs_writepages(struct address_space *mapping, 2542int btrfs_writepages(struct address_space *mapping,
2380 struct writeback_control *wbc); 2543 struct writeback_control *wbc);
2381int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 2544int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
2382 struct btrfs_root *new_root, 2545 struct btrfs_root *new_root, u64 new_dirid);
2383 u64 new_dirid, u64 alloc_hint);
2384int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 2546int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2385 size_t size, struct bio *bio, unsigned long bio_flags); 2547 size_t size, struct bio *bio, unsigned long bio_flags);
2386 2548
@@ -2390,9 +2552,8 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
2390int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2552int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2391int btrfs_readpage(struct file *file, struct page *page); 2553int btrfs_readpage(struct file *file, struct page *page);
2392void btrfs_evict_inode(struct inode *inode); 2554void btrfs_evict_inode(struct inode *inode);
2393void btrfs_put_inode(struct inode *inode);
2394int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); 2555int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2395void btrfs_dirty_inode(struct inode *inode); 2556void btrfs_dirty_inode(struct inode *inode, int flags);
2396struct inode *btrfs_alloc_inode(struct super_block *sb); 2557struct inode *btrfs_alloc_inode(struct super_block *sb);
2397void btrfs_destroy_inode(struct inode *inode); 2558void btrfs_destroy_inode(struct inode *inode);
2398int btrfs_drop_inode(struct inode *inode); 2559int btrfs_drop_inode(struct inode *inode);
@@ -2401,17 +2562,15 @@ void btrfs_destroy_cachep(void);
2401long btrfs_ioctl_trans_end(struct file *file); 2562long btrfs_ioctl_trans_end(struct file *file);
2402struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 2563struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2403 struct btrfs_root *root, int *was_new); 2564 struct btrfs_root *root, int *was_new);
2404int btrfs_commit_write(struct file *file, struct page *page,
2405 unsigned from, unsigned to);
2406struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 2565struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2407 size_t page_offset, u64 start, u64 end, 2566 size_t pg_offset, u64 start, u64 end,
2408 int create); 2567 int create);
2409int btrfs_update_inode(struct btrfs_trans_handle *trans, 2568int btrfs_update_inode(struct btrfs_trans_handle *trans,
2410 struct btrfs_root *root, 2569 struct btrfs_root *root,
2411 struct inode *inode); 2570 struct inode *inode);
2412int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2571int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2413int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2572int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2414void btrfs_orphan_cleanup(struct btrfs_root *root); 2573int btrfs_orphan_cleanup(struct btrfs_root *root);
2415void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, 2574void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2416 struct btrfs_pending_snapshot *pending, 2575 struct btrfs_pending_snapshot *pending,
2417 u64 *bytes_to_reserve); 2576 u64 *bytes_to_reserve);
@@ -2419,31 +2578,44 @@ void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2419 struct btrfs_pending_snapshot *pending); 2578 struct btrfs_pending_snapshot *pending);
2420void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2579void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root); 2580 struct btrfs_root *root);
2422int btrfs_cont_expand(struct inode *inode, loff_t size); 2581int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
2423int btrfs_invalidate_inodes(struct btrfs_root *root); 2582int btrfs_invalidate_inodes(struct btrfs_root *root);
2424void btrfs_add_delayed_iput(struct inode *inode); 2583void btrfs_add_delayed_iput(struct inode *inode);
2425void btrfs_run_delayed_iputs(struct btrfs_root *root); 2584void btrfs_run_delayed_iputs(struct btrfs_root *root);
2426int btrfs_prealloc_file_range(struct inode *inode, int mode, 2585int btrfs_prealloc_file_range(struct inode *inode, int mode,
2427 u64 start, u64 num_bytes, u64 min_size, 2586 u64 start, u64 num_bytes, u64 min_size,
2428 loff_t actual_len, u64 *alloc_hint); 2587 loff_t actual_len, u64 *alloc_hint);
2588int btrfs_prealloc_file_range_trans(struct inode *inode,
2589 struct btrfs_trans_handle *trans, int mode,
2590 u64 start, u64 num_bytes, u64 min_size,
2591 loff_t actual_len, u64 *alloc_hint);
2429extern const struct dentry_operations btrfs_dentry_operations; 2592extern const struct dentry_operations btrfs_dentry_operations;
2430 2593
2431/* ioctl.c */ 2594/* ioctl.c */
2432long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 2595long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
2433void btrfs_update_iflags(struct inode *inode); 2596void btrfs_update_iflags(struct inode *inode);
2434void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 2597void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
2435 2598int btrfs_defrag_file(struct inode *inode, struct file *file,
2599 struct btrfs_ioctl_defrag_range_args *range,
2600 u64 newer_than, unsigned long max_pages);
2436/* file.c */ 2601/* file.c */
2602int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
2603 struct inode *inode);
2604int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
2437int btrfs_sync_file(struct file *file, int datasync); 2605int btrfs_sync_file(struct file *file, int datasync);
2438int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2606int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2439 int skip_pinned); 2607 int skip_pinned);
2440int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2441extern const struct file_operations btrfs_file_operations; 2608extern const struct file_operations btrfs_file_operations;
2442int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, 2609int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
2443 u64 start, u64 end, u64 *hint_byte, int drop_cache); 2610 u64 start, u64 end, u64 *hint_byte, int drop_cache);
2444int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 2611int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2445 struct inode *inode, u64 start, u64 end); 2612 struct inode *inode, u64 start, u64 end);
2446int btrfs_release_file(struct inode *inode, struct file *file); 2613int btrfs_release_file(struct inode *inode, struct file *file);
2614void btrfs_drop_pages(struct page **pages, size_t num_pages);
2615int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
2616 struct page **pages, size_t num_pages,
2617 loff_t pos, size_t write_bytes,
2618 struct extent_state **cached);
2447 2619
2448/* tree-defrag.c */ 2620/* tree-defrag.c */
2449int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 2621int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -2452,10 +2624,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
2452/* sysfs.c */ 2624/* sysfs.c */
2453int btrfs_init_sysfs(void); 2625int btrfs_init_sysfs(void);
2454void btrfs_exit_sysfs(void); 2626void btrfs_exit_sysfs(void);
2455int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
2456int btrfs_sysfs_add_root(struct btrfs_root *root);
2457void btrfs_sysfs_del_root(struct btrfs_root *root);
2458void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2459 2627
2460/* xattr.c */ 2628/* xattr.c */
2461ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 2629ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -2463,10 +2631,18 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2463/* super.c */ 2631/* super.c */
2464int btrfs_parse_options(struct btrfs_root *root, char *options); 2632int btrfs_parse_options(struct btrfs_root *root, char *options);
2465int btrfs_sync_fs(struct super_block *sb, int wait); 2633int btrfs_sync_fs(struct super_block *sb, int wait);
2634void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
2635 unsigned int line, int errno);
2636
2637#define btrfs_std_error(fs_info, errno) \
2638do { \
2639 if ((errno)) \
2640 __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
2641} while (0)
2466 2642
2467/* acl.c */ 2643/* acl.c */
2468#ifdef CONFIG_BTRFS_FS_POSIX_ACL 2644#ifdef CONFIG_BTRFS_FS_POSIX_ACL
2469int btrfs_check_acl(struct inode *inode, int mask); 2645int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
2470#else 2646#else
2471#define btrfs_check_acl NULL 2647#define btrfs_check_acl NULL
2472#endif 2648#endif
@@ -2490,4 +2666,18 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
2490 u64 *bytes_to_reserve); 2666 u64 *bytes_to_reserve);
2491void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, 2667void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
2492 struct btrfs_pending_snapshot *pending); 2668 struct btrfs_pending_snapshot *pending);
2669
2670/* scrub.c */
2671int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2672 struct btrfs_scrub_progress *progress, int readonly);
2673int btrfs_scrub_pause(struct btrfs_root *root);
2674int btrfs_scrub_pause_super(struct btrfs_root *root);
2675int btrfs_scrub_continue(struct btrfs_root *root);
2676int btrfs_scrub_continue_super(struct btrfs_root *root);
2677int btrfs_scrub_cancel(struct btrfs_root *root);
2678int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
2679int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
2680int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2681 struct btrfs_scrub_progress *progress);
2682
2493#endif 2683#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
new file mode 100644
index 000000000000..98c68e658a9b
--- /dev/null
+++ b/fs/btrfs/delayed-inode.c
@@ -0,0 +1,1773 @@
1/*
2 * Copyright (C) 2011 Fujitsu. All rights reserved.
3 * Written by Miao Xie <miaox@cn.fujitsu.com>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19
20#include <linux/slab.h>
21#include "delayed-inode.h"
22#include "disk-io.h"
23#include "transaction.h"
24
25#define BTRFS_DELAYED_WRITEBACK 400
26#define BTRFS_DELAYED_BACKGROUND 100
27
28static struct kmem_cache *delayed_node_cache;
29
30int __init btrfs_delayed_inode_init(void)
31{
32 delayed_node_cache = kmem_cache_create("delayed_node",
33 sizeof(struct btrfs_delayed_node),
34 0,
35 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
36 NULL);
37 if (!delayed_node_cache)
38 return -ENOMEM;
39 return 0;
40}
41
42void btrfs_delayed_inode_exit(void)
43{
44 if (delayed_node_cache)
45 kmem_cache_destroy(delayed_node_cache);
46}
47
48static inline void btrfs_init_delayed_node(
49 struct btrfs_delayed_node *delayed_node,
50 struct btrfs_root *root, u64 inode_id)
51{
52 delayed_node->root = root;
53 delayed_node->inode_id = inode_id;
54 atomic_set(&delayed_node->refs, 0);
55 delayed_node->count = 0;
56 delayed_node->in_list = 0;
57 delayed_node->inode_dirty = 0;
58 delayed_node->ins_root = RB_ROOT;
59 delayed_node->del_root = RB_ROOT;
60 mutex_init(&delayed_node->mutex);
61 delayed_node->index_cnt = 0;
62 INIT_LIST_HEAD(&delayed_node->n_list);
63 INIT_LIST_HEAD(&delayed_node->p_list);
64 delayed_node->bytes_reserved = 0;
65}
66
67static inline int btrfs_is_continuous_delayed_item(
68 struct btrfs_delayed_item *item1,
69 struct btrfs_delayed_item *item2)
70{
71 if (item1->key.type == BTRFS_DIR_INDEX_KEY &&
72 item1->key.objectid == item2->key.objectid &&
73 item1->key.type == item2->key.type &&
74 item1->key.offset + 1 == item2->key.offset)
75 return 1;
76 return 0;
77}
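"Continuous" here means dir index keys for the same directory with adjacent offsets, which is what later allows insertions and deletions to be batched into a single leaf operation. With hypothetical key values for a directory whose objectid is 257: (257, DIR_INDEX, 5) followed by (257, DIR_INDEX, 6) is continuous, while (257, DIR_INDEX, 5) followed by (257, DIR_INDEX, 7) is not.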
78
79static inline struct btrfs_delayed_root *btrfs_get_delayed_root(
80 struct btrfs_root *root)
81{
82 return root->fs_info->delayed_root;
83}
84
85static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
86{
87 struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
88 struct btrfs_root *root = btrfs_inode->root;
89 u64 ino = btrfs_ino(inode);
90 struct btrfs_delayed_node *node;
91
92 node = ACCESS_ONCE(btrfs_inode->delayed_node);
93 if (node) {
94 atomic_inc(&node->refs);
95 return node;
96 }
97
98 spin_lock(&root->inode_lock);
99 node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
100 if (node) {
101 if (btrfs_inode->delayed_node) {
102 atomic_inc(&node->refs); /* can be accessed */
103 BUG_ON(btrfs_inode->delayed_node != node);
104 spin_unlock(&root->inode_lock);
105 return node;
106 }
107 btrfs_inode->delayed_node = node;
108 atomic_inc(&node->refs); /* can be accessed */
109 atomic_inc(&node->refs); /* cached in the inode */
110 spin_unlock(&root->inode_lock);
111 return node;
112 }
113 spin_unlock(&root->inode_lock);
114
115 return NULL;
116}
117
118static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
119 struct inode *inode)
120{
121 struct btrfs_delayed_node *node;
122 struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
123 struct btrfs_root *root = btrfs_inode->root;
124 u64 ino = btrfs_ino(inode);
125 int ret;
126
127again:
128 node = btrfs_get_delayed_node(inode);
129 if (node)
130 return node;
131
132 node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
133 if (!node)
134 return ERR_PTR(-ENOMEM);
135 btrfs_init_delayed_node(node, root, ino);
136
137 atomic_inc(&node->refs); /* cached in the btrfs inode */
138 atomic_inc(&node->refs); /* can be accessed */
139
140 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
141 if (ret) {
142 kmem_cache_free(delayed_node_cache, node);
143 return ERR_PTR(ret);
144 }
145
146 spin_lock(&root->inode_lock);
147 ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
148 if (ret == -EEXIST) {
149 kmem_cache_free(delayed_node_cache, node);
150 spin_unlock(&root->inode_lock);
151 radix_tree_preload_end();
152 goto again;
153 }
154 btrfs_inode->delayed_node = node;
155 spin_unlock(&root->inode_lock);
156 radix_tree_preload_end();
157
158 return node;
159}
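The function above is the usual optimistic lookup-or-create pattern: look up without allocating, allocate outside the lock, insert under root->inode_lock, and restart from the top when another task won the race (-EEXIST). A minimal sketch of that control flow, with hypothetical table_lookup()/table_insert()/table_lock() helpers standing in for the real radix-tree and spinlock calls:

struct node *get_or_create(struct table *t, unsigned long id)
{
	struct node *n;

again:
	n = table_lookup(t, id);	/* takes a reference on a hit */
	if (n)
		return n;

	n = node_alloc(id);		/* allocate outside the lock */
	if (!n)
		return NULL;

	table_lock(t);
	if (table_insert(t, id, n) == -EEXIST) {
		/* lost the race: free ours, pick up the winner's node */
		table_unlock(t);
		node_free(n);
		goto again;
	}
	table_unlock(t);
	return n;
}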
160
161/*
162 * Call this while holding delayed_node->mutex.
163 *
164 * If mod = 1, add this node into the prepare list.
165 */
166static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
167 struct btrfs_delayed_node *node,
168 int mod)
169{
170 spin_lock(&root->lock);
171 if (node->in_list) {
172 if (!list_empty(&node->p_list))
173 list_move_tail(&node->p_list, &root->prepare_list);
174 else if (mod)
175 list_add_tail(&node->p_list, &root->prepare_list);
176 } else {
177 list_add_tail(&node->n_list, &root->node_list);
178 list_add_tail(&node->p_list, &root->prepare_list);
179 atomic_inc(&node->refs); /* inserted into list */
180 root->nodes++;
181 node->in_list = 1;
182 }
183 spin_unlock(&root->lock);
184}
185
186/* Call it when holding delayed_node->mutex */
187static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
188 struct btrfs_delayed_node *node)
189{
190 spin_lock(&root->lock);
191 if (node->in_list) {
192 root->nodes--;
193 atomic_dec(&node->refs); /* not in the list */
194 list_del_init(&node->n_list);
195 if (!list_empty(&node->p_list))
196 list_del_init(&node->p_list);
197 node->in_list = 0;
198 }
199 spin_unlock(&root->lock);
200}
201
202struct btrfs_delayed_node *btrfs_first_delayed_node(
203 struct btrfs_delayed_root *delayed_root)
204{
205 struct list_head *p;
206 struct btrfs_delayed_node *node = NULL;
207
208 spin_lock(&delayed_root->lock);
209 if (list_empty(&delayed_root->node_list))
210 goto out;
211
212 p = delayed_root->node_list.next;
213 node = list_entry(p, struct btrfs_delayed_node, n_list);
214 atomic_inc(&node->refs);
215out:
216 spin_unlock(&delayed_root->lock);
217
218 return node;
219}
220
221struct btrfs_delayed_node *btrfs_next_delayed_node(
222 struct btrfs_delayed_node *node)
223{
224 struct btrfs_delayed_root *delayed_root;
225 struct list_head *p;
226 struct btrfs_delayed_node *next = NULL;
227
228 delayed_root = node->root->fs_info->delayed_root;
229 spin_lock(&delayed_root->lock);
230 if (!node->in_list) { /* not in the list */
231 if (list_empty(&delayed_root->node_list))
232 goto out;
233 p = delayed_root->node_list.next;
234 } else if (list_is_last(&node->n_list, &delayed_root->node_list))
235 goto out;
236 else
237 p = node->n_list.next;
238
239 next = list_entry(p, struct btrfs_delayed_node, n_list);
240 atomic_inc(&next->refs);
241out:
242 spin_unlock(&delayed_root->lock);
243
244 return next;
245}
246
247static void __btrfs_release_delayed_node(
248 struct btrfs_delayed_node *delayed_node,
249 int mod)
250{
251 struct btrfs_delayed_root *delayed_root;
252
253 if (!delayed_node)
254 return;
255
256 delayed_root = delayed_node->root->fs_info->delayed_root;
257
258 mutex_lock(&delayed_node->mutex);
259 if (delayed_node->count)
260 btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
261 else
262 btrfs_dequeue_delayed_node(delayed_root, delayed_node);
263 mutex_unlock(&delayed_node->mutex);
264
265 if (atomic_dec_and_test(&delayed_node->refs)) {
266 struct btrfs_root *root = delayed_node->root;
267 spin_lock(&root->inode_lock);
268 if (atomic_read(&delayed_node->refs) == 0) {
269 radix_tree_delete(&root->delayed_nodes_tree,
270 delayed_node->inode_id);
271 kmem_cache_free(delayed_node_cache, delayed_node);
272 }
273 spin_unlock(&root->inode_lock);
274 }
275}
276
277static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
278{
279 __btrfs_release_delayed_node(node, 0);
280}
281
282struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
283 struct btrfs_delayed_root *delayed_root)
284{
285 struct list_head *p;
286 struct btrfs_delayed_node *node = NULL;
287
288 spin_lock(&delayed_root->lock);
289 if (list_empty(&delayed_root->prepare_list))
290 goto out;
291
292 p = delayed_root->prepare_list.next;
293 list_del_init(p);
294 node = list_entry(p, struct btrfs_delayed_node, p_list);
295 atomic_inc(&node->refs);
296out:
297 spin_unlock(&delayed_root->lock);
298
299 return node;
300}
301
302static inline void btrfs_release_prepared_delayed_node(
303 struct btrfs_delayed_node *node)
304{
305 __btrfs_release_delayed_node(node, 1);
306}
307
308struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
309{
310 struct btrfs_delayed_item *item;
311 item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
312 if (item) {
313 item->data_len = data_len;
314 item->ins_or_del = 0;
315 item->bytes_reserved = 0;
316 item->delayed_node = NULL;
317 atomic_set(&item->refs, 1);
318 }
319 return item;
320}
321
322/*
323 * __btrfs_lookup_delayed_item - look up the delayed item by key
324 * @delayed_node: pointer to the delayed node
325 * @key: the key to look up
326 * @prev: used to store the prev item if the right item isn't found
327 * @next: used to store the next item if the right item isn't found
328 *
329 * Note: if we don't find the right item, we will return the prev item and
330 * the next item.
331 */
332static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
333 struct rb_root *root,
334 struct btrfs_key *key,
335 struct btrfs_delayed_item **prev,
336 struct btrfs_delayed_item **next)
337{
338 struct rb_node *node, *prev_node = NULL;
339 struct btrfs_delayed_item *delayed_item = NULL;
340 int ret = 0;
341
342 node = root->rb_node;
343
344 while (node) {
345 delayed_item = rb_entry(node, struct btrfs_delayed_item,
346 rb_node);
347 prev_node = node;
348 ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
349 if (ret < 0)
350 node = node->rb_right;
351 else if (ret > 0)
352 node = node->rb_left;
353 else
354 return delayed_item;
355 }
356
357 if (prev) {
358 if (!prev_node)
359 *prev = NULL;
360 else if (ret < 0)
361 *prev = delayed_item;
362 else if ((node = rb_prev(prev_node)) != NULL) {
363 *prev = rb_entry(node, struct btrfs_delayed_item,
364 rb_node);
365 } else
366 *prev = NULL;
367 }
368
369 if (next) {
370 if (!prev_node)
371 *next = NULL;
372 else if (ret > 0)
373 *next = delayed_item;
374 else if ((node = rb_next(prev_node)) != NULL) {
375 *next = rb_entry(node, struct btrfs_delayed_item,
376 rb_node);
377 } else
378 *next = NULL;
379 }
380 return NULL;
381}
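The prev/next out-parameters hand callers the neighbours of a missing key, which is what the __btrfs_search_delayed_*_item() wrappers below use to resume iteration at the nearest delayed item. A self-contained userspace model of the same semantics over a sorted array:

#include <stdio.h>

static int lookup(const int *a, int n, int key, int *prev, int *next)
{
	int lo = 0, hi = n - 1;

	*prev = *next = -1;
	while (lo <= hi) {
		int mid = (lo + hi) / 2;

		if (a[mid] < key) {
			*prev = a[mid];		/* best smaller candidate */
			lo = mid + 1;
		} else if (a[mid] > key) {
			*next = a[mid];		/* best larger candidate */
			hi = mid - 1;
		} else {
			return mid;		/* exact hit */
		}
	}
	return -1;
}

int main(void)
{
	int a[] = { 2, 5, 9, 14 };
	int prev, next;

	if (lookup(a, 4, 7, &prev, &next) < 0)
		printf("miss: prev=%d next=%d\n", prev, next); /* 5 and 9 */
	return 0;
}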
382
383struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
384 struct btrfs_delayed_node *delayed_node,
385 struct btrfs_key *key)
386{
387 struct btrfs_delayed_item *item;
388
389 item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
390 NULL, NULL);
391 return item;
392}
393
394struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item(
395 struct btrfs_delayed_node *delayed_node,
396 struct btrfs_key *key)
397{
398 struct btrfs_delayed_item *item;
399
400 item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
401 NULL, NULL);
402 return item;
403}
404
405struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item(
406 struct btrfs_delayed_node *delayed_node,
407 struct btrfs_key *key)
408{
409 struct btrfs_delayed_item *item, *next;
410
411 item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
412 NULL, &next);
413 if (!item)
414 item = next;
415
416 return item;
417}
418
419struct btrfs_delayed_item *__btrfs_search_delayed_deletion_item(
420 struct btrfs_delayed_node *delayed_node,
421 struct btrfs_key *key)
422{
423 struct btrfs_delayed_item *item, *next;
424
425 item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
426 NULL, &next);
427 if (!item)
428 item = next;
429
430 return item;
431}
432
433static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
434 struct btrfs_delayed_item *ins,
435 int action)
436{
437 struct rb_node **p, *node;
438 struct rb_node *parent_node = NULL;
439 struct rb_root *root;
440 struct btrfs_delayed_item *item;
441 int cmp;
442
443 if (action == BTRFS_DELAYED_INSERTION_ITEM)
444 root = &delayed_node->ins_root;
445 else if (action == BTRFS_DELAYED_DELETION_ITEM)
446 root = &delayed_node->del_root;
447 else
448 BUG();
449 p = &root->rb_node;
450 node = &ins->rb_node;
451
452 while (*p) {
453 parent_node = *p;
454 item = rb_entry(parent_node, struct btrfs_delayed_item,
455 rb_node);
456
457 cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
458 if (cmp < 0)
459 p = &(*p)->rb_right;
460 else if (cmp > 0)
461 p = &(*p)->rb_left;
462 else
463 return -EEXIST;
464 }
465
466 rb_link_node(node, parent_node, p);
467 rb_insert_color(node, root);
468 ins->delayed_node = delayed_node;
469 ins->ins_or_del = action;
470
471 if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
472 action == BTRFS_DELAYED_INSERTION_ITEM &&
473 ins->key.offset >= delayed_node->index_cnt)
474 delayed_node->index_cnt = ins->key.offset + 1;
475
476 delayed_node->count++;
477 atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
478 return 0;
479}
480
481static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
482 struct btrfs_delayed_item *item)
483{
484 return __btrfs_add_delayed_item(node, item,
485 BTRFS_DELAYED_INSERTION_ITEM);
486}
487
488static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
489 struct btrfs_delayed_item *item)
490{
491 return __btrfs_add_delayed_item(node, item,
492 BTRFS_DELAYED_DELETION_ITEM);
493}
494
495static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
496{
497 struct rb_root *root;
498 struct btrfs_delayed_root *delayed_root;
499
500 delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
501
502 BUG_ON(!delayed_root);
503 BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
504 delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
505
506 if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
507 root = &delayed_item->delayed_node->ins_root;
508 else
509 root = &delayed_item->delayed_node->del_root;
510
511 rb_erase(&delayed_item->rb_node, root);
512 delayed_item->delayed_node->count--;
513 atomic_dec(&delayed_root->items);
514 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND &&
515 waitqueue_active(&delayed_root->wait))
516 wake_up(&delayed_root->wait);
517}
518
519static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
520{
521 if (item) {
522 __btrfs_remove_delayed_item(item);
523 if (atomic_dec_and_test(&item->refs))
524 kfree(item);
525 }
526}
527
528struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
529 struct btrfs_delayed_node *delayed_node)
530{
531 struct rb_node *p;
532 struct btrfs_delayed_item *item = NULL;
533
534 p = rb_first(&delayed_node->ins_root);
535 if (p)
536 item = rb_entry(p, struct btrfs_delayed_item, rb_node);
537
538 return item;
539}
540
541struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
542 struct btrfs_delayed_node *delayed_node)
543{
544 struct rb_node *p;
545 struct btrfs_delayed_item *item = NULL;
546
547 p = rb_first(&delayed_node->del_root);
548 if (p)
549 item = rb_entry(p, struct btrfs_delayed_item, rb_node);
550
551 return item;
552}
553
554struct btrfs_delayed_item *__btrfs_next_delayed_item(
555 struct btrfs_delayed_item *item)
556{
557 struct rb_node *p;
558 struct btrfs_delayed_item *next = NULL;
559
560 p = rb_next(&item->rb_node);
561 if (p)
562 next = rb_entry(p, struct btrfs_delayed_item, rb_node);
563
564 return next;
565}
566
567static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
568 u64 root_id)
569{
570 struct btrfs_key root_key;
571
572 if (root->objectid == root_id)
573 return root;
574
575 root_key.objectid = root_id;
576 root_key.type = BTRFS_ROOT_ITEM_KEY;
577 root_key.offset = (u64)-1;
578 return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
579}
580
581static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
582 struct btrfs_root *root,
583 struct btrfs_delayed_item *item)
584{
585 struct btrfs_block_rsv *src_rsv;
586 struct btrfs_block_rsv *dst_rsv;
587 u64 num_bytes;
588 int ret;
589
590 if (!trans->bytes_reserved)
591 return 0;
592
593 src_rsv = trans->block_rsv;
594 dst_rsv = &root->fs_info->global_block_rsv;
595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
598 if (!ret)
599 item->bytes_reserved = num_bytes;
600
601 return ret;
602}
603
604static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
605 struct btrfs_delayed_item *item)
606{
607 struct btrfs_block_rsv *rsv;
608
609 if (!item->bytes_reserved)
610 return;
611
612 rsv = &root->fs_info->global_block_rsv;
613 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved);
615}
616
617static int btrfs_delayed_inode_reserve_metadata(
618 struct btrfs_trans_handle *trans,
619 struct btrfs_root *root,
620 struct btrfs_delayed_node *node)
621{
622 struct btrfs_block_rsv *src_rsv;
623 struct btrfs_block_rsv *dst_rsv;
624 u64 num_bytes;
625 int ret;
626
627 if (!trans->bytes_reserved)
628 return 0;
629
630 src_rsv = trans->block_rsv;
631 dst_rsv = &root->fs_info->global_block_rsv;
632
633 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
634 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
635 if (!ret)
636 node->bytes_reserved = num_bytes;
637
638 return ret;
639}
640
641static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
642 struct btrfs_delayed_node *node)
643{
644 struct btrfs_block_rsv *rsv;
645
646 if (!node->bytes_reserved)
647 return;
648
649 rsv = &root->fs_info->global_block_rsv;
650 btrfs_block_rsv_release(root, rsv,
651 node->bytes_reserved);
652 node->bytes_reserved = 0;
653}
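Each delayed item (and each dirty delayed inode) carries a metadata budget with it: reserving migrates btrfs_calc_trans_metadata_size(root, 1) bytes from the transaction's reservation into the global reservation, and the matching release happens only after the item has actually been written into the tree. A toy model of that hand-off (hypothetical names; the -ENOSPC return mirrors the behaviour of the real migrate helper):

#include <errno.h>

struct rsv { long bytes; };

static int rsv_migrate(struct rsv *src, struct rsv *dst, long n)
{
	if (src->bytes < n)
		return -ENOSPC;		/* budget exhausted */
	src->bytes -= n;
	dst->bytes += n;
	return 0;
}

static void rsv_release(struct rsv *r, long n)
{
	r->bytes -= n;			/* space is usable again */
}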
654
655/*
656 * This helper inserts as many continuous items as will fit into the same
657 * leaf, according to the leaf's free space.
658 */
659static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
660 struct btrfs_root *root,
661 struct btrfs_path *path,
662 struct btrfs_delayed_item *item)
663{
664 struct btrfs_delayed_item *curr, *next;
665 int free_space;
666 int total_data_size = 0, total_size = 0;
667 struct extent_buffer *leaf;
668 char *data_ptr;
669 struct btrfs_key *keys;
670 u32 *data_size;
671 struct list_head head;
672 int slot;
673 int nitems;
674 int i;
675 int ret = 0;
676
677 BUG_ON(!path->nodes[0]);
678
679 leaf = path->nodes[0];
680 free_space = btrfs_leaf_free_space(root, leaf);
681 INIT_LIST_HEAD(&head);
682
683 next = item;
684 nitems = 0;
685
686 /*
687 * count the number of continuous items that we can insert in one batch
688 */
689 while (total_size + next->data_len + sizeof(struct btrfs_item) <=
690 free_space) {
691 total_data_size += next->data_len;
692 total_size += next->data_len + sizeof(struct btrfs_item);
693 list_add_tail(&next->tree_list, &head);
694 nitems++;
695
696 curr = next;
697 next = __btrfs_next_delayed_item(curr);
698 if (!next)
699 break;
700
701 if (!btrfs_is_continuous_delayed_item(curr, next))
702 break;
703 }
704
705 if (!nitems) {
706 ret = 0;
707 goto out;
708 }
709
710 /*
711 * we need to allocate some memory space, but it might cause the task
712 * to sleep, so we set all locked nodes in the path to blocking locks
713 * first.
714 */
715 btrfs_set_path_blocking(path);
716
717 keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS);
718 if (!keys) {
719 ret = -ENOMEM;
720 goto out;
721 }
722
723 data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS);
724 if (!data_size) {
725 ret = -ENOMEM;
726 goto error;
727 }
728
729 /* get keys of all the delayed items */
730 i = 0;
731 list_for_each_entry(next, &head, tree_list) {
732 keys[i] = next->key;
733 data_size[i] = next->data_len;
734 i++;
735 }
736
737 /* reset all the locked nodes in the path to spinning locks. */
738 btrfs_clear_path_blocking(path, NULL);
739
740 /* insert the keys of the items */
741 ret = setup_items_for_insert(trans, root, path, keys, data_size,
742 total_data_size, total_size, nitems);
743 if (ret)
744 goto error;
745
746 /* insert the dir index items */
747 slot = path->slots[0];
748 list_for_each_entry_safe(curr, next, &head, tree_list) {
749 data_ptr = btrfs_item_ptr(leaf, slot, char);
750 write_extent_buffer(leaf, &curr->data,
751 (unsigned long)data_ptr,
752 curr->data_len);
753 slot++;
754
755 btrfs_delayed_item_release_metadata(root, curr);
756
757 list_del(&curr->tree_list);
758 btrfs_release_delayed_item(curr);
759 }
760
761error:
762 kfree(data_size);
763 kfree(keys);
764out:
765 return ret;
766}
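The loop above charges each batched item its payload (data_len) plus one leaf item header, i.e. sizeof(struct btrfs_item), 25 bytes on disk. A worked example with made-up numbers: a leaf with 1000 bytes free takes 15 dir index items of data_len 40, since 15 * (40 + 25) = 975 fits but a 16th item would need 1040 bytes:

#include <stdio.h>

int main(void)
{
	int free_space = 1000;	/* bytes left in the leaf (illustrative) */
	int header = 25;	/* sizeof(struct btrfs_item) on disk */
	int data_len = 40;	/* a small dir index item (illustrative) */
	int total = 0, nitems = 0;

	while (total + data_len + header <= free_space) {
		total += data_len + header;
		nitems++;
	}
	printf("%d items fit, %d bytes used\n", nitems, total); /* 15, 975 */
	return 0;
}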
767
768/*
769 * This helper only handles simple insertions that needn't extend an item
770 * for new data, such as directory name index insertion and inode insertion.
771 */
772static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
773 struct btrfs_root *root,
774 struct btrfs_path *path,
775 struct btrfs_delayed_item *delayed_item)
776{
777 struct extent_buffer *leaf;
778 struct btrfs_item *item;
779 char *ptr;
780 int ret;
781
782 ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
783 delayed_item->data_len);
784 if (ret < 0 && ret != -EEXIST)
785 return ret;
786
787 leaf = path->nodes[0];
788
789 item = btrfs_item_nr(leaf, path->slots[0]);
790 ptr = btrfs_item_ptr(leaf, path->slots[0], char);
791
792 write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
793 delayed_item->data_len);
794 btrfs_mark_buffer_dirty(leaf);
795
796 btrfs_delayed_item_release_metadata(root, delayed_item);
797 return 0;
798}
799
800/*
801 * we insert an item first, and then, if continuous items follow it, we try
802 * to insert those items into the same leaf.
803 */
804static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
805 struct btrfs_path *path,
806 struct btrfs_root *root,
807 struct btrfs_delayed_node *node)
808{
809 struct btrfs_delayed_item *curr, *prev;
810 int ret = 0;
811
812do_again:
813 mutex_lock(&node->mutex);
814 curr = __btrfs_first_delayed_insertion_item(node);
815 if (!curr)
816 goto insert_end;
817
818 ret = btrfs_insert_delayed_item(trans, root, path, curr);
819 if (ret < 0) {
820 btrfs_release_path(path);
821 goto insert_end;
822 }
823
824 prev = curr;
825 curr = __btrfs_next_delayed_item(prev);
826 if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
827 /* insert the continuous items into the same leaf */
828 path->slots[0]++;
829 btrfs_batch_insert_items(trans, root, path, curr);
830 }
831 btrfs_release_delayed_item(prev);
832 btrfs_mark_buffer_dirty(path->nodes[0]);
833
834 btrfs_release_path(path);
835 mutex_unlock(&node->mutex);
836 goto do_again;
837
838insert_end:
839 mutex_unlock(&node->mutex);
840 return ret;
841}
842
843static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
844 struct btrfs_root *root,
845 struct btrfs_path *path,
846 struct btrfs_delayed_item *item)
847{
848 struct btrfs_delayed_item *curr, *next;
849 struct extent_buffer *leaf;
850 struct btrfs_key key;
851 struct list_head head;
852 int nitems, i, last_item;
853 int ret = 0;
854
855 BUG_ON(!path->nodes[0]);
856
857 leaf = path->nodes[0];
858
859 i = path->slots[0];
860 last_item = btrfs_header_nritems(leaf) - 1;
861 if (i > last_item)
862 return -ENOENT; /* FIXME: Is errno suitable? */
863
864 next = item;
865 INIT_LIST_HEAD(&head);
866 btrfs_item_key_to_cpu(leaf, &key, i);
867 nitems = 0;
868 /*
869 * count the number of dir index items that we can delete in one batch
870 */
871 while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
872 list_add_tail(&next->tree_list, &head);
873 nitems++;
874
875 curr = next;
876 next = __btrfs_next_delayed_item(curr);
877 if (!next)
878 break;
879
880 if (!btrfs_is_continuous_delayed_item(curr, next))
881 break;
882
883 i++;
884 if (i > last_item)
885 break;
886 btrfs_item_key_to_cpu(leaf, &key, i);
887 }
888
889 if (!nitems)
890 return 0;
891
892 ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
893 if (ret)
894 goto out;
895
896 list_for_each_entry_safe(curr, next, &head, tree_list) {
897 btrfs_delayed_item_release_metadata(root, curr);
898 list_del(&curr->tree_list);
899 btrfs_release_delayed_item(curr);
900 }
901
902out:
903 return ret;
904}
905
906static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
907 struct btrfs_path *path,
908 struct btrfs_root *root,
909 struct btrfs_delayed_node *node)
910{
911 struct btrfs_delayed_item *curr, *prev;
912 int ret = 0;
913
914do_again:
915 mutex_lock(&node->mutex);
916 curr = __btrfs_first_delayed_deletion_item(node);
917 if (!curr)
918 goto delete_fail;
919
920 ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
921 if (ret < 0)
922 goto delete_fail;
923 else if (ret > 0) {
924 /*
925 * can't find the tree item this delayed item points to, so the
926 * delayed item is stale; just drop it.
927 */
928 prev = curr;
929 curr = __btrfs_next_delayed_item(prev);
930 btrfs_release_delayed_item(prev);
931 ret = 0;
932 btrfs_release_path(path);
933 if (curr)
934 goto do_again;
935 else
936 goto delete_fail;
937 }
938
939 btrfs_batch_delete_items(trans, root, path, curr);
940 btrfs_release_path(path);
941 mutex_unlock(&node->mutex);
942 goto do_again;
943
944delete_fail:
945 btrfs_release_path(path);
946 mutex_unlock(&node->mutex);
947 return ret;
948}
949
950static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
951{
952 struct btrfs_delayed_root *delayed_root;
953
954 if (delayed_node && delayed_node->inode_dirty) {
955 BUG_ON(!delayed_node->root);
956 delayed_node->inode_dirty = 0;
957 delayed_node->count--;
958
959 delayed_root = delayed_node->root->fs_info->delayed_root;
960 atomic_dec(&delayed_root->items);
961 if (atomic_read(&delayed_root->items) <
962 BTRFS_DELAYED_BACKGROUND &&
963 waitqueue_active(&delayed_root->wait))
964 wake_up(&delayed_root->wait);
965 }
966}
967
968static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
969 struct btrfs_root *root,
970 struct btrfs_path *path,
971 struct btrfs_delayed_node *node)
972{
973 struct btrfs_key key;
974 struct btrfs_inode_item *inode_item;
975 struct extent_buffer *leaf;
976 int ret;
977
978 mutex_lock(&node->mutex);
979 if (!node->inode_dirty) {
980 mutex_unlock(&node->mutex);
981 return 0;
982 }
983
984 key.objectid = node->inode_id;
985 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
986 key.offset = 0;
987 ret = btrfs_lookup_inode(trans, root, path, &key, 1);
988 if (ret > 0) {
989 btrfs_release_path(path);
990 mutex_unlock(&node->mutex);
991 return -ENOENT;
992 } else if (ret < 0) {
993 mutex_unlock(&node->mutex);
994 return ret;
995 }
996
997 btrfs_unlock_up_safe(path, 1);
998 leaf = path->nodes[0];
999 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1000 struct btrfs_inode_item);
1001 write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
1002 sizeof(struct btrfs_inode_item));
1003 btrfs_mark_buffer_dirty(leaf);
1004 btrfs_release_path(path);
1005
1006 btrfs_delayed_inode_release_metadata(root, node);
1007 btrfs_release_delayed_inode(node);
1008 mutex_unlock(&node->mutex);
1009
1010 return 0;
1011}
1012
1013/* Called when committing the transaction. */
1014int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1015 struct btrfs_root *root)
1016{
1017 struct btrfs_delayed_root *delayed_root;
1018 struct btrfs_delayed_node *curr_node, *prev_node;
1019 struct btrfs_path *path;
1020 struct btrfs_block_rsv *block_rsv;
1021 int ret = 0;
1022
1023 path = btrfs_alloc_path();
1024 if (!path)
1025 return -ENOMEM;
1026 path->leave_spinning = 1;
1027
1028 block_rsv = trans->block_rsv;
1029 trans->block_rsv = &root->fs_info->global_block_rsv;
1030
1031 delayed_root = btrfs_get_delayed_root(root);
1032
1033 curr_node = btrfs_first_delayed_node(delayed_root);
1034 while (curr_node) {
1035 root = curr_node->root;
1036 ret = btrfs_insert_delayed_items(trans, path, root,
1037 curr_node);
1038 if (!ret)
1039 ret = btrfs_delete_delayed_items(trans, path, root,
1040 curr_node);
1041 if (!ret)
1042 ret = btrfs_update_delayed_inode(trans, root, path,
1043 curr_node);
1044 if (ret) {
1045 btrfs_release_delayed_node(curr_node);
1046 break;
1047 }
1048
1049 prev_node = curr_node;
1050 curr_node = btrfs_next_delayed_node(curr_node);
1051 btrfs_release_delayed_node(prev_node);
1052 }
1053
1054 btrfs_free_path(path);
1055 trans->block_rsv = block_rsv;
1056 return ret;
1057}
1058
1059static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1060 struct btrfs_delayed_node *node)
1061{
1062 struct btrfs_path *path;
1063 struct btrfs_block_rsv *block_rsv;
1064 int ret;
1065
1066 path = btrfs_alloc_path();
1067 if (!path)
1068 return -ENOMEM;
1069 path->leave_spinning = 1;
1070
1071 block_rsv = trans->block_rsv;
1072 trans->block_rsv = &node->root->fs_info->global_block_rsv;
1073
1074 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1075 if (!ret)
1076 ret = btrfs_delete_delayed_items(trans, path, node->root, node);
1077 if (!ret)
1078 ret = btrfs_update_delayed_inode(trans, node->root, path, node);
1079 btrfs_free_path(path);
1080
1081 trans->block_rsv = block_rsv;
1082 return ret;
1083}
1084
1085int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1086 struct inode *inode)
1087{
1088 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1089 int ret;
1090
1091 if (!delayed_node)
1092 return 0;
1093
1094 mutex_lock(&delayed_node->mutex);
1095 if (!delayed_node->count) {
1096 mutex_unlock(&delayed_node->mutex);
1097 btrfs_release_delayed_node(delayed_node);
1098 return 0;
1099 }
1100 mutex_unlock(&delayed_node->mutex);
1101
1102 ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
1103 btrfs_release_delayed_node(delayed_node);
1104 return ret;
1105}
1106
1107void btrfs_remove_delayed_node(struct inode *inode)
1108{
1109 struct btrfs_delayed_node *delayed_node;
1110
1111 delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node);
1112 if (!delayed_node)
1113 return;
1114
1115 BTRFS_I(inode)->delayed_node = NULL;
1116 btrfs_release_delayed_node(delayed_node);
1117}
1118
1119struct btrfs_async_delayed_node {
1120 struct btrfs_root *root;
1121 struct btrfs_delayed_node *delayed_node;
1122 struct btrfs_work work;
1123};
1124
1125static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1126{
1127 struct btrfs_async_delayed_node *async_node;
1128 struct btrfs_trans_handle *trans;
1129 struct btrfs_path *path;
1130 struct btrfs_delayed_node *delayed_node = NULL;
1131 struct btrfs_root *root;
1132 struct btrfs_block_rsv *block_rsv;
1133 unsigned long nr = 0;
1134 int need_requeue = 0;
1135 int ret;
1136
1137 async_node = container_of(work, struct btrfs_async_delayed_node, work);
1138
1139 path = btrfs_alloc_path();
1140 if (!path)
1141 goto out;
1142 path->leave_spinning = 1;
1143
1144 delayed_node = async_node->delayed_node;
1145 root = delayed_node->root;
1146
1147 trans = btrfs_join_transaction(root);
1148 if (IS_ERR(trans))
1149 goto free_path;
1150
1151 block_rsv = trans->block_rsv;
1152 trans->block_rsv = &root->fs_info->global_block_rsv;
1153
1154 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
1155 if (!ret)
1156 ret = btrfs_delete_delayed_items(trans, path, root,
1157 delayed_node);
1158
1159 if (!ret)
1160 btrfs_update_delayed_inode(trans, root, path, delayed_node);
1161
1162 /*
1163 * Maybe new delayed items have been inserted, so we need to requeue
1164 * the work. Besides that, we must dequeue the empty delayed nodes
1165 * to avoid a race between the delayed-items balance and the worker.
1166 * The race goes like this:
1167 * Task1 Worker thread
1168 * count == 0, needn't requeue
1169 * also needn't insert the
1170 * delayed node into prepare
1171 * list again.
1172 * add lots of delayed items
1173 * queue the delayed node
1174 * already in the list,
1175 * and not in the prepare
1176 * list, it means the delayed
1177 * node is being dealt with
1178 * by the worker.
1179 * do delayed items balance
1180 * the delayed node is being
1181 * dealt with by the worker
1182 * now, just wait.
1183 * the worker goto idle.
1184 * Task1 will sleep until the transaction is committed.
1185 */
1186 mutex_lock(&delayed_node->mutex);
1187 if (delayed_node->count)
1188 need_requeue = 1;
1189 else
1190 btrfs_dequeue_delayed_node(root->fs_info->delayed_root,
1191 delayed_node);
1192 mutex_unlock(&delayed_node->mutex);
1193
1194 nr = trans->blocks_used;
1195
1196 trans->block_rsv = block_rsv;
1197 btrfs_end_transaction_dmeta(trans, root);
1198 __btrfs_btree_balance_dirty(root, nr);
1199free_path:
1200 btrfs_free_path(path);
1201out:
1202 if (need_requeue)
1203 btrfs_requeue_work(&async_node->work);
1204 else {
1205 btrfs_release_prepared_delayed_node(delayed_node);
1206 kfree(async_node);
1207 }
1208}
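The requeue decision above is the classic recheck-under-the-lock idiom: producers add items and queue the node while holding delayed_node->mutex, so a final count check under the same mutex cannot miss a concurrent insertion; either the worker sees the new item and requeues itself, or the node is dequeued and the next producer queues it again. Condensed, with hypothetical requeue()/dequeue() helpers:

	mutex_lock(&node->mutex);
	if (node->count)
		requeue(node);	/* new work slipped in while flushing */
	else
		dequeue(node);	/* drained: producers must queue it anew */
	mutex_unlock(&node->mutex);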
1209
1210static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1211 struct btrfs_root *root, int all)
1212{
1213 struct btrfs_async_delayed_node *async_node;
1214 struct btrfs_delayed_node *curr;
1215 int count = 0;
1216
1217again:
1218 curr = btrfs_first_prepared_delayed_node(delayed_root);
1219 if (!curr)
1220 return 0;
1221
1222 async_node = kmalloc(sizeof(*async_node), GFP_NOFS);
1223 if (!async_node) {
1224 btrfs_release_prepared_delayed_node(curr);
1225 return -ENOMEM;
1226 }
1227
1228 async_node->root = root;
1229 async_node->delayed_node = curr;
1230
1231 async_node->work.func = btrfs_async_run_delayed_node_done;
1232 async_node->work.flags = 0;
1233
1234 btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work);
1235 count++;
1236
1237 if (all || count < 4)
1238 goto again;
1239
1240 return 0;
1241}
1242
1243void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
1244{
1245 struct btrfs_delayed_root *delayed_root;
1246 delayed_root = btrfs_get_delayed_root(root);
1247 WARN_ON(btrfs_first_delayed_node(delayed_root));
1248}
1249
1250void btrfs_balance_delayed_items(struct btrfs_root *root)
1251{
1252 struct btrfs_delayed_root *delayed_root;
1253
1254 delayed_root = btrfs_get_delayed_root(root);
1255
1256 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
1257 return;
1258
1259 if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
1260 int ret;
1261 ret = btrfs_wq_run_delayed_node(delayed_root, root, 1);
1262 if (ret)
1263 return;
1264
1265 wait_event_interruptible_timeout(
1266 delayed_root->wait,
1267 (atomic_read(&delayed_root->items) <
1268 BTRFS_DELAYED_BACKGROUND),
1269 HZ);
1270 return;
1271 }
1272
1273 btrfs_wq_run_delayed_node(delayed_root, root, 0);
1274}
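This is where the two thresholds defined at the top of the file get their meaning: below BTRFS_DELAYED_BACKGROUND (100) items nothing happens; from there up to BTRFS_DELAYED_WRITEBACK (400) a background flush of up to four prepared nodes is kicked off; and at or above the writeback mark every prepared node is flushed and the caller waits, interruptibly and for at most HZ jiffies (one second), for the backlog to drop back under the background mark. The policy, restated as a stand-alone sketch:

enum balance_action { DO_NOTHING, FLUSH_SOME, FLUSH_ALL_AND_WAIT };

static enum balance_action balance_policy(int items)
{
	if (items < 100)	/* BTRFS_DELAYED_BACKGROUND */
		return DO_NOTHING;
	if (items >= 400)	/* BTRFS_DELAYED_WRITEBACK */
		return FLUSH_ALL_AND_WAIT;
	return FLUSH_SOME;	/* at most 4 prepared nodes */
}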
1275
1276int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1277 struct btrfs_root *root, const char *name,
1278 int name_len, struct inode *dir,
1279 struct btrfs_disk_key *disk_key, u8 type,
1280 u64 index)
1281{
1282 struct btrfs_delayed_node *delayed_node;
1283 struct btrfs_delayed_item *delayed_item;
1284 struct btrfs_dir_item *dir_item;
1285 int ret;
1286
1287 delayed_node = btrfs_get_or_create_delayed_node(dir);
1288 if (IS_ERR(delayed_node))
1289 return PTR_ERR(delayed_node);
1290
1291 delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
1292 if (!delayed_item) {
1293 ret = -ENOMEM;
1294 goto release_node;
1295 }
1296
1297 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1298 /*
1299 * we reserved enough space when we started the transaction, so
1300 * a metadata reservation failure here is impossible
1301 */
1302 BUG_ON(ret);
1303
1304 delayed_item->key.objectid = btrfs_ino(dir);
1305 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
1306 delayed_item->key.offset = index;
1307
1308 dir_item = (struct btrfs_dir_item *)delayed_item->data;
1309 dir_item->location = *disk_key;
1310 dir_item->transid = cpu_to_le64(trans->transid);
1311 dir_item->data_len = 0;
1312 dir_item->name_len = cpu_to_le16(name_len);
1313 dir_item->type = type;
1314 memcpy((char *)(dir_item + 1), name, name_len);
1315
1316 mutex_lock(&delayed_node->mutex);
1317 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
1318 if (unlikely(ret)) {
1319 printk(KERN_ERR "error adding delayed dir index item (name: %s) "
1320 "into the insertion tree of the delayed node "
1321 "(root id: %llu, inode id: %llu, errno: %d)\n",
1322 name,
1323 (unsigned long long)delayed_node->root->objectid,
1324 (unsigned long long)delayed_node->inode_id,
1325 ret);
1326 BUG();
1327 }
1328 mutex_unlock(&delayed_node->mutex);
1329
1330release_node:
1331 btrfs_release_delayed_node(delayed_node);
1332 return ret;
1333}
1334
1335static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root,
1336 struct btrfs_delayed_node *node,
1337 struct btrfs_key *key)
1338{
1339 struct btrfs_delayed_item *item;
1340
1341 mutex_lock(&node->mutex);
1342 item = __btrfs_lookup_delayed_insertion_item(node, key);
1343 if (!item) {
1344 mutex_unlock(&node->mutex);
1345 return 1;
1346 }
1347
1348 btrfs_delayed_item_release_metadata(root, item);
1349 btrfs_release_delayed_item(item);
1350 mutex_unlock(&node->mutex);
1351 return 0;
1352}
1353
1354int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
1355 struct btrfs_root *root, struct inode *dir,
1356 u64 index)
1357{
1358 struct btrfs_delayed_node *node;
1359 struct btrfs_delayed_item *item;
1360 struct btrfs_key item_key;
1361 int ret;
1362
1363 node = btrfs_get_or_create_delayed_node(dir);
1364 if (IS_ERR(node))
1365 return PTR_ERR(node);
1366
1367 item_key.objectid = btrfs_ino(dir);
1368 btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY);
1369 item_key.offset = index;
1370
1371 ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
1372 if (!ret)
1373 goto end;
1374
1375 item = btrfs_alloc_delayed_item(0);
1376 if (!item) {
1377 ret = -ENOMEM;
1378 goto end;
1379 }
1380
1381 item->key = item_key;
1382
1383 ret = btrfs_delayed_item_reserve_metadata(trans, root, item);
1384 /*
1385 * we reserved enough space when we started the transaction, so
1386 * a metadata reservation failure here is impossible.
1387 */
1388 BUG_ON(ret);
1389
1390 mutex_lock(&node->mutex);
1391 ret = __btrfs_add_delayed_deletion_item(node, item);
1392 if (unlikely(ret)) {
1393 printk(KERN_ERR "error adding delayed dir index item (index: %llu) "
1394 "into the deletion tree of the delayed node "
1395 "(root id: %llu, inode id: %llu, errno: %d)\n",
1396 (unsigned long long)index,
1397 (unsigned long long)node->root->objectid,
1398 (unsigned long long)node->inode_id,
1399 ret);
1400 BUG();
1401 }
1402 mutex_unlock(&node->mutex);
1403end:
1404 btrfs_release_delayed_node(node);
1405 return ret;
1406}
1407
1408int btrfs_inode_delayed_dir_index_count(struct inode *inode)
1409{
1410 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1411
1412 if (!delayed_node)
1413 return -ENOENT;
1414
1415 /*
1416 * Since we hold the i_mutex of this directory, no new directory index
1417 * can be added into the delayed node and index_cnt cannot be updated
1418 * right now. So we needn't lock the delayed node.
1419 */
1420 if (!delayed_node->index_cnt) {
1421 btrfs_release_delayed_node(delayed_node);
1422 return -EINVAL;
1423 }
1424
1425 BTRFS_I(inode)->index_cnt = delayed_node->index_cnt;
1426 btrfs_release_delayed_node(delayed_node);
1427 return 0;
1428}
1429
1430void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
1431 struct list_head *del_list)
1432{
1433 struct btrfs_delayed_node *delayed_node;
1434 struct btrfs_delayed_item *item;
1435
1436 delayed_node = btrfs_get_delayed_node(inode);
1437 if (!delayed_node)
1438 return;
1439
1440 mutex_lock(&delayed_node->mutex);
1441 item = __btrfs_first_delayed_insertion_item(delayed_node);
1442 while (item) {
1443 atomic_inc(&item->refs);
1444 list_add_tail(&item->readdir_list, ins_list);
1445 item = __btrfs_next_delayed_item(item);
1446 }
1447
1448 item = __btrfs_first_delayed_deletion_item(delayed_node);
1449 while (item) {
1450 atomic_inc(&item->refs);
1451 list_add_tail(&item->readdir_list, del_list);
1452 item = __btrfs_next_delayed_item(item);
1453 }
1454 mutex_unlock(&delayed_node->mutex);
1455 /*
1456 * This delayed node is still cached in the btrfs inode, so refs
1457 * must be > 1 now, and we needn't check whether it is going to be
1458 * freed.
1459 *
1460 * Besides that, this function is used for readdir only, and we do
1461 * not insert/delete delayed items during that period. So we also
1462 * needn't requeue or dequeue this delayed node.
1463 */
1464 atomic_dec(&delayed_node->refs);
1465}
1466
1467void btrfs_put_delayed_items(struct list_head *ins_list,
1468 struct list_head *del_list)
1469{
1470 struct btrfs_delayed_item *curr, *next;
1471
1472 list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1473 list_del(&curr->readdir_list);
1474 if (atomic_dec_and_test(&curr->refs))
1475 kfree(curr);
1476 }
1477
1478 list_for_each_entry_safe(curr, next, del_list, readdir_list) {
1479 list_del(&curr->readdir_list);
1480 if (atomic_dec_and_test(&curr->refs))
1481 kfree(curr);
1482 }
1483}
1484
1485int btrfs_should_delete_dir_index(struct list_head *del_list,
1486 u64 index)
1487{
1488 struct btrfs_delayed_item *curr, *next;
1489 int ret;
1490
1491 if (list_empty(del_list))
1492 return 0;
1493
1494 list_for_each_entry_safe(curr, next, del_list, readdir_list) {
1495 if (curr->key.offset > index)
1496 break;
1497
1498 list_del(&curr->readdir_list);
1499 ret = (curr->key.offset == index);
1500
1501 if (atomic_dec_and_test(&curr->refs))
1502 kfree(curr);
1503
1504 if (ret)
1505 return 1;
1506 else
1507 continue;
1508 }
1509 return 0;
1510}
1511
1512/*
1513 * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
1514 *
1515 */
1516int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
1517 filldir_t filldir,
1518 struct list_head *ins_list)
1519{
1520 struct btrfs_dir_item *di;
1521 struct btrfs_delayed_item *curr, *next;
1522 struct btrfs_key location;
1523 char *name;
1524 int name_len;
1525 int over = 0;
1526 unsigned char d_type;
1527
1528 if (list_empty(ins_list))
1529 return 0;
1530
1531 /*
1532 * The data of a delayed item can never change, so we needn't
1533 * lock it. And since we hold the i_mutex of the directory,
1534 * nobody can delete any directory index now.
1535 */
1536 list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1537 list_del(&curr->readdir_list);
1538
1539 if (curr->key.offset < filp->f_pos) {
1540 if (atomic_dec_and_test(&curr->refs))
1541 kfree(curr);
1542 continue;
1543 }
1544
1545 filp->f_pos = curr->key.offset;
1546
1547 di = (struct btrfs_dir_item *)curr->data;
1548 name = (char *)(di + 1);
1549 name_len = le16_to_cpu(di->name_len);
1550
1551 d_type = btrfs_filetype_table[di->type];
1552 btrfs_disk_key_to_cpu(&location, &di->location);
1553
1554 over = filldir(dirent, name, name_len, curr->key.offset,
1555 location.objectid, d_type);
1556
1557 if (atomic_dec_and_test(&curr->refs))
1558 kfree(curr);
1559
1560 if (over)
1561 return 1;
1562 }
1563 return 0;
1564}
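Taken together, the last four functions form the readdir-side protocol for delayed items. A sketch of the call order a reader follows (in this patch series the caller is btrfs_real_readdir() in inode.c; this is not a copy of that function):

	LIST_HEAD(ins_list);
	LIST_HEAD(del_list);

	btrfs_get_delayed_items(inode, &ins_list, &del_list);

	/* inside the loop over the on-disk dir index items: */
	if (btrfs_should_delete_dir_index(&del_list, found_index))
		continue;	/* a delayed deletion hides this entry */

	/* after the on-disk items are exhausted: */
	btrfs_readdir_delayed_dir_index(filp, dirent, filldir, &ins_list);

	btrfs_put_delayed_items(&ins_list, &del_list);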
1565
1566BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
1567 generation, 64);
1568BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
1569 sequence, 64);
1570BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
1571 transid, 64);
1572BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
1573BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
1574 nbytes, 64);
1575BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
1576 block_group, 64);
1577BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
1578BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
1579BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
1580BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
1581BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
1582BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
1583
1584BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
1585BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
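BTRFS_SETGET_STACK_FUNCS() (defined in ctree.h) stamps out endian-safe getter/setter pairs for fields of an on-stack copy of an on-disk structure. Roughly, the size line above expands to the following (sketched from the 64-bit flavour of the macro; the on-disk field is little-endian):

static inline u64 btrfs_stack_inode_size(struct btrfs_inode_item *s)
{
	return le64_to_cpu(s->size);
}

static inline void btrfs_set_stack_inode_size(struct btrfs_inode_item *s,
					      u64 val)
{
	s->size = cpu_to_le64(val);
}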
1586
1587static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
1588 struct btrfs_inode_item *inode_item,
1589 struct inode *inode)
1590{
1591 btrfs_set_stack_inode_uid(inode_item, inode->i_uid);
1592 btrfs_set_stack_inode_gid(inode_item, inode->i_gid);
1593 btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
1594 btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
1595 btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
1596 btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
1597 btrfs_set_stack_inode_generation(inode_item,
1598 BTRFS_I(inode)->generation);
1599 btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence);
1600 btrfs_set_stack_inode_transid(inode_item, trans->transid);
1601 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
1602 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
1603 btrfs_set_stack_inode_block_group(inode_item, 0);
1604
1605 btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
1606 inode->i_atime.tv_sec);
1607 btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item),
1608 inode->i_atime.tv_nsec);
1609
1610 btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item),
1611 inode->i_mtime.tv_sec);
1612 btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item),
1613 inode->i_mtime.tv_nsec);
1614
1615 btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item),
1616 inode->i_ctime.tv_sec);
1617 btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item),
1618 inode->i_ctime.tv_nsec);
1619}
1620
1621int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1622{
1623 struct btrfs_delayed_node *delayed_node;
1624 struct btrfs_inode_item *inode_item;
1625 struct btrfs_timespec *tspec;
1626
1627 delayed_node = btrfs_get_delayed_node(inode);
1628 if (!delayed_node)
1629 return -ENOENT;
1630
1631 mutex_lock(&delayed_node->mutex);
1632 if (!delayed_node->inode_dirty) {
1633 mutex_unlock(&delayed_node->mutex);
1634 btrfs_release_delayed_node(delayed_node);
1635 return -ENOENT;
1636 }
1637
1638 inode_item = &delayed_node->inode_item;
1639
1640 inode->i_uid = btrfs_stack_inode_uid(inode_item);
1641 inode->i_gid = btrfs_stack_inode_gid(inode_item);
1642 btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
1643 inode->i_mode = btrfs_stack_inode_mode(inode_item);
1644 inode->i_nlink = btrfs_stack_inode_nlink(inode_item);
1645 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
1646 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
1647 BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
1648 inode->i_rdev = 0;
1649 *rdev = btrfs_stack_inode_rdev(inode_item);
1650 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
1651
1652 tspec = btrfs_inode_atime(inode_item);
1653 inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec);
1654 inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
1655
1656 tspec = btrfs_inode_mtime(inode_item);
1657 inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec);
1658 inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
1659
1660 tspec = btrfs_inode_ctime(inode_item);
1661 inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec);
1662 inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
1663
1664 inode->i_generation = BTRFS_I(inode)->generation;
1665 BTRFS_I(inode)->index_cnt = (u64)-1;
1666
1667 mutex_unlock(&delayed_node->mutex);
1668 btrfs_release_delayed_node(delayed_node);
1669 return 0;
1670}
1671
1672int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1673 struct btrfs_root *root, struct inode *inode)
1674{
1675 struct btrfs_delayed_node *delayed_node;
1676 int ret = 0;
1677
1678 delayed_node = btrfs_get_or_create_delayed_node(inode);
1679 if (IS_ERR(delayed_node))
1680 return PTR_ERR(delayed_node);
1681
1682 mutex_lock(&delayed_node->mutex);
1683 if (delayed_node->inode_dirty) {
1684 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1685 goto release_node;
1686 }
1687
1688 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
1689 /*
1690 * we must reserve enough space when we start a new transaction,
1691 * so reserving metadata failure is impossible
1692 */
1693 BUG_ON(ret);
1694
1695 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1696 delayed_node->inode_dirty = 1;
1697 delayed_node->count++;
1698 atomic_inc(&root->fs_info->delayed_root->items);
1699release_node:
1700 mutex_unlock(&delayed_node->mutex);
1701 btrfs_release_delayed_node(delayed_node);
1702 return ret;
1703}
1704
1705static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
1706{
1707 struct btrfs_root *root = delayed_node->root;
1708 struct btrfs_delayed_item *curr_item, *prev_item;
1709
1710 mutex_lock(&delayed_node->mutex);
1711 curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
1712 while (curr_item) {
1713 btrfs_delayed_item_release_metadata(root, curr_item);
1714 prev_item = curr_item;
1715 curr_item = __btrfs_next_delayed_item(prev_item);
1716 btrfs_release_delayed_item(prev_item);
1717 }
1718
1719 curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
1720 while (curr_item) {
1721 btrfs_delayed_item_release_metadata(root, curr_item);
1722 prev_item = curr_item;
1723 curr_item = __btrfs_next_delayed_item(prev_item);
1724 btrfs_release_delayed_item(prev_item);
1725 }
1726
1727 if (delayed_node->inode_dirty) {
1728 btrfs_delayed_inode_release_metadata(root, delayed_node);
1729 btrfs_release_delayed_inode(delayed_node);
1730 }
1731 mutex_unlock(&delayed_node->mutex);
1732}
1733
1734void btrfs_kill_delayed_inode_items(struct inode *inode)
1735{
1736 struct btrfs_delayed_node *delayed_node;
1737
1738 delayed_node = btrfs_get_delayed_node(inode);
1739 if (!delayed_node)
1740 return;
1741
1742 __btrfs_kill_delayed_node(delayed_node);
1743 btrfs_release_delayed_node(delayed_node);
1744}
1745
1746void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
1747{
1748 u64 inode_id = 0;
1749 struct btrfs_delayed_node *delayed_nodes[8];
1750 int i, n;
1751
1752 while (1) {
1753 spin_lock(&root->inode_lock);
1754 n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
1755 (void **)delayed_nodes, inode_id,
1756 ARRAY_SIZE(delayed_nodes));
1757 if (!n) {
1758 spin_unlock(&root->inode_lock);
1759 break;
1760 }
1761
1762 inode_id = delayed_nodes[n - 1]->inode_id + 1;
1763
1764 for (i = 0; i < n; i++)
1765 atomic_inc(&delayed_nodes[i]->refs);
1766 spin_unlock(&root->inode_lock);
1767
1768 for (i = 0; i < n; i++) {
1769 __btrfs_kill_delayed_node(delayed_nodes[i]);
1770 btrfs_release_delayed_node(delayed_nodes[i]);
1771 }
1772 }
1773}
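
The loop above drains every delayed node by walking root->delayed_nodes_tree in batches of eight: radix_tree_gang_lookup() returns up to ARRAY_SIZE(delayed_nodes) entries at or after inode_id, and the cursor is advanced past the last hit so the scan always makes progress. A minimal userspace sketch of the same batched-cursor idiom (toy table and names are assumptions, not btrfs code):

#include <stdio.h>
#include <stddef.h>

#define BATCH 8

struct node { unsigned long long id; };

static struct node table[] = {
	{1}, {4}, {9}, {10}, {11}, {12}, {13}, {14}, {15}, {23}
};

/* toy stand-in for radix_tree_gang_lookup(): up to max hits at or after first */
static size_t gang_lookup(struct node **out, unsigned long long first, size_t max)
{
	size_t n = 0;
	for (size_t i = 0; i < sizeof(table) / sizeof(table[0]) && n < max; i++)
		if (table[i].id >= first)
			out[n++] = &table[i];
	return n;
}

int main(void)
{
	struct node *nodes[BATCH];
	unsigned long long cursor = 0;
	size_t n;

	while ((n = gang_lookup(nodes, cursor, BATCH)) != 0) {
		/* advance past the last entry so the next pass makes progress */
		cursor = nodes[n - 1]->id + 1;
		for (size_t i = 0; i < n; i++)
			printf("killing node %llu\n", nodes[i]->id);
	}
	return 0;
}
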
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
new file mode 100644
index 000000000000..8d27af4bd8b9
--- /dev/null
+++ b/fs/btrfs/delayed-inode.h
@@ -0,0 +1,145 @@
1/*
2 * Copyright (C) 2011 Fujitsu. All rights reserved.
3 * Written by Miao Xie <miaox@cn.fujitsu.com>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
18 */
19
20#ifndef __DELAYED_TREE_OPERATION_H
21#define __DELAYED_TREE_OPERATION_H
22
23#include <linux/rbtree.h>
24#include <linux/spinlock.h>
25#include <linux/mutex.h>
26#include <linux/list.h>
27#include <linux/wait.h>
28#include <asm/atomic.h>
29
30#include "ctree.h"
31
32/* types of the delayed item */
33#define BTRFS_DELAYED_INSERTION_ITEM 1
34#define BTRFS_DELAYED_DELETION_ITEM 2
35
36struct btrfs_delayed_root {
37 spinlock_t lock;
38 struct list_head node_list;
39 /*
40 * Used for delayed nodes which are waiting to be dealt with by the
41 * worker. If the delayed node is inserted into the work queue, we
42 * drop it from this list.
43 */
44 struct list_head prepare_list;
45 atomic_t items; /* for delayed items */
46 int nodes; /* for delayed nodes */
47 wait_queue_head_t wait;
48};
49
50struct btrfs_delayed_node {
51 u64 inode_id;
52 u64 bytes_reserved;
53 struct btrfs_root *root;
54 /* Used to add the node into the delayed root's node list. */
55 struct list_head n_list;
56 /*
57 * Used to add the node into the prepare list; the nodes in this list
58 * are waiting to be dealt with by the async worker.
59 */
60 struct list_head p_list;
61 struct rb_root ins_root;
62 struct rb_root del_root;
63 struct mutex mutex;
64 struct btrfs_inode_item inode_item;
65 atomic_t refs;
66 u64 index_cnt;
67 bool in_list;
68 bool inode_dirty;
69 int count;
70};
71
72struct btrfs_delayed_item {
73 struct rb_node rb_node;
74 struct btrfs_key key;
75 struct list_head tree_list; /* used for batch insert/delete items */
76 struct list_head readdir_list; /* used for readdir items */
77 u64 bytes_reserved;
78 struct btrfs_delayed_node *delayed_node;
79 atomic_t refs;
80 int ins_or_del;
81 u32 data_len;
82 char data[0];
83};
84
85static inline void btrfs_init_delayed_root(
86 struct btrfs_delayed_root *delayed_root)
87{
88 atomic_set(&delayed_root->items, 0);
89 delayed_root->nodes = 0;
90 spin_lock_init(&delayed_root->lock);
91 init_waitqueue_head(&delayed_root->wait);
92 INIT_LIST_HEAD(&delayed_root->node_list);
93 INIT_LIST_HEAD(&delayed_root->prepare_list);
94}
95
96int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root, const char *name,
98 int name_len, struct inode *dir,
99 struct btrfs_disk_key *disk_key, u8 type,
100 u64 index);
101
102int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root, struct inode *dir,
104 u64 index);
105
106int btrfs_inode_delayed_dir_index_count(struct inode *inode);
107
108int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root);
110
111void btrfs_balance_delayed_items(struct btrfs_root *root);
112
113int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
114 struct inode *inode);
115/* Used for evicting the inode. */
116void btrfs_remove_delayed_node(struct inode *inode);
117void btrfs_kill_delayed_inode_items(struct inode *inode);
118
119
120int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
121 struct btrfs_root *root, struct inode *inode);
122int btrfs_fill_inode(struct inode *inode, u32 *rdev);
123
124/* Used for drop dead root */
125void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
126
127/* Used for readdir() */
128void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
129 struct list_head *del_list);
130void btrfs_put_delayed_items(struct list_head *ins_list,
131 struct list_head *del_list);
132int btrfs_should_delete_dir_index(struct list_head *del_list,
133 u64 index);
134int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
135 filldir_t filldir,
136 struct list_head *ins_list);
137
138/* for init */
139int __init btrfs_delayed_inode_init(void);
140void btrfs_delayed_inode_exit(void);
141
142/* for debugging */
143void btrfs_assert_delayed_root_empty(struct btrfs_root *root);
144
145#endif
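
The readdir helpers declared above are meant to be used together. A hedged sketch of the calling sequence follows; everything except the four btrfs_*_delayed_* calls (the filp/filldir plumbing, the on-disk index walk) is an assumption for illustration, and the real consumer is btrfs_real_readdir() in inode.c:

/* sketch only: compiles in in-tree context, not standalone */
static int example_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	LIST_HEAD(ins_list);
	LIST_HEAD(del_list);
	int ret;

	/* snapshot this directory's pending insertions and deletions */
	btrfs_get_delayed_items(inode, &ins_list, &del_list);

	/*
	 * ... walk the committed BTRFS_DIR_INDEX_KEY items here, and for
	 * each index skip entries shadowed by a pending deletion:
	 *
	 *	if (btrfs_should_delete_dir_index(&del_list, index))
	 *		continue;
	 */

	/* then emit entries that so far exist only as delayed insertions */
	ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, &ins_list);

	btrfs_put_delayed_items(&ins_list, &del_list);
	return ret;
}
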
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e807b143b857..125cf76fcd08 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -281,44 +281,6 @@ again:
281} 281}
282 282
283/* 283/*
284 * This checks to see if there are any delayed refs in the
285 * btree for a given bytenr. It returns one if it finds any
286 * and zero otherwise.
287 *
288 * If it only finds a head node, it returns 0.
289 *
290 * The idea is to use this when deciding if you can safely delete an
291 * extent from the extent allocation tree. There may be a pending
292 * ref in the rbtree that adds or removes references, so as long as this
293 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
294 * allocation tree.
295 */
296int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
297{
298 struct btrfs_delayed_ref_node *ref;
299 struct btrfs_delayed_ref_root *delayed_refs;
300 struct rb_node *prev_node;
301 int ret = 0;
302
303 delayed_refs = &trans->transaction->delayed_refs;
304 spin_lock(&delayed_refs->lock);
305
306 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
307 if (ref) {
308 prev_node = rb_prev(&ref->rb_node);
309 if (!prev_node)
310 goto out;
311 ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
312 rb_node);
313 if (ref->bytenr == bytenr)
314 ret = 1;
315 }
316out:
317 spin_unlock(&delayed_refs->lock);
318 return ret;
319}
320
321/*
322 * helper function to update an extent delayed ref in the 284 * helper function to update an extent delayed ref in the
323 * rbtree. existing and update must both have the same 285 * rbtree. existing and update must both have the same
324 * bytenr and parent 286 * bytenr and parent
@@ -483,6 +445,8 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
483 INIT_LIST_HEAD(&head_ref->cluster); 445 INIT_LIST_HEAD(&head_ref->cluster);
484 mutex_init(&head_ref->mutex); 446 mutex_init(&head_ref->mutex);
485 447
448 trace_btrfs_delayed_ref_head(ref, head_ref, action);
449
486 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 450 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
487 451
488 if (existing) { 452 if (existing) {
@@ -537,6 +501,8 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
537 } 501 }
538 full_ref->level = level; 502 full_ref->level = level;
539 503
504 trace_btrfs_delayed_tree_ref(ref, full_ref, action);
505
540 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 506 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
541 507
542 if (existing) { 508 if (existing) {
@@ -591,6 +557,8 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
591 full_ref->objectid = owner; 557 full_ref->objectid = owner;
592 full_ref->offset = offset; 558 full_ref->offset = offset;
593 559
560 trace_btrfs_delayed_data_ref(ref, full_ref, action);
561
594 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 562 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
595 563
596 if (existing) { 564 if (existing) {
@@ -741,79 +709,3 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
741 return btrfs_delayed_node_to_head(ref); 709 return btrfs_delayed_node_to_head(ref);
742 return NULL; 710 return NULL;
743} 711}
744
745/*
746 * add a delayed ref to the tree. This does all of the accounting required
747 * to make sure the delayed ref is eventually processed before this
748 * transaction commits.
749 *
750 * The main point of this call is to add and remove a backreference in a single
751 * shot, taking the lock only once, and only searching for the head node once.
752 *
753 * It is the same as doing a ref add and delete in two separate calls.
754 */
755#if 0
756int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
757 u64 bytenr, u64 num_bytes, u64 orig_parent,
758 u64 parent, u64 orig_ref_root, u64 ref_root,
759 u64 orig_ref_generation, u64 ref_generation,
760 u64 owner_objectid, int pin)
761{
762 struct btrfs_delayed_ref *ref;
763 struct btrfs_delayed_ref *old_ref;
764 struct btrfs_delayed_ref_head *head_ref;
765 struct btrfs_delayed_ref_root *delayed_refs;
766 int ret;
767
768 ref = kmalloc(sizeof(*ref), GFP_NOFS);
769 if (!ref)
770 return -ENOMEM;
771
772 old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
773 if (!old_ref) {
774 kfree(ref);
775 return -ENOMEM;
776 }
777
778 /*
779 * the parent = 0 case comes from cases where we don't actually
780 * know the parent yet. It will get updated later via a add/drop
781 * pair.
782 */
783 if (parent == 0)
784 parent = bytenr;
785 if (orig_parent == 0)
786 orig_parent = bytenr;
787
788 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
789 if (!head_ref) {
790 kfree(ref);
791 kfree(old_ref);
792 return -ENOMEM;
793 }
794 delayed_refs = &trans->transaction->delayed_refs;
795 spin_lock(&delayed_refs->lock);
796
797 /*
798 * insert both the head node and the new ref without dropping
799 * the spin lock
800 */
801 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
802 (u64)-1, 0, 0, 0,
803 BTRFS_UPDATE_DELAYED_HEAD, 0);
804 BUG_ON(ret);
805
806 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
807 parent, ref_root, ref_generation,
808 owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
809 BUG_ON(ret);
810
811 ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
812 orig_parent, orig_ref_root,
813 orig_ref_generation, owner_objectid,
814 BTRFS_DROP_DELAYED_REF, pin);
815 BUG_ON(ret);
816 spin_unlock(&delayed_refs->lock);
817 return 0;
818}
819#endif
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 50e3cf92fbda..e287e3b0eab0 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -166,12 +166,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
166 166
167struct btrfs_delayed_ref_head * 167struct btrfs_delayed_ref_head *
168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
170int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
171 u64 bytenr, u64 num_bytes, u64 orig_parent,
172 u64 parent, u64 orig_ref_root, u64 ref_root,
173 u64 orig_ref_generation, u64 ref_generation,
174 u64 owner_objectid, int pin);
175int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, 169int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
176 struct btrfs_delayed_ref_head *head); 170 struct btrfs_delayed_ref_head *head);
177int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 171int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index e9103b3baa49..685f2593c4f0 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -50,7 +50,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
50 if (di) 50 if (di)
51 return ERR_PTR(-EEXIST); 51 return ERR_PTR(-EEXIST);
52 ret = btrfs_extend_item(trans, root, path, data_size); 52 ret = btrfs_extend_item(trans, root, path, data_size);
53 WARN_ON(ret > 0);
54 } 53 }
55 if (ret < 0) 54 if (ret < 0)
56 return ERR_PTR(ret); 55 return ERR_PTR(ret);
@@ -124,8 +123,9 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
124 * to use for the second index (if one is created). 123 * to use for the second index (if one is created).
125 */ 124 */
126int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root 125int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
127 *root, const char *name, int name_len, u64 dir, 126 *root, const char *name, int name_len,
128 struct btrfs_key *location, u8 type, u64 index) 127 struct inode *dir, struct btrfs_key *location,
128 u8 type, u64 index)
129{ 129{
130 int ret = 0; 130 int ret = 0;
131 int ret2 = 0; 131 int ret2 = 0;
@@ -137,13 +137,17 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
137 struct btrfs_disk_key disk_key; 137 struct btrfs_disk_key disk_key;
138 u32 data_size; 138 u32 data_size;
139 139
140 key.objectid = dir; 140 key.objectid = btrfs_ino(dir);
141 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 141 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
142 key.offset = btrfs_name_hash(name, name_len); 142 key.offset = btrfs_name_hash(name, name_len);
143 143
144 path = btrfs_alloc_path(); 144 path = btrfs_alloc_path();
145 if (!path)
146 return -ENOMEM;
145 path->leave_spinning = 1; 147 path->leave_spinning = 1;
146 148
149 btrfs_cpu_key_to_disk(&disk_key, location);
150
147 data_size = sizeof(*dir_item) + name_len; 151 data_size = sizeof(*dir_item) + name_len;
148 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 152 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
149 name, name_len); 153 name, name_len);
@@ -151,11 +155,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
151 ret = PTR_ERR(dir_item); 155 ret = PTR_ERR(dir_item);
152 if (ret == -EEXIST) 156 if (ret == -EEXIST)
153 goto second_insert; 157 goto second_insert;
154 goto out; 158 goto out_free;
155 } 159 }
156 160
157 leaf = path->nodes[0]; 161 leaf = path->nodes[0];
158 btrfs_cpu_key_to_disk(&disk_key, location);
159 btrfs_set_dir_item_key(leaf, dir_item, &disk_key); 162 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
160 btrfs_set_dir_type(leaf, dir_item, type); 163 btrfs_set_dir_type(leaf, dir_item, type);
161 btrfs_set_dir_data_len(leaf, dir_item, 0); 164 btrfs_set_dir_data_len(leaf, dir_item, 0);
@@ -170,29 +173,13 @@ second_insert:
170 /* FIXME, use some real flag for selecting the extra index */ 173 /* FIXME, use some real flag for selecting the extra index */
171 if (root == root->fs_info->tree_root) { 174 if (root == root->fs_info->tree_root) {
172 ret = 0; 175 ret = 0;
173 goto out; 176 goto out_free;
174 } 177 }
175 btrfs_release_path(root, path); 178 btrfs_release_path(path);
176 179
177 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 180 ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir,
178 key.offset = index; 181 &disk_key, type, index);
179 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 182out_free:
180 name, name_len);
181 if (IS_ERR(dir_item)) {
182 ret2 = PTR_ERR(dir_item);
183 goto out;
184 }
185 leaf = path->nodes[0];
186 btrfs_cpu_key_to_disk(&disk_key, location);
187 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
188 btrfs_set_dir_type(leaf, dir_item, type);
189 btrfs_set_dir_data_len(leaf, dir_item, 0);
190 btrfs_set_dir_name_len(leaf, dir_item, name_len);
191 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
192 name_ptr = (unsigned long)(dir_item + 1);
193 write_extent_buffer(leaf, name, name_ptr, name_len);
194 btrfs_mark_buffer_dirty(leaf);
195out:
196 btrfs_free_path(path); 183 btrfs_free_path(path);
197 if (ret) 184 if (ret)
198 return ret; 185 return ret;
@@ -377,6 +364,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
377 364
378 leaf = path->nodes[0]; 365 leaf = path->nodes[0];
379 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); 366 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
367 if (verify_dir_item(root, leaf, dir_item))
368 return NULL;
369
380 total_len = btrfs_item_size_nr(leaf, path->slots[0]); 370 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
381 while (cur < total_len) { 371 while (cur < total_len) {
382 this_len = sizeof(*dir_item) + 372 this_len = sizeof(*dir_item) +
@@ -427,5 +417,37 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
427 ret = btrfs_truncate_item(trans, root, path, 417 ret = btrfs_truncate_item(trans, root, path,
428 item_len - sub_item_len, 1); 418 item_len - sub_item_len, 1);
429 } 419 }
420 return ret;
421}
422
423int verify_dir_item(struct btrfs_root *root,
424 struct extent_buffer *leaf,
425 struct btrfs_dir_item *dir_item)
426{
427 u16 namelen = BTRFS_NAME_LEN;
428 u8 type = btrfs_dir_type(leaf, dir_item);
429
430 if (type >= BTRFS_FT_MAX) {
431 printk(KERN_CRIT "btrfs: invalid dir item type: %d\n",
432 (int)type);
433 return 1;
434 }
435
436 if (type == BTRFS_FT_XATTR)
437 namelen = XATTR_NAME_MAX;
438
439 if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
440 printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n",
441 (unsigned)btrfs_dir_name_len(leaf, dir_item));
442 return 1;
443 }
444
445 /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
446 if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) {
447 printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n",
448 (unsigned)btrfs_dir_data_len(leaf, dir_item));
449 return 1;
450 }
451
430 return 0; 452 return 0;
431} 453}
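
For reference, every directory entry lives under two keys: a BTRFS_DIR_ITEM_KEY whose offset is the name hash (for lookups), and a BTRFS_DIR_INDEX_KEY whose offset is the sequential index (for readdir order) — the second one is what now goes through btrfs_insert_delayed_dir_index() above. A toy restatement of the key construction (key type constants copied from ctree.h; the hash below is a stand-in, the kernel uses the crc-based btrfs_name_hash()):

#include <stdio.h>

typedef unsigned long long u64;

#define BTRFS_DIR_ITEM_KEY	84
#define BTRFS_DIR_INDEX_KEY	96

struct toy_key {
	u64 objectid;		/* inode number of the directory */
	unsigned char type;
	u64 offset;
};

/* stand-in for btrfs_name_hash(), which is crc32c-based in the kernel */
static u64 toy_name_hash(const char *name)
{
	u64 h = 5381;
	while (*name)
		h = h * 33 + (unsigned char)*name++;
	return h;
}

int main(void)
{
	const char *name = "foo";
	u64 dir_ino = 256, index = 2;

	struct toy_key by_hash = {
		.objectid = dir_ino,
		.type = BTRFS_DIR_ITEM_KEY,
		.offset = toy_name_hash(name),	/* lookup by name */
	};
	struct toy_key by_index = {
		.objectid = dir_ino,
		.type = BTRFS_DIR_INDEX_KEY,
		.offset = index,		/* readdir order */
	};

	printf("(%llu, %u, %llu) and (%llu, %u, %llu)\n",
	       by_hash.objectid, by_hash.type, by_hash.offset,
	       by_index.objectid, by_index.type, by_index.offset);
	return 0;
}
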
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 64f10082f048..1ac8db5dc0a3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,9 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/migrate.h>
32#include <linux/ratelimit.h>
33#include <asm/unaligned.h>
31#include "compat.h" 34#include "compat.h"
32#include "ctree.h" 35#include "ctree.h"
33#include "disk-io.h" 36#include "disk-io.h"
@@ -39,10 +42,25 @@
39#include "locking.h" 42#include "locking.h"
40#include "tree-log.h" 43#include "tree-log.h"
41#include "free-space-cache.h" 44#include "free-space-cache.h"
45#include "inode-map.h"
42 46
43static struct extent_io_ops btree_extent_io_ops; 47static struct extent_io_ops btree_extent_io_ops;
44static void end_workqueue_fn(struct btrfs_work *work); 48static void end_workqueue_fn(struct btrfs_work *work);
45static void free_fs_root(struct btrfs_root *root); 49static void free_fs_root(struct btrfs_root *root);
50static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
51 int read_only);
52static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
53static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
54static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
55 struct btrfs_root *root);
56static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
57static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
58static int btrfs_destroy_marked_extents(struct btrfs_root *root,
59 struct extent_io_tree *dirty_pages,
60 int mark);
61static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
62 struct extent_io_tree *pinned_extents);
63static int btrfs_cleanup_transaction(struct btrfs_root *root);
46 64
47/* 65/*
48 * end_io_wq structs are used to do processing in task context when an IO is 66 * end_io_wq structs are used to do processing in task context when an IO is
@@ -121,7 +139,7 @@ static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
121 * that covers the entire device 139 * that covers the entire device
122 */ 140 */
123static struct extent_map *btree_get_extent(struct inode *inode, 141static struct extent_map *btree_get_extent(struct inode *inode,
124 struct page *page, size_t page_offset, u64 start, u64 len, 142 struct page *page, size_t pg_offset, u64 start, u64 len,
125 int create) 143 int create)
126{ 144{
127 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 145 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -138,7 +156,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
138 } 156 }
139 read_unlock(&em_tree->lock); 157 read_unlock(&em_tree->lock);
140 158
141 em = alloc_extent_map(GFP_NOFS); 159 em = alloc_extent_map();
142 if (!em) { 160 if (!em) {
143 em = ERR_PTR(-ENOMEM); 161 em = ERR_PTR(-ENOMEM);
144 goto out; 162 goto out;
@@ -183,7 +201,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
183 201
184void btrfs_csum_final(u32 crc, char *result) 202void btrfs_csum_final(u32 crc, char *result)
185{ 203{
186 *(__le32 *)result = ~cpu_to_le32(crc); 204 put_unaligned_le32(~crc, result);
187} 205}
188 206
189/* 207/*
@@ -238,14 +256,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
238 memcpy(&found, result, csum_size); 256 memcpy(&found, result, csum_size);
239 257
240 read_extent_buffer(buf, &val, 0, csum_size); 258 read_extent_buffer(buf, &val, 0, csum_size);
241 if (printk_ratelimit()) { 259 printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
242 printk(KERN_INFO "btrfs: %s checksum verify "
243 "failed on %llu wanted %X found %X " 260 "failed on %llu wanted %X found %X "
244 "level %d\n", 261 "level %d\n",
245 root->fs_info->sb->s_id, 262 root->fs_info->sb->s_id,
246 (unsigned long long)buf->start, val, found, 263 (unsigned long long)buf->start, val, found,
247 btrfs_header_level(buf)); 264 btrfs_header_level(buf));
248 }
249 if (result != (char *)&inline_result) 265 if (result != (char *)&inline_result)
250 kfree(result); 266 kfree(result);
251 return 1; 267 return 1;
@@ -280,13 +296,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
280 ret = 0; 296 ret = 0;
281 goto out; 297 goto out;
282 } 298 }
283 if (printk_ratelimit()) { 299 printk_ratelimited("parent transid verify failed on %llu wanted %llu "
284 printk("parent transid verify failed on %llu wanted %llu "
285 "found %llu\n", 300 "found %llu\n",
286 (unsigned long long)eb->start, 301 (unsigned long long)eb->start,
287 (unsigned long long)parent_transid, 302 (unsigned long long)parent_transid,
288 (unsigned long long)btrfs_header_generation(eb)); 303 (unsigned long long)btrfs_header_generation(eb));
289 }
290 ret = 1; 304 ret = 1;
291 clear_extent_buffer_uptodate(io_tree, eb, &cached_state); 305 clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
292out: 306out:
@@ -308,6 +322,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
308 int num_copies = 0; 322 int num_copies = 0;
309 int mirror_num = 0; 323 int mirror_num = 0;
310 324
325 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
311 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 326 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
312 while (1) { 327 while (1) {
313 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 328 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -316,6 +331,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
316 !verify_parent_transid(io_tree, eb, parent_transid)) 331 !verify_parent_transid(io_tree, eb, parent_transid))
317 return ret; 332 return ret;
318 333
334 /*
335 * This buffer's crc is fine, but its contents are corrupted, so
336 * there is no reason to read the other copies; they won't be
337 * any less wrong.
338 */
339 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
340 return ret;
341
319 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 342 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
320 eb->start, eb->len); 343 eb->start, eb->len);
321 if (num_copies == 1) 344 if (num_copies == 1)
@@ -338,24 +361,33 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
338 struct extent_io_tree *tree; 361 struct extent_io_tree *tree;
339 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 362 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
340 u64 found_start; 363 u64 found_start;
341 int found_level;
342 unsigned long len; 364 unsigned long len;
343 struct extent_buffer *eb; 365 struct extent_buffer *eb;
344 int ret; 366 int ret;
345 367
346 tree = &BTRFS_I(page->mapping->host)->io_tree; 368 tree = &BTRFS_I(page->mapping->host)->io_tree;
347 369
348 if (page->private == EXTENT_PAGE_PRIVATE) 370 if (page->private == EXTENT_PAGE_PRIVATE) {
371 WARN_ON(1);
349 goto out; 372 goto out;
350 if (!page->private) 373 }
374 if (!page->private) {
375 WARN_ON(1);
351 goto out; 376 goto out;
377 }
352 len = page->private >> 2; 378 len = page->private >> 2;
353 WARN_ON(len == 0); 379 WARN_ON(len == 0);
354 380
355 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 381 eb = alloc_extent_buffer(tree, start, len, page);
382 if (eb == NULL) {
383 WARN_ON(1);
384 goto out;
385 }
356 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, 386 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
357 btrfs_header_generation(eb)); 387 btrfs_header_generation(eb));
358 BUG_ON(ret); 388 BUG_ON(ret);
389 WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
390
359 found_start = btrfs_header_bytenr(eb); 391 found_start = btrfs_header_bytenr(eb);
360 if (found_start != start) { 392 if (found_start != start) {
361 WARN_ON(1); 393 WARN_ON(1);
@@ -369,8 +401,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
369 WARN_ON(1); 401 WARN_ON(1);
370 goto err; 402 goto err;
371 } 403 }
372 found_level = btrfs_header_level(eb);
373
374 csum_tree_block(root, eb, 0); 404 csum_tree_block(root, eb, 0);
375err: 405err:
376 free_extent_buffer(eb); 406 free_extent_buffer(eb);
@@ -397,6 +427,73 @@ static int check_tree_block_fsid(struct btrfs_root *root,
397 return ret; 427 return ret;
398} 428}
399 429
430#define CORRUPT(reason, eb, root, slot) \
431 printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu, " \
432 "root=%llu, slot=%d\n", reason, \
433 (unsigned long long)btrfs_header_bytenr(eb), \
434 (unsigned long long)root->objectid, slot)
435
436static noinline int check_leaf(struct btrfs_root *root,
437 struct extent_buffer *leaf)
438{
439 struct btrfs_key key;
440 struct btrfs_key leaf_key;
441 u32 nritems = btrfs_header_nritems(leaf);
442 int slot;
443
444 if (nritems == 0)
445 return 0;
446
447 /* Check the 0 item */
448 if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
449 BTRFS_LEAF_DATA_SIZE(root)) {
450 CORRUPT("invalid item offset size pair", leaf, root, 0);
451 return -EIO;
452 }
453
454 /*
455 * Check to make sure each items keys are in the correct order and their
456 * offsets make sense. We only have to loop through nritems-1 because
457 * we check the current slot against the next slot, which verifies the
458 * next slot's offset+size makes sense and that the current's slot
459 * offset is correct.
460 */
461 for (slot = 0; slot < nritems - 1; slot++) {
462 btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
463 btrfs_item_key_to_cpu(leaf, &key, slot + 1);
464
465 /* Make sure the keys are in the right order */
466 if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
467 CORRUPT("bad key order", leaf, root, slot);
468 return -EIO;
469 }
470
471 /*
472 * Make sure the offset and ends are right, remember that the
473 * item data starts at the end of the leaf and grows towards the
474 * front.
475 */
476 if (btrfs_item_offset_nr(leaf, slot) !=
477 btrfs_item_end_nr(leaf, slot + 1)) {
478 CORRUPT("slot offset bad", leaf, root, slot);
479 return -EIO;
480 }
481
482 /*
483 * Check to make sure that we don't point outside of the leaf,
484 * just in case all the items are consistent with each other, but
485 * all point outside of the leaf.
486 */
487 if (btrfs_item_end_nr(leaf, slot) >
488 BTRFS_LEAF_DATA_SIZE(root)) {
489 CORRUPT("slot end outside of leaf", leaf, root, slot);
490 return -EIO;
491 }
492 }
493
494 return 0;
495}
496
400#ifdef CONFIG_DEBUG_LOCK_ALLOC 497#ifdef CONFIG_DEBUG_LOCK_ALLOC
401void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) 498void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
402{ 499{
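
check_leaf()'s two offset rules are easiest to see with concrete numbers. A toy model follows, assuming a 4096-byte leaf data area (the real bound is BTRFS_LEAF_DATA_SIZE(root), and btrfs_item_end_nr() is offset + size):

#include <stdio.h>
#include <assert.h>

#define LEAF_DATA_SIZE 4096

struct item { unsigned offset, size; };

int main(void)
{
	/*
	 * Item data is packed at the tail of the leaf and grows toward
	 * the front, so slot 0 owns the bytes ending at LEAF_DATA_SIZE.
	 */
	struct item items[] = {
		{ 4096 - 100, 100 },	/* slot 0: [3996, 4096) */
		{ 4096 - 150,  50 },	/* slot 1: [3946, 3996) */
		{ 4096 - 170,  20 },	/* slot 2: [3926, 3946) */
	};
	int nritems = 3;

	/* rule 1: slot 0 must end exactly at the leaf data size */
	assert(items[0].offset + items[0].size == LEAF_DATA_SIZE);

	/* rule 2: each slot must start exactly where the next slot ends */
	for (int slot = 0; slot < nritems - 1; slot++)
		assert(items[slot].offset ==
		       items[slot + 1].offset + items[slot + 1].size);

	puts("toy leaf layout is consistent");
	return 0;
}
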
@@ -426,16 +523,18 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
426 len = page->private >> 2; 523 len = page->private >> 2;
427 WARN_ON(len == 0); 524 WARN_ON(len == 0);
428 525
429 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 526 eb = alloc_extent_buffer(tree, start, len, page);
527 if (eb == NULL) {
528 ret = -EIO;
529 goto out;
530 }
430 531
431 found_start = btrfs_header_bytenr(eb); 532 found_start = btrfs_header_bytenr(eb);
432 if (found_start != start) { 533 if (found_start != start) {
433 if (printk_ratelimit()) { 534 printk_ratelimited(KERN_INFO "btrfs bad tree block start "
434 printk(KERN_INFO "btrfs bad tree block start "
435 "%llu %llu\n", 535 "%llu %llu\n",
436 (unsigned long long)found_start, 536 (unsigned long long)found_start,
437 (unsigned long long)eb->start); 537 (unsigned long long)eb->start);
438 }
439 ret = -EIO; 538 ret = -EIO;
440 goto err; 539 goto err;
441 } 540 }
@@ -447,10 +546,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
447 goto err; 546 goto err;
448 } 547 }
449 if (check_tree_block_fsid(root, eb)) { 548 if (check_tree_block_fsid(root, eb)) {
450 if (printk_ratelimit()) { 549 printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
451 printk(KERN_INFO "btrfs bad fsid on block %llu\n",
452 (unsigned long long)eb->start); 550 (unsigned long long)eb->start);
453 }
454 ret = -EIO; 551 ret = -EIO;
455 goto err; 552 goto err;
456 } 553 }
@@ -459,8 +556,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
459 btrfs_set_buffer_lockdep_class(eb, found_level); 556 btrfs_set_buffer_lockdep_class(eb, found_level);
460 557
461 ret = csum_tree_block(root, eb, 1); 558 ret = csum_tree_block(root, eb, 1);
462 if (ret) 559 if (ret) {
463 ret = -EIO; 560 ret = -EIO;
561 goto err;
562 }
563
564 /*
565 * If this is a leaf block and it is corrupt, set the corrupt bit so
566 * that we don't try to read the other copies of this block, just
567 * return -EIO.
568 */
569 if (found_level == 0 && check_leaf(root, eb)) {
570 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
571 ret = -EIO;
572 }
464 573
465 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 574 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
466 end = eb->start + end - 1; 575 end = eb->start + end - 1;
@@ -481,9 +590,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
481 end_io_wq->work.flags = 0; 590 end_io_wq->work.flags = 0;
482 591
483 if (bio->bi_rw & REQ_WRITE) { 592 if (bio->bi_rw & REQ_WRITE) {
484 if (end_io_wq->metadata) 593 if (end_io_wq->metadata == 1)
485 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 594 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
486 &end_io_wq->work); 595 &end_io_wq->work);
596 else if (end_io_wq->metadata == 2)
597 btrfs_queue_worker(&fs_info->endio_freespace_worker,
598 &end_io_wq->work);
487 else 599 else
488 btrfs_queue_worker(&fs_info->endio_write_workers, 600 btrfs_queue_worker(&fs_info->endio_write_workers,
489 &end_io_wq->work); 601 &end_io_wq->work);
@@ -497,6 +609,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
497 } 609 }
498} 610}
499 611
612/*
613 * For the metadata arg you want
614 *
615 * 0 - if data
616 * 1 - if normal metadata
617 * 2 - if writing to the free space cache area
618 */
500int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 619int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
501 int metadata) 620 int metadata)
502{ 621{
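
A hedged sketch of a submit path choosing the metadata class documented above; only btrfs_bio_wq_end_io() and its 0/1/2 convention come from this patch, the surrounding helper and flags are invented for illustration:

/* sketch, in-tree context assumed; not a real btrfs function */
static int example_submit(struct btrfs_fs_info *info, struct bio *bio,
			  int is_metadata, int is_free_space_cache)
{
	int metadata = 0;		/* 0 - plain data */

	if (is_metadata)
		metadata = 1;		/* 1 - normal metadata */
	if (is_free_space_cache)
		metadata = 2;		/* 2 - free space cache writeout */

	/* route this bio's completion to the matching end_io worker pool */
	return btrfs_bio_wq_end_io(info, bio, metadata);
}
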
@@ -525,19 +644,11 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
525 return 256 * limit; 644 return 256 * limit;
526} 645}
527 646
528int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
529{
530 return atomic_read(&info->nr_async_bios) >
531 btrfs_async_submit_limit(info);
532}
533
534static void run_one_async_start(struct btrfs_work *work) 647static void run_one_async_start(struct btrfs_work *work)
535{ 648{
536 struct btrfs_fs_info *fs_info;
537 struct async_submit_bio *async; 649 struct async_submit_bio *async;
538 650
539 async = container_of(work, struct async_submit_bio, work); 651 async = container_of(work, struct async_submit_bio, work);
540 fs_info = BTRFS_I(async->inode)->root->fs_info;
541 async->submit_bio_start(async->inode, async->rw, async->bio, 652 async->submit_bio_start(async->inode, async->rw, async->bio,
542 async->mirror_num, async->bio_flags, 653 async->mirror_num, async->bio_flags,
543 async->bio_offset); 654 async->bio_offset);
@@ -688,6 +799,27 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
688 __btree_submit_bio_done); 799 __btree_submit_bio_done);
689} 800}
690 801
802#ifdef CONFIG_MIGRATION
803static int btree_migratepage(struct address_space *mapping,
804 struct page *newpage, struct page *page)
805{
806 /*
807 * we can't safely write a btree page from here;
808 * we haven't done the locking hook
809 */
810 if (PageDirty(page))
811 return -EAGAIN;
812 /*
813 * Buffers may be managed in a filesystem specific way.
814 * We must have no buffers or drop them.
815 */
816 if (page_has_private(page) &&
817 !try_to_release_page(page, GFP_KERNEL))
818 return -EAGAIN;
819 return migrate_page(mapping, newpage, page);
820}
821#endif
822
691static int btree_writepage(struct page *page, struct writeback_control *wbc) 823static int btree_writepage(struct page *page, struct writeback_control *wbc)
692{ 824{
693 struct extent_io_tree *tree; 825 struct extent_io_tree *tree;
@@ -702,8 +834,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
702 } 834 }
703 835
704 redirty_page_for_writepage(wbc, page); 836 redirty_page_for_writepage(wbc, page);
705 eb = btrfs_find_tree_block(root, page_offset(page), 837 eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
706 PAGE_CACHE_SIZE);
707 WARN_ON(!eb); 838 WARN_ON(!eb);
708 839
709 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 840 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
@@ -793,7 +924,9 @@ static const struct address_space_operations btree_aops = {
793 .writepages = btree_writepages, 924 .writepages = btree_writepages,
794 .releasepage = btree_releasepage, 925 .releasepage = btree_releasepage,
795 .invalidatepage = btree_invalidatepage, 926 .invalidatepage = btree_invalidatepage,
796 .sync_page = block_sync_page, 927#ifdef CONFIG_MIGRATION
928 .migratepage = btree_migratepage,
929#endif
797}; 930};
798 931
799int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 932int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -818,7 +951,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
818 struct inode *btree_inode = root->fs_info->btree_inode; 951 struct inode *btree_inode = root->fs_info->btree_inode;
819 struct extent_buffer *eb; 952 struct extent_buffer *eb;
820 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, 953 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
821 bytenr, blocksize, GFP_NOFS); 954 bytenr, blocksize);
822 return eb; 955 return eb;
823} 956}
824 957
@@ -829,7 +962,7 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
829 struct extent_buffer *eb; 962 struct extent_buffer *eb;
830 963
831 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, 964 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
832 bytenr, blocksize, NULL, GFP_NOFS); 965 bytenr, blocksize, NULL);
833 return eb; 966 return eb;
834} 967}
835 968
@@ -850,12 +983,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
850 u32 blocksize, u64 parent_transid) 983 u32 blocksize, u64 parent_transid)
851{ 984{
852 struct extent_buffer *buf = NULL; 985 struct extent_buffer *buf = NULL;
853 struct inode *btree_inode = root->fs_info->btree_inode;
854 struct extent_io_tree *io_tree;
855 int ret; 986 int ret;
856 987
857 io_tree = &BTRFS_I(btree_inode)->io_tree;
858
859 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 988 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
860 if (!buf) 989 if (!buf)
861 return NULL; 990 return NULL;
@@ -915,15 +1044,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
915 root->last_trans = 0; 1044 root->last_trans = 0;
916 root->highest_objectid = 0; 1045 root->highest_objectid = 0;
917 root->name = NULL; 1046 root->name = NULL;
918 root->in_sysfs = 0;
919 root->inode_tree = RB_ROOT; 1047 root->inode_tree = RB_ROOT;
1048 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
920 root->block_rsv = NULL; 1049 root->block_rsv = NULL;
921 root->orphan_block_rsv = NULL; 1050 root->orphan_block_rsv = NULL;
922 1051
923 INIT_LIST_HEAD(&root->dirty_list); 1052 INIT_LIST_HEAD(&root->dirty_list);
924 INIT_LIST_HEAD(&root->orphan_list); 1053 INIT_LIST_HEAD(&root->orphan_list);
925 INIT_LIST_HEAD(&root->root_list); 1054 INIT_LIST_HEAD(&root->root_list);
926 spin_lock_init(&root->node_lock);
927 spin_lock_init(&root->orphan_lock); 1055 spin_lock_init(&root->orphan_lock);
928 spin_lock_init(&root->inode_lock); 1056 spin_lock_init(&root->inode_lock);
929 spin_lock_init(&root->accounting_lock); 1057 spin_lock_init(&root->accounting_lock);
@@ -939,7 +1067,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
939 root->log_transid = 0; 1067 root->log_transid = 0;
940 root->last_log_commit = 0; 1068 root->last_log_commit = 0;
941 extent_io_tree_init(&root->dirty_log_pages, 1069 extent_io_tree_init(&root->dirty_log_pages,
942 fs_info->btree_inode->i_mapping, GFP_NOFS); 1070 fs_info->btree_inode->i_mapping);
943 1071
944 memset(&root->root_key, 0, sizeof(root->root_key)); 1072 memset(&root->root_key, 0, sizeof(root->root_key));
945 memset(&root->root_item, 0, sizeof(root->root_item)); 1073 memset(&root->root_item, 0, sizeof(root->root_item));
@@ -980,7 +1108,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
980 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1108 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
981 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1109 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
982 blocksize, generation); 1110 blocksize, generation);
983 BUG_ON(!root->node); 1111 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1112 free_extent_buffer(root->node);
1113 return -EIO;
1114 }
984 root->commit_root = btrfs_root_node(root); 1115 root->commit_root = btrfs_root_node(root);
985 return 0; 1116 return 0;
986} 1117}
@@ -1104,7 +1235,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1104 root, fs_info, location->objectid); 1235 root, fs_info, location->objectid);
1105 1236
1106 path = btrfs_alloc_path(); 1237 path = btrfs_alloc_path();
1107 BUG_ON(!path); 1238 if (!path) {
1239 kfree(root);
1240 return ERR_PTR(-ENOMEM);
1241 }
1108 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); 1242 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1109 if (ret == 0) { 1243 if (ret == 0) {
1110 l = path->nodes[0]; 1244 l = path->nodes[0];
@@ -1115,6 +1249,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1115 } 1249 }
1116 btrfs_free_path(path); 1250 btrfs_free_path(path);
1117 if (ret) { 1251 if (ret) {
1252 kfree(root);
1118 if (ret > 0) 1253 if (ret > 0)
1119 ret = -ENOENT; 1254 ret = -ENOENT;
1120 return ERR_PTR(ret); 1255 return ERR_PTR(ret);
@@ -1127,27 +1262,14 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1127 root->commit_root = btrfs_root_node(root); 1262 root->commit_root = btrfs_root_node(root);
1128 BUG_ON(!root->node); 1263 BUG_ON(!root->node);
1129out: 1264out:
1130 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) 1265 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1131 root->ref_cows = 1; 1266 root->ref_cows = 1;
1267 btrfs_check_and_init_root_item(&root->root_item);
1268 }
1132 1269
1133 return root; 1270 return root;
1134} 1271}
1135 1272
1136struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1137 u64 root_objectid)
1138{
1139 struct btrfs_root *root;
1140
1141 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
1142 return fs_info->tree_root;
1143 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
1144 return fs_info->extent_root;
1145
1146 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1147 (unsigned long)root_objectid);
1148 return root;
1149}
1150
1151struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 1273struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1152 struct btrfs_key *location) 1274 struct btrfs_key *location)
1153{ 1275{
@@ -1176,7 +1298,22 @@ again:
1176 if (IS_ERR(root)) 1298 if (IS_ERR(root))
1177 return root; 1299 return root;
1178 1300
1179 set_anon_super(&root->anon_super, NULL); 1301 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1302 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1303 GFP_NOFS);
1304 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1305 ret = -ENOMEM;
1306 goto fail;
1307 }
1308
1309 btrfs_init_free_ino_ctl(root);
1310 mutex_init(&root->fs_commit_mutex);
1311 spin_lock_init(&root->cache_lock);
1312 init_waitqueue_head(&root->cache_wait);
1313
1314 ret = set_anon_super(&root->anon_super, NULL);
1315 if (ret)
1316 goto fail;
1180 1317
1181 if (btrfs_root_refs(&root->root_item) == 0) { 1318 if (btrfs_root_refs(&root->root_item) == 0) {
1182 ret = -ENOENT; 1319 ret = -ENOENT;
@@ -1219,41 +1356,6 @@ fail:
1219 return ERR_PTR(ret); 1356 return ERR_PTR(ret);
1220} 1357}
1221 1358
1222struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1223 struct btrfs_key *location,
1224 const char *name, int namelen)
1225{
1226 return btrfs_read_fs_root_no_name(fs_info, location);
1227#if 0
1228 struct btrfs_root *root;
1229 int ret;
1230
1231 root = btrfs_read_fs_root_no_name(fs_info, location);
1232 if (!root)
1233 return NULL;
1234
1235 if (root->in_sysfs)
1236 return root;
1237
1238 ret = btrfs_set_root_name(root, name, namelen);
1239 if (ret) {
1240 free_extent_buffer(root->node);
1241 kfree(root);
1242 return ERR_PTR(ret);
1243 }
1244
1245 ret = btrfs_sysfs_add_root(root);
1246 if (ret) {
1247 free_extent_buffer(root->node);
1248 kfree(root->name);
1249 kfree(root);
1250 return ERR_PTR(ret);
1251 }
1252 root->in_sysfs = 1;
1253 return root;
1254#endif
1255}
1256
1257static int btrfs_congested_fn(void *congested_data, int bdi_bits) 1359static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1258{ 1360{
1259 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; 1361 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
@@ -1261,7 +1363,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1261 struct btrfs_device *device; 1363 struct btrfs_device *device;
1262 struct backing_dev_info *bdi; 1364 struct backing_dev_info *bdi;
1263 1365
1264 list_for_each_entry(device, &info->fs_devices->devices, dev_list) { 1366 rcu_read_lock();
1367 list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
1265 if (!device->bdev) 1368 if (!device->bdev)
1266 continue; 1369 continue;
1267 bdi = blk_get_backing_dev_info(device->bdev); 1370 bdi = blk_get_backing_dev_info(device->bdev);
@@ -1270,86 +1373,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1270 break; 1373 break;
1271 } 1374 }
1272 } 1375 }
1376 rcu_read_unlock();
1273 return ret; 1377 return ret;
1274} 1378}
1275 1379
1276/* 1380/*
1277 * this unplugs every device on the box, and it is only used when page
1278 * is null
1279 */
1280static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1281{
1282 struct btrfs_device *device;
1283 struct btrfs_fs_info *info;
1284
1285 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1286 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1287 if (!device->bdev)
1288 continue;
1289
1290 bdi = blk_get_backing_dev_info(device->bdev);
1291 if (bdi->unplug_io_fn)
1292 bdi->unplug_io_fn(bdi, page);
1293 }
1294}
1295
1296static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1297{
1298 struct inode *inode;
1299 struct extent_map_tree *em_tree;
1300 struct extent_map *em;
1301 struct address_space *mapping;
1302 u64 offset;
1303
1304 /* the generic O_DIRECT read code does this */
1305 if (1 || !page) {
1306 __unplug_io_fn(bdi, page);
1307 return;
1308 }
1309
1310 /*
1311 * page->mapping may change at any time. Get a consistent copy
1312 * and use that for everything below
1313 */
1314 smp_mb();
1315 mapping = page->mapping;
1316 if (!mapping)
1317 return;
1318
1319 inode = mapping->host;
1320
1321 /*
1322 * don't do the expensive searching for a small number of
1323 * devices
1324 */
1325 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1326 __unplug_io_fn(bdi, page);
1327 return;
1328 }
1329
1330 offset = page_offset(page);
1331
1332 em_tree = &BTRFS_I(inode)->extent_tree;
1333 read_lock(&em_tree->lock);
1334 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1335 read_unlock(&em_tree->lock);
1336 if (!em) {
1337 __unplug_io_fn(bdi, page);
1338 return;
1339 }
1340
1341 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1342 free_extent_map(em);
1343 __unplug_io_fn(bdi, page);
1344 return;
1345 }
1346 offset = offset - em->start;
1347 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1348 em->block_start + offset, page);
1349 free_extent_map(em);
1350}
1351
1352/*
1353 * If this fails, caller must call bdi_destroy() to get rid of the 1381 * If this fails, caller must call bdi_destroy() to get rid of the
1354 * bdi again. 1382 * bdi again.
1355 */ 1383 */
@@ -1363,8 +1391,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1363 return err; 1391 return err;
1364 1392
1365 bdi->ra_pages = default_backing_dev_info.ra_pages; 1393 bdi->ra_pages = default_backing_dev_info.ra_pages;
1366 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1367 bdi->unplug_io_data = info;
1368 bdi->congested_fn = btrfs_congested_fn; 1394 bdi->congested_fn = btrfs_congested_fn;
1369 bdi->congested_data = info; 1395 bdi->congested_data = info;
1370 return 0; 1396 return 0;
@@ -1377,7 +1403,6 @@ static int bio_ready_for_csum(struct bio *bio)
1377 u64 start = 0; 1403 u64 start = 0;
1378 struct page *page; 1404 struct page *page;
1379 struct extent_io_tree *io_tree = NULL; 1405 struct extent_io_tree *io_tree = NULL;
1380 struct btrfs_fs_info *info = NULL;
1381 struct bio_vec *bvec; 1406 struct bio_vec *bvec;
1382 int i; 1407 int i;
1383 int ret; 1408 int ret;
@@ -1396,7 +1421,6 @@ static int bio_ready_for_csum(struct bio *bio)
1396 buf_len = page->private >> 2; 1421 buf_len = page->private >> 2;
1397 start = page_offset(page) + bvec->bv_offset; 1422 start = page_offset(page) + bvec->bv_offset;
1398 io_tree = &BTRFS_I(page->mapping->host)->io_tree; 1423 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1399 info = BTRFS_I(page->mapping->host)->root->fs_info;
1400 } 1424 }
1401 /* are we fully contained in this bio? */ 1425 /* are we fully contained in this bio? */
1402 if (buf_len <= length) 1426 if (buf_len <= length)
@@ -1452,6 +1476,7 @@ static int cleaner_kthread(void *arg)
1452 btrfs_run_delayed_iputs(root); 1476 btrfs_run_delayed_iputs(root);
1453 btrfs_clean_old_snapshots(root); 1477 btrfs_clean_old_snapshots(root);
1454 mutex_unlock(&root->fs_info->cleaner_mutex); 1478 mutex_unlock(&root->fs_info->cleaner_mutex);
1479 btrfs_run_defrag_inodes(root->fs_info);
1455 } 1480 }
1456 1481
1457 if (freezing(current)) { 1482 if (freezing(current)) {
@@ -1481,24 +1506,25 @@ static int transaction_kthread(void *arg)
1481 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1506 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1482 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1507 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1483 1508
1484 spin_lock(&root->fs_info->new_trans_lock); 1509 spin_lock(&root->fs_info->trans_lock);
1485 cur = root->fs_info->running_transaction; 1510 cur = root->fs_info->running_transaction;
1486 if (!cur) { 1511 if (!cur) {
1487 spin_unlock(&root->fs_info->new_trans_lock); 1512 spin_unlock(&root->fs_info->trans_lock);
1488 goto sleep; 1513 goto sleep;
1489 } 1514 }
1490 1515
1491 now = get_seconds(); 1516 now = get_seconds();
1492 if (!cur->blocked && 1517 if (!cur->blocked &&
1493 (now < cur->start_time || now - cur->start_time < 30)) { 1518 (now < cur->start_time || now - cur->start_time < 30)) {
1494 spin_unlock(&root->fs_info->new_trans_lock); 1519 spin_unlock(&root->fs_info->trans_lock);
1495 delay = HZ * 5; 1520 delay = HZ * 5;
1496 goto sleep; 1521 goto sleep;
1497 } 1522 }
1498 transid = cur->transid; 1523 transid = cur->transid;
1499 spin_unlock(&root->fs_info->new_trans_lock); 1524 spin_unlock(&root->fs_info->trans_lock);
1500 1525
1501 trans = btrfs_join_transaction(root, 1); 1526 trans = btrfs_join_transaction(root);
1527 BUG_ON(IS_ERR(trans));
1502 if (transid == trans->transid) { 1528 if (transid == trans->transid) {
1503 ret = btrfs_commit_transaction(trans, root); 1529 ret = btrfs_commit_transaction(trans, root);
1504 BUG_ON(ret); 1530 BUG_ON(ret);
@@ -1539,10 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1539 GFP_NOFS); 1565 GFP_NOFS);
1540 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), 1566 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1541 GFP_NOFS); 1567 GFP_NOFS);
1542 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), 1568 struct btrfs_root *tree_root = btrfs_sb(sb);
1543 GFP_NOFS); 1569 struct btrfs_fs_info *fs_info = NULL;
1544 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1545 GFP_NOFS);
1546 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1570 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1547 GFP_NOFS); 1571 GFP_NOFS);
1548 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1572 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
@@ -1554,11 +1578,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1554 1578
1555 struct btrfs_super_block *disk_super; 1579 struct btrfs_super_block *disk_super;
1556 1580
1557 if (!extent_root || !tree_root || !fs_info || 1581 if (!extent_root || !tree_root || !tree_root->fs_info ||
1558 !chunk_root || !dev_root || !csum_root) { 1582 !chunk_root || !dev_root || !csum_root) {
1559 err = -ENOMEM; 1583 err = -ENOMEM;
1560 goto fail; 1584 goto fail;
1561 } 1585 }
1586 fs_info = tree_root->fs_info;
1562 1587
1563 ret = init_srcu_struct(&fs_info->subvol_srcu); 1588 ret = init_srcu_struct(&fs_info->subvol_srcu);
1564 if (ret) { 1589 if (ret) {
@@ -1578,6 +1603,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1578 goto fail_bdi; 1603 goto fail_bdi;
1579 } 1604 }
1580 1605
1606 fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
1607
1581 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 1608 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1582 INIT_LIST_HEAD(&fs_info->trans_list); 1609 INIT_LIST_HEAD(&fs_info->trans_list);
1583 INIT_LIST_HEAD(&fs_info->dead_roots); 1610 INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -1587,10 +1614,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1587 INIT_LIST_HEAD(&fs_info->ordered_operations); 1614 INIT_LIST_HEAD(&fs_info->ordered_operations);
1588 INIT_LIST_HEAD(&fs_info->caching_block_groups); 1615 INIT_LIST_HEAD(&fs_info->caching_block_groups);
1589 spin_lock_init(&fs_info->delalloc_lock); 1616 spin_lock_init(&fs_info->delalloc_lock);
1590 spin_lock_init(&fs_info->new_trans_lock); 1617 spin_lock_init(&fs_info->trans_lock);
1591 spin_lock_init(&fs_info->ref_cache_lock); 1618 spin_lock_init(&fs_info->ref_cache_lock);
1592 spin_lock_init(&fs_info->fs_roots_radix_lock); 1619 spin_lock_init(&fs_info->fs_roots_radix_lock);
1593 spin_lock_init(&fs_info->delayed_iput_lock); 1620 spin_lock_init(&fs_info->delayed_iput_lock);
1621 spin_lock_init(&fs_info->defrag_inodes_lock);
1622 mutex_init(&fs_info->reloc_mutex);
1594 1623
1595 init_completion(&fs_info->kobj_unregister); 1624 init_completion(&fs_info->kobj_unregister);
1596 fs_info->tree_root = tree_root; 1625 fs_info->tree_root = tree_root;
@@ -1613,15 +1642,34 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1613 atomic_set(&fs_info->async_delalloc_pages, 0); 1642 atomic_set(&fs_info->async_delalloc_pages, 0);
1614 atomic_set(&fs_info->async_submit_draining, 0); 1643 atomic_set(&fs_info->async_submit_draining, 0);
1615 atomic_set(&fs_info->nr_async_bios, 0); 1644 atomic_set(&fs_info->nr_async_bios, 0);
1645 atomic_set(&fs_info->defrag_running, 0);
1616 fs_info->sb = sb; 1646 fs_info->sb = sb;
1617 fs_info->max_inline = 8192 * 1024; 1647 fs_info->max_inline = 8192 * 1024;
1618 fs_info->metadata_ratio = 0; 1648 fs_info->metadata_ratio = 0;
1649 fs_info->defrag_inodes = RB_ROOT;
1650 fs_info->trans_no_join = 0;
1619 1651
1620 fs_info->thread_pool_size = min_t(unsigned long, 1652 fs_info->thread_pool_size = min_t(unsigned long,
1621 num_online_cpus() + 2, 8); 1653 num_online_cpus() + 2, 8);
1622 1654
1623 INIT_LIST_HEAD(&fs_info->ordered_extents); 1655 INIT_LIST_HEAD(&fs_info->ordered_extents);
1624 spin_lock_init(&fs_info->ordered_extent_lock); 1656 spin_lock_init(&fs_info->ordered_extent_lock);
1657 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
1658 GFP_NOFS);
1659 if (!fs_info->delayed_root) {
1660 err = -ENOMEM;
1661 goto fail_iput;
1662 }
1663 btrfs_init_delayed_root(fs_info->delayed_root);
1664
1665 mutex_init(&fs_info->scrub_lock);
1666 atomic_set(&fs_info->scrubs_running, 0);
1667 atomic_set(&fs_info->scrub_pause_req, 0);
1668 atomic_set(&fs_info->scrubs_paused, 0);
1669 atomic_set(&fs_info->scrub_cancel_req, 0);
1670 init_waitqueue_head(&fs_info->scrub_pause_wait);
1671 init_rwsem(&fs_info->scrub_super_lock);
1672 fs_info->scrub_workers_refcnt = 0;
1625 1673
1626 sb->s_blocksize = 4096; 1674 sb->s_blocksize = 4096;
1627 sb->s_blocksize_bits = blksize_bits(4096); 1675 sb->s_blocksize_bits = blksize_bits(4096);
@@ -1640,10 +1688,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1640 1688
1641 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); 1689 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
1642 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, 1690 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1643 fs_info->btree_inode->i_mapping, 1691 fs_info->btree_inode->i_mapping);
1644 GFP_NOFS); 1692 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
1645 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1646 GFP_NOFS);
1647 1693
1648 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; 1694 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1649 1695
@@ -1657,14 +1703,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1657 fs_info->block_group_cache_tree = RB_ROOT; 1703 fs_info->block_group_cache_tree = RB_ROOT;
1658 1704
1659 extent_io_tree_init(&fs_info->freed_extents[0], 1705 extent_io_tree_init(&fs_info->freed_extents[0],
1660 fs_info->btree_inode->i_mapping, GFP_NOFS); 1706 fs_info->btree_inode->i_mapping);
1661 extent_io_tree_init(&fs_info->freed_extents[1], 1707 extent_io_tree_init(&fs_info->freed_extents[1],
1662 fs_info->btree_inode->i_mapping, GFP_NOFS); 1708 fs_info->btree_inode->i_mapping);
1663 fs_info->pinned_extents = &fs_info->freed_extents[0]; 1709 fs_info->pinned_extents = &fs_info->freed_extents[0];
1664 fs_info->do_barriers = 1; 1710 fs_info->do_barriers = 1;
1665 1711
1666 1712
1667 mutex_init(&fs_info->trans_mutex);
1668 mutex_init(&fs_info->ordered_operations_mutex); 1713 mutex_init(&fs_info->ordered_operations_mutex);
1669 mutex_init(&fs_info->tree_log_mutex); 1714 mutex_init(&fs_info->tree_log_mutex);
1670 mutex_init(&fs_info->chunk_mutex); 1715 mutex_init(&fs_info->chunk_mutex);
@@ -1680,15 +1725,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1680 1725
1681 init_waitqueue_head(&fs_info->transaction_throttle); 1726 init_waitqueue_head(&fs_info->transaction_throttle);
1682 init_waitqueue_head(&fs_info->transaction_wait); 1727 init_waitqueue_head(&fs_info->transaction_wait);
1728 init_waitqueue_head(&fs_info->transaction_blocked_wait);
1683 init_waitqueue_head(&fs_info->async_submit_wait); 1729 init_waitqueue_head(&fs_info->async_submit_wait);
1684 1730
1685 __setup_root(4096, 4096, 4096, 4096, tree_root, 1731 __setup_root(4096, 4096, 4096, 4096, tree_root,
1686 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1732 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1687 1733
1688
1689 bh = btrfs_read_dev_super(fs_devices->latest_bdev); 1734 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1690 if (!bh) 1735 if (!bh) {
1691 goto fail_iput; 1736 err = -EINVAL;
1737 goto fail_alloc;
1738 }
1692 1739
1693 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 1740 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1694 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 1741 memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1699,12 +1746,23 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1699 1746
1700 disk_super = &fs_info->super_copy; 1747 disk_super = &fs_info->super_copy;
1701 if (!btrfs_super_root(disk_super)) 1748 if (!btrfs_super_root(disk_super))
1702 goto fail_iput; 1749 goto fail_alloc;
1750
1751 /* check FS state, whether FS is broken. */
1752 fs_info->fs_state |= btrfs_super_flags(disk_super);
1753
1754 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1755
1756 /*
1757 * In the long term, we'll store the compression type in the super
1758 * block, and it'll be used for per file compression control.
1759 */
1760 fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
1703 1761
1704 ret = btrfs_parse_options(tree_root, options); 1762 ret = btrfs_parse_options(tree_root, options);
1705 if (ret) { 1763 if (ret) {
1706 err = ret; 1764 err = ret;
1707 goto fail_iput; 1765 goto fail_alloc;
1708 } 1766 }
1709 1767
1710 features = btrfs_super_incompat_flags(disk_super) & 1768 features = btrfs_super_incompat_flags(disk_super) &
@@ -1714,14 +1772,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1714 "unsupported optional features (%Lx).\n", 1772 "unsupported optional features (%Lx).\n",
1715 (unsigned long long)features); 1773 (unsigned long long)features);
1716 err = -EINVAL; 1774 err = -EINVAL;
1717 goto fail_iput; 1775 goto fail_alloc;
1718 } 1776 }
1719 1777
1720 features = btrfs_super_incompat_flags(disk_super); 1778 features = btrfs_super_incompat_flags(disk_super);
1721 if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { 1779 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
1722 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; 1780 if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
1723 btrfs_set_super_incompat_flags(disk_super, features); 1781 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1724 } 1782 btrfs_set_super_incompat_flags(disk_super, features);
1725 1783
1726 features = btrfs_super_compat_ro_flags(disk_super) & 1784 features = btrfs_super_compat_ro_flags(disk_super) &
1727 ~BTRFS_FEATURE_COMPAT_RO_SUPP; 1785 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
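The incompat handling above now unconditionally ORs in MIXED_BACKREF and, when LZO compression is selected, also persists COMPRESS_LZO so that older kernels refuse the mount rather than misparse LZO extents. A hedged sketch of this widening-only flag update (the bit values below are illustrative stand-ins):

    #include <stdint.h>

    #define INCOMPAT_MIXED_BACKREF  (1ULL << 0)     /* stand-in bits */
    #define INCOMPAT_COMPRESS_LZO   (1ULL << 3)

    static uint64_t add_incompat_flags(uint64_t super_flags, int using_lzo)
    {
            /* The mask only ever grows; a kernel that does not
             * recognize a set bit must refuse the mount. */
            super_flags |= INCOMPAT_MIXED_BACKREF;
            if (using_lzo)
                    super_flags |= INCOMPAT_COMPRESS_LZO;
            return super_flags;
    }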
@@ -1730,7 +1788,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1730 "unsupported option features (%Lx).\n", 1788 "unsupported option features (%Lx).\n",
1731 (unsigned long long)features); 1789 (unsigned long long)features);
1732 err = -EINVAL; 1790 err = -EINVAL;
1733 goto fail_iput; 1791 goto fail_alloc;
1734 } 1792 }
1735 1793
1736 btrfs_init_workers(&fs_info->generic_worker, 1794 btrfs_init_workers(&fs_info->generic_worker,
@@ -1775,6 +1833,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1775 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 1833 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1776 fs_info->thread_pool_size, 1834 fs_info->thread_pool_size,
1777 &fs_info->generic_worker); 1835 &fs_info->generic_worker);
1836 btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
1837 1, &fs_info->generic_worker);
1838 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1839 fs_info->thread_pool_size,
1840 &fs_info->generic_worker);
1778 1841
1779 /* 1842 /*
1780 * endios are largely parallel and should have a very 1843 * endios are largely parallel and should have a very
@@ -1795,6 +1858,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1795 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 1858 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1796 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 1859 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1797 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1860 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1861 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1862 btrfs_start_workers(&fs_info->delayed_workers, 1);
1798 1863
1799 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1864 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1800 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1865 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1903,6 +1968,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1903 fs_info->metadata_alloc_profile = (u64)-1; 1968 fs_info->metadata_alloc_profile = (u64)-1;
1904 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1969 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1905 1970
1971 ret = btrfs_init_space_info(fs_info);
1972 if (ret) {
1973 printk(KERN_ERR "Failed to initialize space info: %d\n", ret);
1974 goto fail_block_groups;
1975 }
1976
1906 ret = btrfs_read_block_groups(extent_root); 1977 ret = btrfs_read_block_groups(extent_root);
1907 if (ret) { 1978 if (ret) {
1908 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 1979 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
@@ -1928,7 +1999,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1928 btrfs_set_opt(fs_info->mount_opt, SSD); 1999 btrfs_set_opt(fs_info->mount_opt, SSD);
1929 } 2000 }
1930 2001
1931 if (btrfs_super_log_root(disk_super) != 0) { 2002 /* do not make disk changes in broken FS */
2003 if (btrfs_super_log_root(disk_super) != 0 &&
2004 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
1932 u64 bytenr = btrfs_super_log_root(disk_super); 2005 u64 bytenr = btrfs_super_log_root(disk_super);
1933 2006
1934 if (fs_devices->rw_devices == 0) { 2007 if (fs_devices->rw_devices == 0) {
@@ -1992,8 +2065,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1992 2065
1993 if (!(sb->s_flags & MS_RDONLY)) { 2066 if (!(sb->s_flags & MS_RDONLY)) {
1994 down_read(&fs_info->cleanup_work_sem); 2067 down_read(&fs_info->cleanup_work_sem);
1995 btrfs_orphan_cleanup(fs_info->fs_root); 2068 err = btrfs_orphan_cleanup(fs_info->fs_root);
2069 if (!err)
2070 err = btrfs_orphan_cleanup(fs_info->tree_root);
1996 up_read(&fs_info->cleanup_work_sem); 2071 up_read(&fs_info->cleanup_work_sem);
2072 if (err) {
2073 close_ctree(tree_root);
2074 return ERR_PTR(err);
2075 }
1997 } 2076 }
1998 2077
1999 return tree_root; 2078 return tree_root;
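open_ctree() now reports orphan-cleanup failures by tearing everything down and returning ERR_PTR(err). For reference, the ERR_PTR encoding packs a small negative errno into the pointer value itself; the helpers below are paraphrased from the kernel's include/linux/err.h:

    #define MAX_ERRNO       4095

    static inline void *ERR_PTR(long error)
    {
            return (void *)error;
    }

    static inline long PTR_ERR(const void *ptr)
    {
            return (long)ptr;
    }

    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

Callers therefore check IS_ERR(tree_root) rather than comparing against NULL.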
@@ -2035,7 +2114,11 @@ fail_sb_buffer:
2035 btrfs_stop_workers(&fs_info->endio_meta_workers); 2114 btrfs_stop_workers(&fs_info->endio_meta_workers);
2036 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2115 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2037 btrfs_stop_workers(&fs_info->endio_write_workers); 2116 btrfs_stop_workers(&fs_info->endio_write_workers);
2117 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2038 btrfs_stop_workers(&fs_info->submit_workers); 2118 btrfs_stop_workers(&fs_info->submit_workers);
2119 btrfs_stop_workers(&fs_info->delayed_workers);
2120fail_alloc:
2121 kfree(fs_info->delayed_root);
2039fail_iput: 2122fail_iput:
2040 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2123 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2041 iput(fs_info->btree_inode); 2124 iput(fs_info->btree_inode);
@@ -2063,11 +2146,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2063 if (uptodate) { 2146 if (uptodate) {
2064 set_buffer_uptodate(bh); 2147 set_buffer_uptodate(bh);
2065 } else { 2148 } else {
2066 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 2149 printk_ratelimited(KERN_WARNING "lost page write due to "
2067 printk(KERN_WARNING "lost page write due to "
2068 "I/O error on %s\n", 2150 "I/O error on %s\n",
2069 bdevname(bh->b_bdev, b)); 2151 bdevname(bh->b_bdev, b));
2070 }
2071 /* note, we don't set_buffer_write_io_error because we have 2152 /* note, we don't set_buffer_write_io_error because we have
2072 * our own ways of dealing with the IO errors 2153 * our own ways of dealing with the IO errors
2073 */ 2154 */
@@ -2200,21 +2281,10 @@ static int write_dev_supers(struct btrfs_device *device,
2200 bh->b_end_io = btrfs_end_buffer_write_sync; 2281 bh->b_end_io = btrfs_end_buffer_write_sync;
2201 } 2282 }
2202 2283
2203 if (i == last_barrier && do_barriers && device->barriers) { 2284 if (i == last_barrier && do_barriers)
2204 ret = submit_bh(WRITE_BARRIER, bh); 2285 ret = submit_bh(WRITE_FLUSH_FUA, bh);
2205 if (ret == -EOPNOTSUPP) { 2286 else
2206 printk("btrfs: disabling barriers on dev %s\n",
2207 device->name);
2208 set_buffer_uptodate(bh);
2209 device->barriers = 0;
2210 /* one reference for submit_bh */
2211 get_bh(bh);
2212 lock_buffer(bh);
2213 ret = submit_bh(WRITE_SYNC, bh);
2214 }
2215 } else {
2216 ret = submit_bh(WRITE_SYNC, bh); 2287 ret = submit_bh(WRITE_SYNC, bh);
2217 }
2218 2288
2219 if (ret) 2289 if (ret)
2220 errors++; 2290 errors++;
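The barrier fallback above could be deleted because the block layer now degrades flush/FUA requests itself on devices without a volatile write cache, so -EOPNOTSUPP no longer reaches the filesystem. A self-contained sketch of the simplified submission logic (the flag names and struct here are stand-ins, not the real request flags):

    #include <stdio.h>

    enum req_flags { WRITE_SYNC, WRITE_FLUSH_FUA };   /* stand-ins */

    struct buffer_head { int dummy; };

    static int submit_bh(enum req_flags op, struct buffer_head *bh)
    {
            (void)bh;
            printf("submit: %s\n",
                   op == WRITE_FLUSH_FUA ? "flush+FUA" : "sync");
            return 0;
    }

    /* Only the designated barrier copy gets flush+FUA semantics. */
    static int submit_super(struct buffer_head *bh, int last_barrier,
                            int do_barriers)
    {
            if (last_barrier && do_barriers)
                    return submit_bh(WRITE_FLUSH_FUA, bh);
            return submit_bh(WRITE_SYNC, bh);
    }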
@@ -2242,7 +2312,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2242 2312
2243 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2313 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2244 head = &root->fs_info->fs_devices->devices; 2314 head = &root->fs_info->fs_devices->devices;
2245 list_for_each_entry(dev, head, dev_list) { 2315 list_for_each_entry_rcu(dev, head, dev_list) {
2246 if (!dev->bdev) { 2316 if (!dev->bdev) {
2247 total_errors++; 2317 total_errors++;
2248 continue; 2318 continue;
@@ -2275,7 +2345,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2275 } 2345 }
2276 2346
2277 total_errors = 0; 2347 total_errors = 0;
2278 list_for_each_entry(dev, head, dev_list) { 2348 list_for_each_entry_rcu(dev, head, dev_list) {
2279 if (!dev->bdev) 2349 if (!dev->bdev)
2280 continue; 2350 continue;
2281 if (!dev->in_fs_metadata || !dev->writeable) 2351 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2313,12 +2383,15 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2313 if (btrfs_root_refs(&root->root_item) == 0) 2383 if (btrfs_root_refs(&root->root_item) == 0)
2314 synchronize_srcu(&fs_info->subvol_srcu); 2384 synchronize_srcu(&fs_info->subvol_srcu);
2315 2385
2386 __btrfs_remove_free_space_cache(root->free_ino_pinned);
2387 __btrfs_remove_free_space_cache(root->free_ino_ctl);
2316 free_fs_root(root); 2388 free_fs_root(root);
2317 return 0; 2389 return 0;
2318} 2390}
2319 2391
2320static void free_fs_root(struct btrfs_root *root) 2392static void free_fs_root(struct btrfs_root *root)
2321{ 2393{
2394 iput(root->cache_inode);
2322 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); 2395 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2323 if (root->anon_super.s_dev) { 2396 if (root->anon_super.s_dev) {
2324 down_write(&root->anon_super.s_umount); 2397 down_write(&root->anon_super.s_umount);
@@ -2326,6 +2399,8 @@ static void free_fs_root(struct btrfs_root *root)
2326 } 2399 }
2327 free_extent_buffer(root->node); 2400 free_extent_buffer(root->node);
2328 free_extent_buffer(root->commit_root); 2401 free_extent_buffer(root->commit_root);
2402 kfree(root->free_ino_ctl);
2403 kfree(root->free_ino_pinned);
2329 kfree(root->name); 2404 kfree(root->name);
2330 kfree(root); 2405 kfree(root);
2331} 2406}
@@ -2378,8 +2453,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2378 2453
2379 root_objectid = gang[ret - 1]->root_key.objectid + 1; 2454 root_objectid = gang[ret - 1]->root_key.objectid + 1;
2380 for (i = 0; i < ret; i++) { 2455 for (i = 0; i < ret; i++) {
2456 int err;
2457
2381 root_objectid = gang[i]->root_key.objectid; 2458 root_objectid = gang[i]->root_key.objectid;
2382 btrfs_orphan_cleanup(gang[i]); 2459 err = btrfs_orphan_cleanup(gang[i]);
2460 if (err)
2461 return err;
2383 } 2462 }
2384 root_objectid++; 2463 root_objectid++;
2385 } 2464 }
@@ -2400,11 +2479,15 @@ int btrfs_commit_super(struct btrfs_root *root)
2400 down_write(&root->fs_info->cleanup_work_sem); 2479 down_write(&root->fs_info->cleanup_work_sem);
2401 up_write(&root->fs_info->cleanup_work_sem); 2480 up_write(&root->fs_info->cleanup_work_sem);
2402 2481
2403 trans = btrfs_join_transaction(root, 1); 2482 trans = btrfs_join_transaction(root);
2483 if (IS_ERR(trans))
2484 return PTR_ERR(trans);
2404 ret = btrfs_commit_transaction(trans, root); 2485 ret = btrfs_commit_transaction(trans, root);
2405 BUG_ON(ret); 2486 BUG_ON(ret);
2406 /* run commit again to drop the original snapshot */ 2487 /* run commit again to drop the original snapshot */
2407 trans = btrfs_join_transaction(root, 1); 2488 trans = btrfs_join_transaction(root);
2489 if (IS_ERR(trans))
2490 return PTR_ERR(trans);
2408 btrfs_commit_transaction(trans, root); 2491 btrfs_commit_transaction(trans, root);
2409 ret = btrfs_write_and_wait_transaction(NULL, root); 2492 ret = btrfs_write_and_wait_transaction(NULL, root);
2410 BUG_ON(ret); 2493 BUG_ON(ret);
@@ -2421,8 +2504,38 @@ int close_ctree(struct btrfs_root *root)
2421 fs_info->closing = 1; 2504 fs_info->closing = 1;
2422 smp_mb(); 2505 smp_mb();
2423 2506
2507 btrfs_scrub_cancel(root);
2508
2509 /* wait for any defraggers to finish */
2510 wait_event(fs_info->transaction_wait,
2511 (atomic_read(&fs_info->defrag_running) == 0));
2512
2513 /* clear out the rbtree of defraggable inodes */
2514 btrfs_run_defrag_inodes(root->fs_info);
2515
2516 btrfs_put_block_group_cache(fs_info);
2517
2518 /*
2519 * Two situations can arise when a broken btrfs flips read-only:
2520 *
2521 * 1. btrfs flipped read-only somewhere else before reaching
2522 * btrfs_commit_super: sb->s_flags has the MS_RDONLY flag set,
2523 * so btrfs skips writing the sb directly in order to keep the
2524 * ERROR state on disk.
2525 *
2526 * 2. btrfs flipped read-only inside btrfs_commit_super itself:
2527 * btrfs cannot write the sb via btrfs_commit_super, and since
2528 * fs_state has the BTRFS_SUPER_FLAG_ERROR flag set, btrfs
2529 * cleans up all FS resources first and writes the sb afterwards.
2530 */
2424 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2531 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2425 ret = btrfs_commit_super(root); 2532 ret = btrfs_commit_super(root);
2533 if (ret)
2534 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2535 }
2536
2537 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
2538 ret = btrfs_error_commit_super(root);
2426 if (ret) 2539 if (ret)
2427 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2540 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2428 } 2541 }
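Note that the two branches are not exclusive: a filesystem that went read-only with the error flag set skips the normal commit but still gets btrfs_error_commit_super(), so the superblock is written only after cleanup. A tiny sketch of the decision (MS_RDONLY uses its real value; the error bit is a stand-in for BTRFS_SUPER_FLAG_ERROR):

    #include <stdio.h>

    #define MS_RDONLY               0x1
    #define SUPER_FLAG_ERROR        0x2     /* stand-in bit */

    static void close_sketch(unsigned int sb_flags, unsigned int fs_state)
    {
            if (!(sb_flags & MS_RDONLY))
                    printf("btrfs_commit_super: normal final commit\n");
            if (fs_state & SUPER_FLAG_ERROR)
                    printf("btrfs_error_commit_super: cleanup, then write sb\n");
    }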
@@ -2458,6 +2571,7 @@ int close_ctree(struct btrfs_root *root)
2458 del_fs_roots(fs_info); 2571 del_fs_roots(fs_info);
2459 2572
2460 iput(fs_info->btree_inode); 2573 iput(fs_info->btree_inode);
2574 kfree(fs_info->delayed_root);
2461 2575
2462 btrfs_stop_workers(&fs_info->generic_worker); 2576 btrfs_stop_workers(&fs_info->generic_worker);
2463 btrfs_stop_workers(&fs_info->fixup_workers); 2577 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2467,7 +2581,9 @@ int close_ctree(struct btrfs_root *root)
2467 btrfs_stop_workers(&fs_info->endio_meta_workers); 2581 btrfs_stop_workers(&fs_info->endio_meta_workers);
2468 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2582 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2469 btrfs_stop_workers(&fs_info->endio_write_workers); 2583 btrfs_stop_workers(&fs_info->endio_write_workers);
2584 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2470 btrfs_stop_workers(&fs_info->submit_workers); 2585 btrfs_stop_workers(&fs_info->submit_workers);
2586 btrfs_stop_workers(&fs_info->delayed_workers);
2471 2587
2472 btrfs_close_devices(fs_info->fs_devices); 2588 btrfs_close_devices(fs_info->fs_devices);
2473 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2589 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2480,6 +2596,8 @@ int close_ctree(struct btrfs_root *root)
2480 kfree(fs_info->chunk_root); 2596 kfree(fs_info->chunk_root);
2481 kfree(fs_info->dev_root); 2597 kfree(fs_info->dev_root);
2482 kfree(fs_info->csum_root); 2598 kfree(fs_info->csum_root);
2599 kfree(fs_info);
2600
2483 return 0; 2601 return 0;
2484} 2602}
2485 2603
@@ -2542,6 +2660,29 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2542 if (current->flags & PF_MEMALLOC) 2660 if (current->flags & PF_MEMALLOC)
2543 return; 2661 return;
2544 2662
2663 btrfs_balance_delayed_items(root);
2664
2665 num_dirty = root->fs_info->dirty_metadata_bytes;
2666
2667 if (num_dirty > thresh) {
2668 balance_dirty_pages_ratelimited_nr(
2669 root->fs_info->btree_inode->i_mapping, 1);
2670 }
2671 return;
2672}
2673
2674void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2675{
2676 /*
2677 * It looks as though older kernels can get into trouble with
2678 * this code; they end up stuck in balance_dirty_pages forever
2679 */
2680 u64 num_dirty;
2681 unsigned long thresh = 32 * 1024 * 1024;
2682
2683 if (current->flags & PF_MEMALLOC)
2684 return;
2685
2545 num_dirty = root->fs_info->dirty_metadata_bytes; 2686 num_dirty = root->fs_info->dirty_metadata_bytes;
2546 2687
2547 if (num_dirty > thresh) { 2688 if (num_dirty > thresh) {
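Both entry points share the same 32MB dirty-metadata threshold; the __ variant merely skips btrfs_balance_delayed_items() for the older-kernel case described in its comment. A compact sketch of the shared check (the state variable and callbacks are stand-ins):

    #include <stdint.h>

    static uint64_t dirty_metadata_bytes;   /* stand-in for fs_info state */

    static void balance_if_needed(void)
    {
            const uint64_t thresh = 32ULL * 1024 * 1024;    /* 32MB */

            if (dirty_metadata_bytes > thresh)
                    ;       /* nudge balance_dirty_pages here */
    }

    static void btree_balance_dirty(void)           /* normal path */
    {
            /* balance delayed metadata items first, then... */
            balance_if_needed();
    }

    static void btree_balance_dirty_safe(void)      /* __ variant */
    {
            balance_if_needed();
    }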
@@ -2574,7 +2715,7 @@ int btree_lock_page_hook(struct page *page)
2574 goto out; 2715 goto out;
2575 2716
2576 len = page->private >> 2; 2717 len = page->private >> 2;
2577 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS); 2718 eb = find_extent_buffer(io_tree, bytenr, len);
2578 if (!eb) 2719 if (!eb)
2579 goto out; 2720 goto out;
2580 2721
@@ -2597,6 +2738,355 @@ out:
2597 return 0; 2738 return 0;
2598} 2739}
2599 2740
2741static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
2742 int read_only)
2743{
2744 if (read_only)
2745 return;
2746
2747 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2748 printk(KERN_WARNING "warning: mounting fs with errors, "
2749 "running btrfsck is recommended\n");
2750}
2751
2752int btrfs_error_commit_super(struct btrfs_root *root)
2753{
2754 int ret;
2755
2756 mutex_lock(&root->fs_info->cleaner_mutex);
2757 btrfs_run_delayed_iputs(root);
2758 mutex_unlock(&root->fs_info->cleaner_mutex);
2759
2760 down_write(&root->fs_info->cleanup_work_sem);
2761 up_write(&root->fs_info->cleanup_work_sem);
2762
2763 /* cleanup FS via transaction */
2764 btrfs_cleanup_transaction(root);
2765
2766 ret = write_ctree_super(NULL, root, 0);
2767
2768 return ret;
2769}
2770
2771static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
2772{
2773 struct btrfs_inode *btrfs_inode;
2774 struct list_head splice;
2775
2776 INIT_LIST_HEAD(&splice);
2777
2778 mutex_lock(&root->fs_info->ordered_operations_mutex);
2779 spin_lock(&root->fs_info->ordered_extent_lock);
2780
2781 list_splice_init(&root->fs_info->ordered_operations, &splice);
2782 while (!list_empty(&splice)) {
2783 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2784 ordered_operations);
2785
2786 list_del_init(&btrfs_inode->ordered_operations);
2787
2788 btrfs_invalidate_inodes(btrfs_inode->root);
2789 }
2790
2791 spin_unlock(&root->fs_info->ordered_extent_lock);
2792 mutex_unlock(&root->fs_info->ordered_operations_mutex);
2793
2794 return 0;
2795}
2796
2797static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
2798{
2799 struct list_head splice;
2800 struct btrfs_ordered_extent *ordered;
2801 struct inode *inode;
2802
2803 INIT_LIST_HEAD(&splice);
2804
2805 spin_lock(&root->fs_info->ordered_extent_lock);
2806
2807 list_splice_init(&root->fs_info->ordered_extents, &splice);
2808 while (!list_empty(&splice)) {
2809 ordered = list_entry(splice.next, struct btrfs_ordered_extent,
2810 root_extent_list);
2811
2812 list_del_init(&ordered->root_extent_list);
2813 atomic_inc(&ordered->refs);
2814
2815 /* the inode may be getting freed (in sys_unlink path). */
2816 inode = igrab(ordered->inode);
2817
2818 spin_unlock(&root->fs_info->ordered_extent_lock);
2819 if (inode)
2820 iput(inode);
2821
2822 atomic_set(&ordered->refs, 1);
2823 btrfs_put_ordered_extent(ordered);
2824
2825 spin_lock(&root->fs_info->ordered_extent_lock);
2826 }
2827
2828 spin_unlock(&root->fs_info->ordered_extent_lock);
2829
2830 return 0;
2831}
2832
2833static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
2834 struct btrfs_root *root)
2835{
2836 struct rb_node *node;
2837 struct btrfs_delayed_ref_root *delayed_refs;
2838 struct btrfs_delayed_ref_node *ref;
2839 int ret = 0;
2840
2841 delayed_refs = &trans->delayed_refs;
2842
2843 spin_lock(&delayed_refs->lock);
2844 if (delayed_refs->num_entries == 0) {
2845 spin_unlock(&delayed_refs->lock);
2846 printk(KERN_INFO "delayed_refs has NO entry\n");
2847 return ret;
2848 }
2849
2850 node = rb_first(&delayed_refs->root);
2851 while (node) {
2852 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2853 node = rb_next(node);
2854
2855 ref->in_tree = 0;
2856 rb_erase(&ref->rb_node, &delayed_refs->root);
2857 delayed_refs->num_entries--;
2858
2859 atomic_set(&ref->refs, 1);
2860 if (btrfs_delayed_ref_is_head(ref)) {
2861 struct btrfs_delayed_ref_head *head;
2862
2863 head = btrfs_delayed_node_to_head(ref);
2864 mutex_lock(&head->mutex);
2865 kfree(head->extent_op);
2866 delayed_refs->num_heads--;
2867 if (list_empty(&head->cluster))
2868 delayed_refs->num_heads_ready--;
2869 list_del_init(&head->cluster);
2870 mutex_unlock(&head->mutex);
2871 }
2872
2873 spin_unlock(&delayed_refs->lock);
2874 btrfs_put_delayed_ref(ref);
2875
2876 cond_resched();
2877 spin_lock(&delayed_refs->lock);
2878 }
2879
2880 spin_unlock(&delayed_refs->lock);
2881
2882 return ret;
2883}
2884
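btrfs_destroy_delayed_refs() above drains the ref tree with a classic pattern: detach one node under the spinlock, drop the lock for the potentially slow put, reschedule, and retake the lock. A userspace analogue with a pthread mutex standing in for the spinlock:

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static void drain(int *entries)
    {
            pthread_mutex_lock(&lock);
            while (*entries > 0) {
                    (*entries)--;           /* detach one node, locked */
                    pthread_mutex_unlock(&lock);
                    /* put/free the detached node here; may block */
                    pthread_mutex_lock(&lock);      /* after cond_resched() */
            }
            pthread_mutex_unlock(&lock);
    }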
2885static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
2886{
2887 struct btrfs_pending_snapshot *snapshot;
2888 struct list_head splice;
2889
2890 INIT_LIST_HEAD(&splice);
2891
2892 list_splice_init(&t->pending_snapshots, &splice);
2893
2894 while (!list_empty(&splice)) {
2895 snapshot = list_entry(splice.next,
2896 struct btrfs_pending_snapshot,
2897 list);
2898
2899 list_del_init(&snapshot->list);
2900
2901 kfree(snapshot);
2902 }
2903
2904 return 0;
2905}
2906
2907static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
2908{
2909 struct btrfs_inode *btrfs_inode;
2910 struct list_head splice;
2911
2912 INIT_LIST_HEAD(&splice);
2913
2914 spin_lock(&root->fs_info->delalloc_lock);
2915 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
2916
2917 while (!list_empty(&splice)) {
2918 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2919 delalloc_inodes);
2920
2921 list_del_init(&btrfs_inode->delalloc_inodes);
2922
2923 btrfs_invalidate_inodes(btrfs_inode->root);
2924 }
2925
2926 spin_unlock(&root->fs_info->delalloc_lock);
2927
2928 return 0;
2929}
2930
2931static int btrfs_destroy_marked_extents(struct btrfs_root *root,
2932 struct extent_io_tree *dirty_pages,
2933 int mark)
2934{
2935 int ret;
2936 struct page *page;
2937 struct inode *btree_inode = root->fs_info->btree_inode;
2938 struct extent_buffer *eb;
2939 u64 start = 0;
2940 u64 end;
2941 u64 offset;
2942 unsigned long index;
2943
2944 while (1) {
2945 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
2946 mark);
2947 if (ret)
2948 break;
2949
2950 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
2951 while (start <= end) {
2952 index = start >> PAGE_CACHE_SHIFT;
2953 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
2954 page = find_get_page(btree_inode->i_mapping, index);
2955 if (!page)
2956 continue;
2957 offset = page_offset(page);
2958
2959 spin_lock(&dirty_pages->buffer_lock);
2960 eb = radix_tree_lookup(
2961 &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
2962 offset >> PAGE_CACHE_SHIFT);
2963 spin_unlock(&dirty_pages->buffer_lock);
2964 if (eb) {
2965 ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
2966 &eb->bflags);
2967 atomic_set(&eb->refs, 1);
2968 }
2969 if (PageWriteback(page))
2970 end_page_writeback(page);
2971
2972 lock_page(page);
2973 if (PageDirty(page)) {
2974 clear_page_dirty_for_io(page);
2975 spin_lock_irq(&page->mapping->tree_lock);
2976 radix_tree_tag_clear(&page->mapping->page_tree,
2977 page_index(page),
2978 PAGECACHE_TAG_DIRTY);
2979 spin_unlock_irq(&page->mapping->tree_lock);
2980 }
2981
2982 page->mapping->a_ops->invalidatepage(page, 0);
2983 unlock_page(page);
2984 }
2985 }
2986
2987 return ret;
2988}
2989
2990static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
2991 struct extent_io_tree *pinned_extents)
2992{
2993 struct extent_io_tree *unpin;
2994 u64 start;
2995 u64 end;
2996 int ret;
2997
2998 unpin = pinned_extents;
2999 while (1) {
3000 ret = find_first_extent_bit(unpin, 0, &start, &end,
3001 EXTENT_DIRTY);
3002 if (ret)
3003 break;
3004
3005 /* opt_discard */
3006 if (btrfs_test_opt(root, DISCARD))
3007 ret = btrfs_error_discard_extent(root, start,
3008 end + 1 - start,
3009 NULL);
3010
3011 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3012 btrfs_error_unpin_extent_range(root, start, end);
3013 cond_resched();
3014 }
3015
3016 return 0;
3017}
3018
3019static int btrfs_cleanup_transaction(struct btrfs_root *root)
3020{
3021 struct btrfs_transaction *t;
3022 LIST_HEAD(list);
3023
3024 WARN_ON(1);
3025
3026 mutex_lock(&root->fs_info->transaction_kthread_mutex);
3027
3028 spin_lock(&root->fs_info->trans_lock);
3029 list_splice_init(&root->fs_info->trans_list, &list);
3030 root->fs_info->trans_no_join = 1;
3031 spin_unlock(&root->fs_info->trans_lock);
3032
3033 while (!list_empty(&list)) {
3034 t = list_entry(list.next, struct btrfs_transaction, list);
3035 if (!t)
3036 break;
3037
3038 btrfs_destroy_ordered_operations(root);
3039
3040 btrfs_destroy_ordered_extents(root);
3041
3042 btrfs_destroy_delayed_refs(t, root);
3043
3044 btrfs_block_rsv_release(root,
3045 &root->fs_info->trans_block_rsv,
3046 t->dirty_pages.dirty_bytes);
3047
3048 /* FIXME: cleanup wait for commit */
3049 t->in_commit = 1;
3050 t->blocked = 1;
3051 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3052 wake_up(&root->fs_info->transaction_blocked_wait);
3053
3054 t->blocked = 0;
3055 if (waitqueue_active(&root->fs_info->transaction_wait))
3056 wake_up(&root->fs_info->transaction_wait);
3057
3058 t->commit_done = 1;
3059 if (waitqueue_active(&t->commit_wait))
3060 wake_up(&t->commit_wait);
3061
3062 btrfs_destroy_pending_snapshots(t);
3063
3064 btrfs_destroy_delalloc_inodes(root);
3065
3066 spin_lock(&root->fs_info->trans_lock);
3067 root->fs_info->running_transaction = NULL;
3068 spin_unlock(&root->fs_info->trans_lock);
3069
3070 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3071 EXTENT_DIRTY);
3072
3073 btrfs_destroy_pinned_extent(root,
3074 root->fs_info->pinned_extents);
3075
3076 atomic_set(&t->use_count, 0);
3077 list_del_init(&t->list);
3078 memset(t, 0, sizeof(*t));
3079 kmem_cache_free(btrfs_transaction_cachep, t);
3080 }
3081
3082 spin_lock(&root->fs_info->trans_lock);
3083 root->fs_info->trans_no_join = 0;
3084 spin_unlock(&root->fs_info->trans_lock);
3085 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3086
3087 return 0;
3088}
3089
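The wakeup choreography in btrfs_cleanup_transaction() forces a dying transaction through its normal state machine (blocked, unblocked, committed) so any sleepers observe progress and exit. A userspace analogue with condition variables standing in for the wait queues:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t blocked_wait = PTHREAD_COND_INITIALIZER;
    static pthread_cond_t commit_wait = PTHREAD_COND_INITIALIZER;
    static bool blocked, commit_done;

    static void force_through(void)
    {
            pthread_mutex_lock(&m);
            blocked = true;
            pthread_cond_broadcast(&blocked_wait);  /* wake blocked waiters */
            blocked = false;
            commit_done = true;
            pthread_cond_broadcast(&commit_wait);   /* wake commit waiters */
            pthread_mutex_unlock(&m);
    }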
2600static struct extent_io_ops btree_extent_io_ops = { 3090static struct extent_io_ops btree_extent_io_ops = {
2601 .write_cache_pages_lock_hook = btree_lock_page_hook, 3091 .write_cache_pages_lock_hook = btree_lock_page_hook,
2602 .readpage_end_io_hook = btree_readpage_end_io_hook, 3092 .readpage_end_io_hook = btree_readpage_end_io_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 88e825a0bf21..a0b610a67aae 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,37 +52,23 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
52 struct btrfs_root *root, int max_mirrors); 52 struct btrfs_root *root, int max_mirrors);
53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); 53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54int btrfs_commit_super(struct btrfs_root *root); 54int btrfs_commit_super(struct btrfs_root *root);
55int btrfs_error_commit_super(struct btrfs_root *root);
55struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 56struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 u64 bytenr, u32 blocksize); 57 u64 bytenr, u32 blocksize);
57struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
58 u64 root_objectid);
59struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
60 struct btrfs_key *location,
61 const char *name, int namelen);
62struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 58struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
63 struct btrfs_key *location); 59 struct btrfs_key *location);
64struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 60struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
65 struct btrfs_key *location); 61 struct btrfs_key *location);
66int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 62int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
67int btrfs_insert_dev_radix(struct btrfs_root *root,
68 struct block_device *bdev,
69 u64 device_id,
70 u64 block_start,
71 u64 num_blocks);
72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 63void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
64void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 65int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 66void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
76int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); 67int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
77int btrfs_set_buffer_uptodate(struct extent_buffer *buf); 68int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
78int wait_on_tree_block_writeback(struct btrfs_root *root,
79 struct extent_buffer *buf);
80int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); 69int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
81u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); 70u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
82void btrfs_csum_final(u32 crc, char *result); 71void btrfs_csum_final(u32 crc, char *result);
83int btrfs_open_device(struct btrfs_device *dev);
84int btrfs_verify_block_csum(struct btrfs_root *root,
85 struct extent_buffer *buf);
86int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 72int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
87 int metadata); 73 int metadata);
88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 74int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
@@ -90,8 +76,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
90 unsigned long bio_flags, u64 bio_offset, 76 unsigned long bio_flags, u64 bio_offset,
91 extent_submit_bio_hook_t *submit_bio_start, 77 extent_submit_bio_hook_t *submit_bio_start,
92 extent_submit_bio_hook_t *submit_bio_done); 78 extent_submit_bio_hook_t *submit_bio_done);
93
94int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); 79unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
96int btrfs_write_tree_block(struct extent_buffer *buf); 80int btrfs_write_tree_block(struct extent_buffer *buf);
97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); 81int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 951ef09b82f4..1b8dc33778f9 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,14 +21,18 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
21 int len = *max_len; 21 int len = *max_len;
22 int type; 22 int type;
23 23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || 24 if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) 25 *max_len = BTRFS_FID_SIZE_CONNECTABLE;
26 return 255; 26 return 255;
27 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
28 *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 return 255;
30 }
27 31
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE; 32 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT; 33 type = FILEID_BTRFS_WITHOUT_PARENT;
30 34
31 fid->objectid = inode->i_ino; 35 fid->objectid = btrfs_ino(inode);
32 fid->root_objectid = BTRFS_I(inode)->root->objectid; 36 fid->root_objectid = BTRFS_I(inode)->root->objectid;
33 fid->gen = inode->i_generation; 37 fid->gen = inode->i_generation;
34 38
@@ -65,7 +69,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
65{ 69{
66 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; 70 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
67 struct btrfs_root *root; 71 struct btrfs_root *root;
68 struct dentry *dentry;
69 struct inode *inode; 72 struct inode *inode;
70 struct btrfs_key key; 73 struct btrfs_key key;
71 int index; 74 int index;
@@ -108,10 +111,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
108 return ERR_PTR(-ESTALE); 111 return ERR_PTR(-ESTALE);
109 } 112 }
110 113
111 dentry = d_obtain_alias(inode); 114 return d_obtain_alias(inode);
112 if (!IS_ERR(dentry))
113 dentry->d_op = &btrfs_dentry_operations;
114 return dentry;
115fail: 115fail:
116 srcu_read_unlock(&fs_info->subvol_srcu, index); 116 srcu_read_unlock(&fs_info->subvol_srcu, index);
117 return ERR_PTR(err); 117 return ERR_PTR(err);
@@ -166,7 +166,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
166static struct dentry *btrfs_get_parent(struct dentry *child) 166static struct dentry *btrfs_get_parent(struct dentry *child)
167{ 167{
168 struct inode *dir = child->d_inode; 168 struct inode *dir = child->d_inode;
169 static struct dentry *dentry;
170 struct btrfs_root *root = BTRFS_I(dir)->root; 169 struct btrfs_root *root = BTRFS_I(dir)->root;
171 struct btrfs_path *path; 170 struct btrfs_path *path;
172 struct extent_buffer *leaf; 171 struct extent_buffer *leaf;
@@ -176,14 +175,16 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
176 int ret; 175 int ret;
177 176
178 path = btrfs_alloc_path(); 177 path = btrfs_alloc_path();
178 if (!path)
179 return ERR_PTR(-ENOMEM);
179 180
180 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { 181 if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) {
181 key.objectid = root->root_key.objectid; 182 key.objectid = root->root_key.objectid;
182 key.type = BTRFS_ROOT_BACKREF_KEY; 183 key.type = BTRFS_ROOT_BACKREF_KEY;
183 key.offset = (u64)-1; 184 key.offset = (u64)-1;
184 root = root->fs_info->tree_root; 185 root = root->fs_info->tree_root;
185 } else { 186 } else {
186 key.objectid = dir->i_ino; 187 key.objectid = btrfs_ino(dir);
187 key.type = BTRFS_INODE_REF_KEY; 188 key.type = BTRFS_INODE_REF_KEY;
188 key.offset = (u64)-1; 189 key.offset = (u64)-1;
189 } 190 }
@@ -223,18 +224,94 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
223 224
224 key.type = BTRFS_INODE_ITEM_KEY; 225 key.type = BTRFS_INODE_ITEM_KEY;
225 key.offset = 0; 226 key.offset = 0;
226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); 227 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
227 if (!IS_ERR(dentry))
228 dentry->d_op = &btrfs_dentry_operations;
229 return dentry;
230fail: 228fail:
231 btrfs_free_path(path); 229 btrfs_free_path(path);
232 return ERR_PTR(ret); 230 return ERR_PTR(ret);
233} 231}
234 232
233static int btrfs_get_name(struct dentry *parent, char *name,
234 struct dentry *child)
235{
236 struct inode *inode = child->d_inode;
237 struct inode *dir = parent->d_inode;
238 struct btrfs_path *path;
239 struct btrfs_root *root = BTRFS_I(dir)->root;
240 struct btrfs_inode_ref *iref;
241 struct btrfs_root_ref *rref;
242 struct extent_buffer *leaf;
243 unsigned long name_ptr;
244 struct btrfs_key key;
245 int name_len;
246 int ret;
247 u64 ino;
248
249 if (!dir || !inode)
250 return -EINVAL;
251
252 if (!S_ISDIR(dir->i_mode))
253 return -EINVAL;
254
255 ino = btrfs_ino(inode);
256
257 path = btrfs_alloc_path();
258 if (!path)
259 return -ENOMEM;
260 path->leave_spinning = 1;
261
262 if (ino == BTRFS_FIRST_FREE_OBJECTID) {
263 key.objectid = BTRFS_I(inode)->root->root_key.objectid;
264 key.type = BTRFS_ROOT_BACKREF_KEY;
265 key.offset = (u64)-1;
266 root = root->fs_info->tree_root;
267 } else {
268 key.objectid = ino;
269 key.offset = btrfs_ino(dir);
270 key.type = BTRFS_INODE_REF_KEY;
271 }
272
273 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
274 if (ret < 0) {
275 btrfs_free_path(path);
276 return ret;
277 } else if (ret > 0) {
278 if (ino == BTRFS_FIRST_FREE_OBJECTID) {
279 path->slots[0]--;
280 } else {
281 btrfs_free_path(path);
282 return -ENOENT;
283 }
284 }
285 leaf = path->nodes[0];
286
287 if (ino == BTRFS_FIRST_FREE_OBJECTID) {
288 rref = btrfs_item_ptr(leaf, path->slots[0],
289 struct btrfs_root_ref);
290 name_ptr = (unsigned long)(rref + 1);
291 name_len = btrfs_root_ref_name_len(leaf, rref);
292 } else {
293 iref = btrfs_item_ptr(leaf, path->slots[0],
294 struct btrfs_inode_ref);
295 name_ptr = (unsigned long)(iref + 1);
296 name_len = btrfs_inode_ref_name_len(leaf, iref);
297 }
298
299 read_extent_buffer(leaf, name, name_ptr, name_len);
300 btrfs_free_path(path);
301
302 /*
303 * we have to add the null termination to make sure that
304 * reconnect_path gets the right length from strlen
305 */
306 name[name_len] = '\0';
307
308 return 0;
309}
310
235const struct export_operations btrfs_export_ops = { 311const struct export_operations btrfs_export_ops = {
236 .encode_fh = btrfs_encode_fh, 312 .encode_fh = btrfs_encode_fh,
237 .fh_to_dentry = btrfs_fh_to_dentry, 313 .fh_to_dentry = btrfs_fh_to_dentry,
238 .fh_to_parent = btrfs_fh_to_parent, 314 .fh_to_parent = btrfs_fh_to_parent,
239 .get_parent = btrfs_get_parent, 315 .get_parent = btrfs_get_parent,
316 .get_name = btrfs_get_name,
240}; 317};
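A note on the NUL termination in btrfs_get_name(): names stored in btree items carry an explicit length and no terminator, while the export layer's reconnect_path() runs strlen() on the result buffer. The essential move, in isolation:

    #include <string.h>

    /* Copy a length-delimited name into a caller buffer that will later
     * be consumed by strlen-based code. */
    static void copy_name(char *dst, const char *item_data, int name_len)
    {
            memcpy(dst, item_data, name_len);
            dst[name_len] = '\0';
    }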
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32d094002a57..71cd456fdb60 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,11 +33,28 @@
33#include "locking.h" 33#include "locking.h"
34#include "free-space-cache.h" 34#include "free-space-cache.h"
35 35
36/* Control flags for do_chunk_alloc's force field.
37 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
38 * if we really need one.
39 *
40 * CHUNK_ALLOC_FORCE means it must try to allocate one.
41 *
42 * CHUNK_ALLOC_LIMITED means to only try to allocate one
43 * if we have very few chunks already allocated. This is
44 * used as part of the clustering code to help make sure
45 * we have a good pool of storage to cluster in, without
46 * filling the FS with empty chunks.
47 *
48 */
49enum {
50 CHUNK_ALLOC_NO_FORCE = 0,
51 CHUNK_ALLOC_FORCE = 1,
52 CHUNK_ALLOC_LIMITED = 2,
53};
54
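How a caller might act on the three modes, as a hypothetical sketch: the real policy lives in do_chunk_alloc and weighs free space and allocation profile; the small bound below is purely illustrative.

    enum {
            CHUNK_ALLOC_NO_FORCE = 0,
            CHUNK_ALLOC_FORCE = 1,
            CHUNK_ALLOC_LIMITED = 2,
    };

    static int should_alloc_chunk_sketch(int force, unsigned long num_chunks,
                                         int space_low)
    {
            if (force == CHUNK_ALLOC_FORCE)
                    return 1;
            if (force == CHUNK_ALLOC_LIMITED)
                    return num_chunks < 3;  /* illustrative small bound */
            return space_low;               /* CHUNK_ALLOC_NO_FORCE */
    }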
36static int update_block_group(struct btrfs_trans_handle *trans, 55static int update_block_group(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 56 struct btrfs_root *root,
38 u64 bytenr, u64 num_bytes, int alloc); 57 u64 bytenr, u64 num_bytes, int alloc);
39static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
40 u64 num_bytes, int reserve, int sinfo);
41static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 58static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 59 struct btrfs_root *root,
43 u64 bytenr, u64 num_bytes, u64 parent, 60 u64 bytenr, u64 num_bytes, u64 parent,
@@ -77,7 +94,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
77 return (cache->flags & bits) == bits; 94 return (cache->flags & bits) == bits;
78} 95}
79 96
80void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 97static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
81{ 98{
82 atomic_inc(&cache->count); 99 atomic_inc(&cache->count);
83} 100}
@@ -88,6 +105,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
88 WARN_ON(cache->pinned > 0); 105 WARN_ON(cache->pinned > 0);
89 WARN_ON(cache->reserved > 0); 106 WARN_ON(cache->reserved > 0);
90 WARN_ON(cache->reserved_pinned > 0); 107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl);
91 kfree(cache); 109 kfree(cache);
92 } 110 }
93} 111}
@@ -242,6 +260,12 @@ get_caching_control(struct btrfs_block_group_cache *cache)
242 return NULL; 260 return NULL;
243 } 261 }
244 262
263 /* We're loading it the fast way, so we don't have a caching_ctl. */
264 if (!cache->caching_ctl) {
265 spin_unlock(&cache->lock);
266 return NULL;
267 }
268
245 ctl = cache->caching_ctl; 269 ctl = cache->caching_ctl;
246 atomic_inc(&ctl->count); 270 atomic_inc(&ctl->count);
247 spin_unlock(&cache->lock); 271 spin_unlock(&cache->lock);
@@ -314,11 +338,6 @@ static int caching_kthread(void *data)
314 if (!path) 338 if (!path)
315 return -ENOMEM; 339 return -ENOMEM;
316 340
317 exclude_super_stripes(extent_root, block_group);
318 spin_lock(&block_group->space_info->lock);
319 block_group->space_info->bytes_readonly += block_group->bytes_super;
320 spin_unlock(&block_group->space_info->lock);
321
322 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 341 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
323 342
324 /* 343 /*
@@ -329,7 +348,7 @@ static int caching_kthread(void *data)
329 */ 348 */
330 path->skip_locking = 1; 349 path->skip_locking = 1;
331 path->search_commit_root = 1; 350 path->search_commit_root = 1;
332 path->reada = 2; 351 path->reada = 1;
333 352
334 key.objectid = last; 353 key.objectid = last;
335 key.offset = 0; 354 key.offset = 0;
@@ -347,8 +366,7 @@ again:
347 nritems = btrfs_header_nritems(leaf); 366 nritems = btrfs_header_nritems(leaf);
348 367
349 while (1) { 368 while (1) {
350 smp_mb(); 369 if (btrfs_fs_closing(fs_info) > 1) {
351 if (fs_info->closing > 1) {
352 last = (u64)-1; 370 last = (u64)-1;
353 break; 371 break;
354 } 372 }
@@ -360,15 +378,18 @@ again:
360 if (ret) 378 if (ret)
361 break; 379 break;
362 380
363 caching_ctl->progress = last; 381 if (need_resched() ||
364 btrfs_release_path(extent_root, path); 382 btrfs_next_leaf(extent_root, path)) {
365 up_read(&fs_info->extent_commit_sem); 383 caching_ctl->progress = last;
366 mutex_unlock(&caching_ctl->mutex); 384 btrfs_release_path(path);
367 if (btrfs_transaction_in_commit(fs_info)) 385 up_read(&fs_info->extent_commit_sem);
368 schedule_timeout(1); 386 mutex_unlock(&caching_ctl->mutex);
369 else
370 cond_resched(); 387 cond_resched();
371 goto again; 388 goto again;
389 }
390 leaf = path->nodes[0];
391 nritems = btrfs_header_nritems(leaf);
392 continue;
372 } 393 }
373 394
374 if (key.objectid < block_group->key.objectid) { 395 if (key.objectid < block_group->key.objectid) {
@@ -421,7 +442,10 @@ err:
421 return 0; 442 return 0;
422} 443}
423 444
424static int cache_block_group(struct btrfs_block_group_cache *cache) 445static int cache_block_group(struct btrfs_block_group_cache *cache,
446 struct btrfs_trans_handle *trans,
447 struct btrfs_root *root,
448 int load_cache_only)
425{ 449{
426 struct btrfs_fs_info *fs_info = cache->fs_info; 450 struct btrfs_fs_info *fs_info = cache->fs_info;
427 struct btrfs_caching_control *caching_ctl; 451 struct btrfs_caching_control *caching_ctl;
@@ -432,7 +456,42 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
432 if (cache->cached != BTRFS_CACHE_NO) 456 if (cache->cached != BTRFS_CACHE_NO)
433 return 0; 457 return 0;
434 458
435 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); 459 /*
460 * We can't do the read from on-disk cache during a commit since we need
461 * to have the normal tree locking. Also if we are currently trying to
462 * allocate blocks for the tree root we can't do the fast caching since
463 * we likely hold important locks.
464 */
465 if (trans && (!trans->transaction->in_commit) &&
466 (root && root != root->fs_info->tree_root)) {
467 spin_lock(&cache->lock);
468 if (cache->cached != BTRFS_CACHE_NO) {
469 spin_unlock(&cache->lock);
470 return 0;
471 }
472 cache->cached = BTRFS_CACHE_STARTED;
473 spin_unlock(&cache->lock);
474
475 ret = load_free_space_cache(fs_info, cache);
476
477 spin_lock(&cache->lock);
478 if (ret == 1) {
479 cache->cached = BTRFS_CACHE_FINISHED;
480 cache->last_byte_to_unpin = (u64)-1;
481 } else {
482 cache->cached = BTRFS_CACHE_NO;
483 }
484 spin_unlock(&cache->lock);
485 if (ret == 1) {
486 free_excluded_extents(fs_info->extent_root, cache);
487 return 0;
488 }
489 }
490
491 if (load_cache_only)
492 return 0;
493
494 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
436 BUG_ON(!caching_ctl); 495 BUG_ON(!caching_ctl);
437 496
438 INIT_LIST_HEAD(&caching_ctl->list); 497 INIT_LIST_HEAD(&caching_ctl->list);
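The fast path above claims the block group under its spinlock, drops the lock for the blocking read of the on-disk free-space cache, then publishes either FINISHED or rolls back to NO so the kthread path can still run. A userspace analogue of that state protocol (a mutex stands in for the spinlock):

    #include <pthread.h>

    enum cache_state { CACHE_NO, CACHE_STARTED, CACHE_FINISHED };

    static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;

    static int fast_cache(enum cache_state *st, int (*load)(void))
    {
            pthread_mutex_lock(&lk);
            if (*st != CACHE_NO) {          /* lost the race: nothing to do */
                    pthread_mutex_unlock(&lk);
                    return 0;
            }
            *st = CACHE_STARTED;
            pthread_mutex_unlock(&lk);

            int ok = load();                /* may block: no lock held */

            pthread_mutex_lock(&lk);
            *st = ok ? CACHE_FINISHED : CACHE_NO;
            pthread_mutex_unlock(&lk);
            return 0;
    }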
@@ -509,7 +568,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
509 568
510 rcu_read_lock(); 569 rcu_read_lock();
511 list_for_each_entry_rcu(found, head, list) { 570 list_for_each_entry_rcu(found, head, list) {
512 if (found->flags == flags) { 571 if (found->flags & flags) {
513 rcu_read_unlock(); 572 rcu_read_unlock();
514 return found; 573 return found;
515 } 574 }
@@ -542,6 +601,15 @@ static u64 div_factor(u64 num, int factor)
542 return num; 601 return num;
543} 602}
544 603
604static u64 div_factor_fine(u64 num, int factor)
605{
606 if (factor == 100)
607 return num;
608 num *= factor;
609 do_div(num, 100);
610 return num;
611}
612
545u64 btrfs_find_block_group(struct btrfs_root *root, 613u64 btrfs_find_block_group(struct btrfs_root *root,
546 u64 search_start, u64 search_hint, int owner) 614 u64 search_start, u64 search_hint, int owner)
547{ 615{
@@ -689,8 +757,12 @@ again:
689 atomic_inc(&head->node.refs); 757 atomic_inc(&head->node.refs);
690 spin_unlock(&delayed_refs->lock); 758 spin_unlock(&delayed_refs->lock);
691 759
692 btrfs_release_path(root->fs_info->extent_root, path); 760 btrfs_release_path(path);
693 761
762 /*
763 * Mutex was contended, block until it's released and try
764 * again
765 */
694 mutex_lock(&head->mutex); 766 mutex_lock(&head->mutex);
695 mutex_unlock(&head->mutex); 767 mutex_unlock(&head->mutex);
696 btrfs_put_delayed_ref(&head->node); 768 btrfs_put_delayed_ref(&head->node);
@@ -869,7 +941,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
869 break; 941 break;
870 } 942 }
871 } 943 }
872 btrfs_release_path(root, path); 944 btrfs_release_path(path);
873 945
874 if (owner < BTRFS_FIRST_FREE_OBJECTID) 946 if (owner < BTRFS_FIRST_FREE_OBJECTID)
875 new_size += sizeof(*bi); 947 new_size += sizeof(*bi);
@@ -882,7 +954,6 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
882 BUG_ON(ret); 954 BUG_ON(ret);
883 955
884 ret = btrfs_extend_item(trans, root, path, new_size); 956 ret = btrfs_extend_item(trans, root, path, new_size);
885 BUG_ON(ret);
886 957
887 leaf = path->nodes[0]; 958 leaf = path->nodes[0];
888 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 959 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -977,7 +1048,7 @@ again:
977 return 0; 1048 return 0;
978#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1049#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
979 key.type = BTRFS_EXTENT_REF_V0_KEY; 1050 key.type = BTRFS_EXTENT_REF_V0_KEY;
980 btrfs_release_path(root, path); 1051 btrfs_release_path(path);
981 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1052 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
982 if (ret < 0) { 1053 if (ret < 0) {
983 err = ret; 1054 err = ret;
@@ -1015,7 +1086,7 @@ again:
1015 if (match_extent_data_ref(leaf, ref, root_objectid, 1086 if (match_extent_data_ref(leaf, ref, root_objectid,
1016 owner, offset)) { 1087 owner, offset)) {
1017 if (recow) { 1088 if (recow) {
1018 btrfs_release_path(root, path); 1089 btrfs_release_path(path);
1019 goto again; 1090 goto again;
1020 } 1091 }
1021 err = 0; 1092 err = 0;
@@ -1076,7 +1147,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1076 if (match_extent_data_ref(leaf, ref, root_objectid, 1147 if (match_extent_data_ref(leaf, ref, root_objectid,
1077 owner, offset)) 1148 owner, offset))
1078 break; 1149 break;
1079 btrfs_release_path(root, path); 1150 btrfs_release_path(path);
1080 key.offset++; 1151 key.offset++;
1081 ret = btrfs_insert_empty_item(trans, root, path, &key, 1152 ret = btrfs_insert_empty_item(trans, root, path, &key,
1082 size); 1153 size);
@@ -1102,7 +1173,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1102 btrfs_mark_buffer_dirty(leaf); 1173 btrfs_mark_buffer_dirty(leaf);
1103 ret = 0; 1174 ret = 0;
1104fail: 1175fail:
1105 btrfs_release_path(root, path); 1176 btrfs_release_path(path);
1106 return ret; 1177 return ret;
1107} 1178}
1108 1179
@@ -1228,7 +1299,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1228 ret = -ENOENT; 1299 ret = -ENOENT;
1229#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1300#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1230 if (ret == -ENOENT && parent) { 1301 if (ret == -ENOENT && parent) {
1231 btrfs_release_path(root, path); 1302 btrfs_release_path(path);
1232 key.type = BTRFS_EXTENT_REF_V0_KEY; 1303 key.type = BTRFS_EXTENT_REF_V0_KEY;
1233 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1304 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1234 if (ret > 0) 1305 if (ret > 0)
@@ -1257,7 +1328,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1257 } 1328 }
1258 1329
1259 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1330 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1260 btrfs_release_path(root, path); 1331 btrfs_release_path(path);
1261 return ret; 1332 return ret;
1262} 1333}
1263 1334
@@ -1490,7 +1561,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1490 size = btrfs_extent_inline_ref_size(type); 1561 size = btrfs_extent_inline_ref_size(type);
1491 1562
1492 ret = btrfs_extend_item(trans, root, path, size); 1563 ret = btrfs_extend_item(trans, root, path, size);
1493 BUG_ON(ret);
1494 1564
1495 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1565 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1496 refs = btrfs_extent_refs(leaf, ei); 1566 refs = btrfs_extent_refs(leaf, ei);
@@ -1543,7 +1613,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1543 if (ret != -ENOENT) 1613 if (ret != -ENOENT)
1544 return ret; 1614 return ret;
1545 1615
1546 btrfs_release_path(root, path); 1616 btrfs_release_path(path);
1547 *ref_ret = NULL; 1617 *ref_ret = NULL;
1548 1618
1549 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1619 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
@@ -1619,7 +1689,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1619 end - ptr - size); 1689 end - ptr - size);
1620 item_size -= size; 1690 item_size -= size;
1621 ret = btrfs_truncate_item(trans, root, path, item_size, 1); 1691 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1622 BUG_ON(ret);
1623 } 1692 }
1624 btrfs_mark_buffer_dirty(leaf); 1693 btrfs_mark_buffer_dirty(leaf);
1625 return 0; 1694 return 0;
@@ -1692,40 +1761,45 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1692 return ret; 1761 return ret;
1693} 1762}
1694 1763
1695static void btrfs_issue_discard(struct block_device *bdev, 1764static int btrfs_issue_discard(struct block_device *bdev,
1696 u64 start, u64 len) 1765 u64 start, u64 len)
1697{ 1766{
1698 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1767 return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1699 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1700} 1768}
1701 1769
1702static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1770static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1703 u64 num_bytes) 1771 u64 num_bytes, u64 *actual_bytes)
1704{ 1772{
1705 int ret; 1773 int ret;
1706 u64 map_length = num_bytes; 1774 u64 discarded_bytes = 0;
1707 struct btrfs_multi_bio *multi = NULL; 1775 struct btrfs_multi_bio *multi = NULL;
1708 1776
1709 if (!btrfs_test_opt(root, DISCARD))
1710 return 0;
1711 1777
1712 /* Tell the block device(s) that the sectors can be discarded */ 1778 /* Tell the block device(s) that the sectors can be discarded */
1713 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, 1779 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1714 bytenr, &map_length, &multi, 0); 1780 bytenr, &num_bytes, &multi, 0);
1715 if (!ret) { 1781 if (!ret) {
1716 struct btrfs_bio_stripe *stripe = multi->stripes; 1782 struct btrfs_bio_stripe *stripe = multi->stripes;
1717 int i; 1783 int i;
1718 1784
1719 if (map_length > num_bytes)
1720 map_length = num_bytes;
1721 1785
1722 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1786 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1723 btrfs_issue_discard(stripe->dev->bdev, 1787 ret = btrfs_issue_discard(stripe->dev->bdev,
1724 stripe->physical, 1788 stripe->physical,
1725 map_length); 1789 stripe->length);
1790 if (!ret)
1791 discarded_bytes += stripe->length;
1792 else if (ret != -EOPNOTSUPP)
1793 break;
1726 } 1794 }
1727 kfree(multi); 1795 kfree(multi);
1728 } 1796 }
1797 if (discarded_bytes && ret == -EOPNOTSUPP)
1798 ret = 0;
1799
1800 if (actual_bytes)
1801 *actual_bytes = discarded_bytes;
1802
1729 1803
1730 return ret; 1804 return ret;
1731} 1805}
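The reworked discard path accumulates per-stripe byte counts and treats -EOPNOTSUPP as fatal only when nothing was discarded at all. The accounting logic in isolation (stand-in types; the real code walks btrfs_bio_stripe entries):

    #include <errno.h>
    #include <stdint.h>

    struct stripe { uint64_t length; int supports_discard; };

    static int discard_stripes(struct stripe *s, int n, uint64_t *actual)
    {
            uint64_t discarded = 0;
            int ret = 0;

            for (int i = 0; i < n; i++) {
                    ret = s[i].supports_discard ? 0 : -EOPNOTSUPP;
                    if (!ret)
                            discarded += s[i].length;
                    else if (ret != -EOPNOTSUPP)
                            break;          /* a real error stops the walk */
            }
            if (discarded && ret == -EOPNOTSUPP)
                    ret = 0;                /* partial success is success */
            if (actual)
                    *actual = discarded;
            return ret;
    }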
@@ -1792,7 +1866,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1792 __run_delayed_extent_op(extent_op, leaf, item); 1866 __run_delayed_extent_op(extent_op, leaf, item);
1793 1867
1794 btrfs_mark_buffer_dirty(leaf); 1868 btrfs_mark_buffer_dirty(leaf);
1795 btrfs_release_path(root->fs_info->extent_root, path); 1869 btrfs_release_path(path);
1796 1870
1797 path->reada = 1; 1871 path->reada = 1;
1798 path->leave_spinning = 1; 1872 path->leave_spinning = 1;
@@ -2227,6 +2301,10 @@ again:
2227 atomic_inc(&ref->refs); 2301 atomic_inc(&ref->refs);
2228 2302
2229 spin_unlock(&delayed_refs->lock); 2303 spin_unlock(&delayed_refs->lock);
2304 /*
2305 * Mutex was contended, block until it's
2306 * released and try again
2307 */
2230 mutex_lock(&head->mutex); 2308 mutex_lock(&head->mutex);
2231 mutex_unlock(&head->mutex); 2309 mutex_unlock(&head->mutex);
2232 2310
@@ -2291,8 +2369,12 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2291 atomic_inc(&head->node.refs); 2369 atomic_inc(&head->node.refs);
2292 spin_unlock(&delayed_refs->lock); 2370 spin_unlock(&delayed_refs->lock);
2293 2371
2294 btrfs_release_path(root->fs_info->extent_root, path); 2372 btrfs_release_path(path);
2295 2373
2374 /*
2375 * Mutex was contended, block until it's released and let
2376 * caller try again
2377 */
2296 mutex_lock(&head->mutex); 2378 mutex_lock(&head->mutex);
2297 mutex_unlock(&head->mutex); 2379 mutex_unlock(&head->mutex);
2298 btrfs_put_delayed_ref(&head->node); 2380 btrfs_put_delayed_ref(&head->node);
@@ -2440,126 +2522,6 @@ out:
2440 return ret; 2522 return ret;
2441} 2523}
2442 2524
2443#if 0
2444int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2445 struct extent_buffer *buf, u32 nr_extents)
2446{
2447 struct btrfs_key key;
2448 struct btrfs_file_extent_item *fi;
2449 u64 root_gen;
2450 u32 nritems;
2451 int i;
2452 int level;
2453 int ret = 0;
2454 int shared = 0;
2455
2456 if (!root->ref_cows)
2457 return 0;
2458
2459 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2460 shared = 0;
2461 root_gen = root->root_key.offset;
2462 } else {
2463 shared = 1;
2464 root_gen = trans->transid - 1;
2465 }
2466
2467 level = btrfs_header_level(buf);
2468 nritems = btrfs_header_nritems(buf);
2469
2470 if (level == 0) {
2471 struct btrfs_leaf_ref *ref;
2472 struct btrfs_extent_info *info;
2473
2474 ref = btrfs_alloc_leaf_ref(root, nr_extents);
2475 if (!ref) {
2476 ret = -ENOMEM;
2477 goto out;
2478 }
2479
2480 ref->root_gen = root_gen;
2481 ref->bytenr = buf->start;
2482 ref->owner = btrfs_header_owner(buf);
2483 ref->generation = btrfs_header_generation(buf);
2484 ref->nritems = nr_extents;
2485 info = ref->extents;
2486
2487 for (i = 0; nr_extents > 0 && i < nritems; i++) {
2488 u64 disk_bytenr;
2489 btrfs_item_key_to_cpu(buf, &key, i);
2490 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2491 continue;
2492 fi = btrfs_item_ptr(buf, i,
2493 struct btrfs_file_extent_item);
2494 if (btrfs_file_extent_type(buf, fi) ==
2495 BTRFS_FILE_EXTENT_INLINE)
2496 continue;
2497 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2498 if (disk_bytenr == 0)
2499 continue;
2500
2501 info->bytenr = disk_bytenr;
2502 info->num_bytes =
2503 btrfs_file_extent_disk_num_bytes(buf, fi);
2504 info->objectid = key.objectid;
2505 info->offset = key.offset;
2506 info++;
2507 }
2508
2509 ret = btrfs_add_leaf_ref(root, ref, shared);
2510 if (ret == -EEXIST && shared) {
2511 struct btrfs_leaf_ref *old;
2512 old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2513 BUG_ON(!old);
2514 btrfs_remove_leaf_ref(root, old);
2515 btrfs_free_leaf_ref(root, old);
2516 ret = btrfs_add_leaf_ref(root, ref, shared);
2517 }
2518 WARN_ON(ret);
2519 btrfs_free_leaf_ref(root, ref);
2520 }
2521out:
2522 return ret;
2523}
2524
2525/* when a block goes through cow, we update the reference counts of
2526 * everything that block points to. The internal pointers of the block
2527 * can be in just about any order, and it is likely to have clusters of
2528 * things that are close together and clusters of things that are not.
2529 *
2530 * To help reduce the seeks that come with updating all of these reference
2531 * counts, sort them by byte number before actual updates are done.
2532 *
2533 * struct refsort is used to match byte number to slot in the btree block.
2534 * we sort based on the byte number and then use the slot to actually
2535 * find the item.
2536 *
2537 * struct refsort is smaller than struct btrfs_item and smaller than
2538 * struct btrfs_key_ptr. Since we're currently limited to the page size
2539 * for a btree block, there's no way for a kmalloc of refsorts for a
2540 * single node to be bigger than a page.
2541 */
2542struct refsort {
2543 u64 bytenr;
2544 u32 slot;
2545};
2546
2547/*
2548 * for passing into sort()
2549 */
2550static int refsort_cmp(const void *a_void, const void *b_void)
2551{
2552 const struct refsort *a = a_void;
2553 const struct refsort *b = b_void;
2554
2555 if (a->bytenr < b->bytenr)
2556 return -1;
2557 if (a->bytenr > b->bytenr)
2558 return 1;
2559 return 0;
2560}
2561#endif
2562
2563static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2525static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2564 struct btrfs_root *root, 2526 struct btrfs_root *root,
2565 struct extent_buffer *buf, 2527 struct extent_buffer *buf,
@@ -2662,7 +2624,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
2662 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 2624 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2663 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 2625 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2664 btrfs_mark_buffer_dirty(leaf); 2626 btrfs_mark_buffer_dirty(leaf);
2665 btrfs_release_path(extent_root, path); 2627 btrfs_release_path(path);
2666fail: 2628fail:
2667 if (ret) 2629 if (ret)
2668 return ret; 2630 return ret;
@@ -2688,6 +2650,111 @@ next_block_group(struct btrfs_root *root,
2688 return cache; 2650 return cache;
2689} 2651}
2690 2652
2653static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2654 struct btrfs_trans_handle *trans,
2655 struct btrfs_path *path)
2656{
2657 struct btrfs_root *root = block_group->fs_info->tree_root;
2658 struct inode *inode = NULL;
2659 u64 alloc_hint = 0;
2660 int dcs = BTRFS_DC_ERROR;
2661 int num_pages = 0;
2662 int retries = 0;
2663 int ret = 0;
2664
2665 /*
2666 * If this block group is smaller than 100 megs, don't bother caching the
2667 * block group.
2668 */
2669 if (block_group->key.offset < (100 * 1024 * 1024)) {
2670 spin_lock(&block_group->lock);
2671 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2672 spin_unlock(&block_group->lock);
2673 return 0;
2674 }
2675
2676again:
2677 inode = lookup_free_space_inode(root, block_group, path);
2678 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2679 ret = PTR_ERR(inode);
2680 btrfs_release_path(path);
2681 goto out;
2682 }
2683
2684 if (IS_ERR(inode)) {
2685 BUG_ON(retries);
2686 retries++;
2687
2688 if (block_group->ro)
2689 goto out_free;
2690
2691 ret = create_free_space_inode(root, trans, block_group, path);
2692 if (ret)
2693 goto out_free;
2694 goto again;
2695 }
2696
2697 /*
2698 * We want to set the generation to 0 so that if anything goes wrong
2699 * from here on out, we know not to trust this cache when we load up next
2700 * time.
2701 */
2702 BTRFS_I(inode)->generation = 0;
2703 ret = btrfs_update_inode(trans, root, inode);
2704 WARN_ON(ret);
2705
2706 if (i_size_read(inode) > 0) {
2707 ret = btrfs_truncate_free_space_cache(root, trans, path,
2708 inode);
2709 if (ret)
2710 goto out_put;
2711 }
2712
2713 spin_lock(&block_group->lock);
2714 if (block_group->cached != BTRFS_CACHE_FINISHED) {
2715 /* We're not cached, don't bother trying to write stuff out */
2716 dcs = BTRFS_DC_WRITTEN;
2717 spin_unlock(&block_group->lock);
2718 goto out_put;
2719 }
2720 spin_unlock(&block_group->lock);
2721
2722 num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2723 if (!num_pages)
2724 num_pages = 1;
2725
2726 /*
2727 * Just to make absolutely sure we have enough space, we're going to
2728 * preallocate 16 pages worth of space for each block group. In
2729 * practice we ought to use at most 8, but we need extra space so we can
2730 * add our header and have a terminator between the extents and the
2731 * bitmaps.
2732 */
2733 num_pages *= 16;
2734 num_pages *= PAGE_CACHE_SIZE;
2735
2736 ret = btrfs_check_data_free_space(inode, num_pages);
2737 if (ret)
2738 goto out_put;
2739
2740 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2741 num_pages, num_pages,
2742 &alloc_hint);
2743 if (!ret)
2744 dcs = BTRFS_DC_SETUP;
2745 btrfs_free_reserved_data_space(inode, num_pages);
2746out_put:
2747 iput(inode);
2748out_free:
2749 btrfs_release_path(path);
2750out:
2751 spin_lock(&block_group->lock);
2752 block_group->disk_cache_state = dcs;
2753 spin_unlock(&block_group->lock);
2754
2755 return ret;
2756}
2757
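
cache_save_setup() sizes the free-space cache file at 16 pages per gigabyte of block group, with a one-gigabyte floor, and preallocates that much data space up front. A sketch of just the sizing arithmetic, assuming 4 KiB pages:

/*
 * Cache file size per block group, mirroring cache_save_setup():
 * 16 pages per GiB of block group, at least 16 pages total.
 * PAGE_CACHE_SIZE is assumed to be 4096 here.
 */
static u64 cache_file_bytes(u64 block_group_bytes)
{
	u64 num_pages = div64_u64(block_group_bytes, 1024 * 1024 * 1024);

	if (!num_pages)
		num_pages = 1;
	return num_pages * 16 * 4096;
}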
2691int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2758int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2692 struct btrfs_root *root) 2759 struct btrfs_root *root)
2693{ 2760{
@@ -2700,6 +2767,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2700 if (!path) 2767 if (!path)
2701 return -ENOMEM; 2768 return -ENOMEM;
2702 2769
2770again:
2771 while (1) {
2772 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2773 while (cache) {
2774 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2775 break;
2776 cache = next_block_group(root, cache);
2777 }
2778 if (!cache) {
2779 if (last == 0)
2780 break;
2781 last = 0;
2782 continue;
2783 }
2784 err = cache_save_setup(cache, trans, path);
2785 last = cache->key.objectid + cache->key.offset;
2786 btrfs_put_block_group(cache);
2787 }
2788
2703 while (1) { 2789 while (1) {
2704 if (last == 0) { 2790 if (last == 0) {
2705 err = btrfs_run_delayed_refs(trans, root, 2791 err = btrfs_run_delayed_refs(trans, root,
@@ -2709,6 +2795,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2709 2795
2710 cache = btrfs_lookup_first_block_group(root->fs_info, last); 2796 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2711 while (cache) { 2797 while (cache) {
2798 if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2799 btrfs_put_block_group(cache);
2800 goto again;
2801 }
2802
2712 if (cache->dirty) 2803 if (cache->dirty)
2713 break; 2804 break;
2714 cache = next_block_group(root, cache); 2805 cache = next_block_group(root, cache);
@@ -2720,6 +2811,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2720 continue; 2811 continue;
2721 } 2812 }
2722 2813
2814 if (cache->disk_cache_state == BTRFS_DC_SETUP)
2815 cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
2723 cache->dirty = 0; 2816 cache->dirty = 0;
2724 last = cache->key.objectid + cache->key.offset; 2817 last = cache->key.objectid + cache->key.offset;
2725 2818
@@ -2728,6 +2821,52 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2728 btrfs_put_block_group(cache); 2821 btrfs_put_block_group(cache);
2729 } 2822 }
2730 2823
2824 while (1) {
2825 /*
2826 * I don't think this is needed since we're just marking our
2827 * preallocated extent as written, but just in case, it can't
2828 * hurt.
2829 */
2830 if (last == 0) {
2831 err = btrfs_run_delayed_refs(trans, root,
2832 (unsigned long)-1);
2833 BUG_ON(err);
2834 }
2835
2836 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2837 while (cache) {
2838 /*
2839 * Really this shouldn't happen, but it could if we
2840 * couldn't write the entire preallocated extent and
2841 * splitting the extent resulted in a new block.
2842 */
2843 if (cache->dirty) {
2844 btrfs_put_block_group(cache);
2845 goto again;
2846 }
2847 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2848 break;
2849 cache = next_block_group(root, cache);
2850 }
2851 if (!cache) {
2852 if (last == 0)
2853 break;
2854 last = 0;
2855 continue;
2856 }
2857
2858 btrfs_write_out_cache(root, trans, cache, path);
2859
2860 /*
2861 * If we didn't have an error then the cache state is still
2862 * NEED_WRITE, so we can set it to WRITTEN.
2863 */
2864 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2865 cache->disk_cache_state = BTRFS_DC_WRITTEN;
2866 last = cache->key.objectid + cache->key.offset;
2867 btrfs_put_block_group(cache);
2868 }
2869
2731 btrfs_free_path(path); 2870 btrfs_free_path(path);
2732 return 0; 2871 return 0;
2733} 2872}
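
btrfs_write_dirty_block_groups() now makes three passes over the block groups: set up a cache inode for every group still in BTRFS_DC_CLEAR, write the dirty block group items (restarting from scratch if a group fell back to DC_CLEAR in the meantime), and finally write out the caches left in DC_NEED_WRITE. All three passes share the same restart-scan shape, sketched below; wants_work() and do_work() are placeholders for the per-pass logic.

/*
 * Scan shape shared by the three passes above: walk block groups by
 * key, wrap around once from zero, and restart the whole scan when
 * state changes underneath us.
 */
static int wants_work(struct btrfs_block_group_cache *cache);
static int do_work(struct btrfs_block_group_cache *cache);

static void scan_block_groups(struct btrfs_root *root)
{
	struct btrfs_block_group_cache *cache;
	u64 last = 0;

again:
	while (1) {
		cache = btrfs_lookup_first_block_group(root->fs_info, last);
		while (cache && !wants_work(cache))
			cache = next_block_group(root, cache);
		if (!cache) {
			if (last == 0)
				break;		/* wrapped: done */
			last = 0;		/* wrap to the start */
			continue;
		}
		if (do_work(cache)) {
			btrfs_put_block_group(cache);
			goto again;		/* state changed: rescan */
		}
		last = cache->key.objectid + cache->key.offset;
		btrfs_put_block_group(cache);
	}
}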
@@ -2763,6 +2902,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2763 if (found) { 2902 if (found) {
2764 spin_lock(&found->lock); 2903 spin_lock(&found->lock);
2765 found->total_bytes += total_bytes; 2904 found->total_bytes += total_bytes;
2905 found->disk_total += total_bytes * factor;
2766 found->bytes_used += bytes_used; 2906 found->bytes_used += bytes_used;
2767 found->disk_used += bytes_used * factor; 2907 found->disk_used += bytes_used * factor;
2768 found->full = 0; 2908 found->full = 0;
@@ -2782,6 +2922,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2782 BTRFS_BLOCK_GROUP_SYSTEM | 2922 BTRFS_BLOCK_GROUP_SYSTEM |
2783 BTRFS_BLOCK_GROUP_METADATA); 2923 BTRFS_BLOCK_GROUP_METADATA);
2784 found->total_bytes = total_bytes; 2924 found->total_bytes = total_bytes;
2925 found->disk_total = total_bytes * factor;
2785 found->bytes_used = bytes_used; 2926 found->bytes_used = bytes_used;
2786 found->disk_used = bytes_used * factor; 2927 found->disk_used = bytes_used * factor;
2787 found->bytes_pinned = 0; 2928 found->bytes_pinned = 0;
@@ -2789,7 +2930,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2789 found->bytes_readonly = 0; 2930 found->bytes_readonly = 0;
2790 found->bytes_may_use = 0; 2931 found->bytes_may_use = 0;
2791 found->full = 0; 2932 found->full = 0;
2792 found->force_alloc = 0; 2933 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
2934 found->chunk_alloc = 0;
2793 *space_info = found; 2935 *space_info = found;
2794 list_add_rcu(&found->list, &info->space_info); 2936 list_add_rcu(&found->list, &info->space_info);
2795 atomic_set(&found->caching_threads, 0); 2937 atomic_set(&found->caching_threads, 0);
@@ -2814,7 +2956,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2814 2956
2815u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 2957u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2816{ 2958{
2817 u64 num_devices = root->fs_info->fs_devices->rw_devices; 2959 /*
2960 * we add in the count of missing devices because we want
2961 * to make sure that any RAID levels on a degraded FS
2962 * continue to be honored.
2963 */
2964 u64 num_devices = root->fs_info->fs_devices->rw_devices +
2965 root->fs_info->fs_devices->missing_devices;
2818 2966
2819 if (num_devices == 1) 2967 if (num_devices == 1)
2820 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 2968 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
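
Counting missing devices matters for degraded mounts: a two-disk RAID1 with one disk absent still reports num_devices == 2, so new chunks keep the RAID1 profile instead of silently dropping to single. A sketch of the reduction rules applied here; the flags are the real BTRFS_BLOCK_GROUP_* values, but the helper itself is a simplification of btrfs_reduce_alloc_profile().

/*
 * Strip RAID profiles the current device count (rw + missing)
 * cannot sustain, as the function above does.
 */
static u64 reduce_profile(u64 flags, u64 num_devices)
{
	if (num_devices == 1)
		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
	if (num_devices < 4)
		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
	return flags;
}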
@@ -2854,7 +3002,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2854 return btrfs_reduce_alloc_profile(root, flags); 3002 return btrfs_reduce_alloc_profile(root, flags);
2855} 3003}
2856 3004
2857static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3005u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2858{ 3006{
2859 u64 flags; 3007 u64 flags;
2860 3008
@@ -2883,11 +3031,17 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
2883 struct btrfs_space_info *data_sinfo; 3031 struct btrfs_space_info *data_sinfo;
2884 struct btrfs_root *root = BTRFS_I(inode)->root; 3032 struct btrfs_root *root = BTRFS_I(inode)->root;
2885 u64 used; 3033 u64 used;
2886 int ret = 0, committed = 0; 3034 int ret = 0, committed = 0, alloc_chunk = 1;
2887 3035
2888 /* make sure bytes are sectorsize aligned */ 3036 /* make sure bytes are sectorsize aligned */
2889 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3037 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2890 3038
3039 if (root == root->fs_info->tree_root ||
3040 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3041 alloc_chunk = 0;
3042 committed = 1;
3043 }
3044
2891 data_sinfo = BTRFS_I(inode)->space_info; 3045 data_sinfo = BTRFS_I(inode)->space_info;
2892 if (!data_sinfo) 3046 if (!data_sinfo)
2893 goto alloc; 3047 goto alloc;
@@ -2906,23 +3060,28 @@ again:
2906 * if we don't have enough free bytes in this space then we need 3060 * if we don't have enough free bytes in this space then we need
2907 * to alloc a new chunk. 3061 * to alloc a new chunk.
2908 */ 3062 */
2909 if (!data_sinfo->full) { 3063 if (!data_sinfo->full && alloc_chunk) {
2910 u64 alloc_target; 3064 u64 alloc_target;
2911 3065
2912 data_sinfo->force_alloc = 1; 3066 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
2913 spin_unlock(&data_sinfo->lock); 3067 spin_unlock(&data_sinfo->lock);
2914alloc: 3068alloc:
2915 alloc_target = btrfs_get_alloc_profile(root, 1); 3069 alloc_target = btrfs_get_alloc_profile(root, 1);
2916 trans = btrfs_join_transaction(root, 1); 3070 trans = btrfs_join_transaction(root);
2917 if (IS_ERR(trans)) 3071 if (IS_ERR(trans))
2918 return PTR_ERR(trans); 3072 return PTR_ERR(trans);
2919 3073
2920 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3074 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2921 bytes + 2 * 1024 * 1024, 3075 bytes + 2 * 1024 * 1024,
2922 alloc_target, 0); 3076 alloc_target,
3077 CHUNK_ALLOC_NO_FORCE);
2923 btrfs_end_transaction(trans, root); 3078 btrfs_end_transaction(trans, root);
2924 if (ret < 0) 3079 if (ret < 0) {
2925 return ret; 3080 if (ret != -ENOSPC)
3081 return ret;
3082 else
3083 goto commit_trans;
3084 }
2926 3085
2927 if (!data_sinfo) { 3086 if (!data_sinfo) {
2928 btrfs_set_inode_space_info(root, inode); 3087 btrfs_set_inode_space_info(root, inode);
@@ -2930,12 +3089,21 @@ alloc:
2930 } 3089 }
2931 goto again; 3090 goto again;
2932 } 3091 }
3092
3093 /*
3094 * If we have less pinned bytes than we want to allocate then
3095 * don't bother committing the transaction; it won't help us.
3096 */
3097 if (data_sinfo->bytes_pinned < bytes)
3098 committed = 1;
2933 spin_unlock(&data_sinfo->lock); 3099 spin_unlock(&data_sinfo->lock);
2934 3100
2935 /* commit the current transaction and try again */ 3101 /* commit the current transaction and try again */
2936 if (!committed && !root->fs_info->open_ioctl_trans) { 3102commit_trans:
3103 if (!committed &&
3104 !atomic_read(&root->fs_info->open_ioctl_trans)) {
2937 committed = 1; 3105 committed = 1;
2938 trans = btrfs_join_transaction(root, 1); 3106 trans = btrfs_join_transaction(root);
2939 if (IS_ERR(trans)) 3107 if (IS_ERR(trans))
2940 return PTR_ERR(trans); 3108 return PTR_ERR(trans);
2941 ret = btrfs_commit_transaction(trans, root); 3109 ret = btrfs_commit_transaction(trans, root);
@@ -2944,18 +3112,6 @@ alloc:
2944 goto again; 3112 goto again;
2945 } 3113 }
2946 3114
2947#if 0 /* I hope we never need this code again, just in case */
2948 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
2949 "%llu bytes_reserved, " "%llu bytes_pinned, "
2950 "%llu bytes_readonly, %llu may use %llu total\n",
2951 (unsigned long long)bytes,
2952 (unsigned long long)data_sinfo->bytes_used,
2953 (unsigned long long)data_sinfo->bytes_reserved,
2954 (unsigned long long)data_sinfo->bytes_pinned,
2955 (unsigned long long)data_sinfo->bytes_readonly,
2956 (unsigned long long)data_sinfo->bytes_may_use,
2957 (unsigned long long)data_sinfo->total_bytes);
2958#endif
2959 return -ENOSPC; 3115 return -ENOSPC;
2960 } 3116 }
2961 data_sinfo->bytes_may_use += bytes; 3117 data_sinfo->bytes_may_use += bytes;
@@ -2993,24 +3149,56 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
2993 rcu_read_lock(); 3149 rcu_read_lock();
2994 list_for_each_entry_rcu(found, head, list) { 3150 list_for_each_entry_rcu(found, head, list) {
2995 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3151 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
2996 found->force_alloc = 1; 3152 found->force_alloc = CHUNK_ALLOC_FORCE;
2997 } 3153 }
2998 rcu_read_unlock(); 3154 rcu_read_unlock();
2999} 3155}
3000 3156
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo, 3157static int should_alloc_chunk(struct btrfs_root *root,
3002 u64 alloc_bytes) 3158 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3159 int force)
3003{ 3160{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3161 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3162 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3163 u64 thresh;
3164
3165 if (force == CHUNK_ALLOC_FORCE)
3166 return 1;
3005 3167
3006 if (sinfo->bytes_used + sinfo->bytes_reserved + 3168 /*
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes) 3169 * in limited mode, we want to have some free space up to
3170 * about 1% of the FS size.
3171 */
3172 if (force == CHUNK_ALLOC_LIMITED) {
3173 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3174 thresh = max_t(u64, 64 * 1024 * 1024,
3175 div_factor_fine(thresh, 1));
3176
3177 if (num_bytes - num_allocated < thresh)
3178 return 1;
3179 }
3180
3181 /*
3182 * we have two similar checks here, one based on percentage
3183 * and once based on a hard number of 256MB. The idea
3184 * is that if we have a good amount of free
3185 * room, don't allocate a chunk. A good mount is
3186 * less than 80% utilized of the chunks we have allocated,
3187 * or more than 256MB free
3188 */
3189 if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0; 3190 return 0;
3009 3191
3010 if (sinfo->bytes_used + sinfo->bytes_reserved + 3192 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0; 3193 return 0;
3013 3194
3195 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3196
3197 /* 256MB or 5% of the FS */
3198 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3199
3200 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3201 return 0;
3014 return 1; 3202 return 1;
3015} 3203}
3016 3204
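
The thresholds above lean on two btrfs helpers: div_factor(n, f) computes n * f / 10 and div_factor_fine(n, f) computes n * f / 100. So CHUNK_ALLOC_LIMITED allocates while free room in existing chunks is below max(64MB, 1% of the FS), and the general path refuses a new chunk while there is 256MB spare or utilization is under 80%, and also skips allocation on space larger than max(256MB, 5% of the FS) that is still under 30% used. A sketch of the assumed helper semantics:

/*
 * Assumed semantics of the helpers used in should_alloc_chunk():
 * tenths and hundredths of a u64, using do_div() so this also works
 * on 32-bit without a native 64-bit divide.
 */
static inline u64 div_factor(u64 num, int factor)	/* num * f / 10 */
{
	num *= factor;
	do_div(num, 10);
	return num;
}

static inline u64 div_factor_fine(u64 num, int factor)	/* num * f / 100 */
{
	num *= factor;
	do_div(num, 100);
	return num;
}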
@@ -3020,10 +3208,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3020{ 3208{
3021 struct btrfs_space_info *space_info; 3209 struct btrfs_space_info *space_info;
3022 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3210 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3211 int wait_for_alloc = 0;
3023 int ret = 0; 3212 int ret = 0;
3024 3213
3025 mutex_lock(&fs_info->chunk_mutex);
3026
3027 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3214 flags = btrfs_reduce_alloc_profile(extent_root, flags);
3028 3215
3029 space_info = __find_space_info(extent_root->fs_info, flags); 3216 space_info = __find_space_info(extent_root->fs_info, flags);
@@ -3034,20 +3221,47 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3034 } 3221 }
3035 BUG_ON(!space_info); 3222 BUG_ON(!space_info);
3036 3223
3224again:
3037 spin_lock(&space_info->lock); 3225 spin_lock(&space_info->lock);
3038 if (space_info->force_alloc) 3226 if (space_info->force_alloc)
3039 force = 1; 3227 force = space_info->force_alloc;
3040 if (space_info->full) { 3228 if (space_info->full) {
3041 spin_unlock(&space_info->lock); 3229 spin_unlock(&space_info->lock);
3042 goto out; 3230 return 0;
3043 } 3231 }
3044 3232
3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) { 3233 if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
3046 spin_unlock(&space_info->lock); 3234 spin_unlock(&space_info->lock);
3047 goto out; 3235 return 0;
3236 } else if (space_info->chunk_alloc) {
3237 wait_for_alloc = 1;
3238 } else {
3239 space_info->chunk_alloc = 1;
3048 } 3240 }
3241
3049 spin_unlock(&space_info->lock); 3242 spin_unlock(&space_info->lock);
3050 3243
3244 mutex_lock(&fs_info->chunk_mutex);
3245
3246 /*
3247 * The chunk_mutex is held throughout the entirety of a chunk
3248 * allocation, so once we've acquired the chunk_mutex we know that the
3249 * other guy is done and we need to recheck and see if we should
3250 * allocate.
3251 */
3252 if (wait_for_alloc) {
3253 mutex_unlock(&fs_info->chunk_mutex);
3254 wait_for_alloc = 0;
3255 goto again;
3256 }
3257
3258 /*
3259 * If we have mixed data/metadata chunks we want to make sure we keep
3260 * allocating mixed chunks instead of individual chunks.
3261 */
3262 if (btrfs_mixed_space_info(space_info))
3263 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3264
3051 /* 3265 /*
3052 * if we're doing a data chunk, go ahead and make sure that 3266 * if we're doing a data chunk, go ahead and make sure that
3053 * we keep a reasonable number of metadata chunks allocated in the 3267 * we keep a reasonable number of metadata chunks allocated in the
@@ -3066,167 +3280,220 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3066 space_info->full = 1; 3280 space_info->full = 1;
3067 else 3281 else
3068 ret = 1; 3282 ret = 1;
3069 space_info->force_alloc = 0; 3283
3284 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3285 space_info->chunk_alloc = 0;
3070 spin_unlock(&space_info->lock); 3286 spin_unlock(&space_info->lock);
3071out:
3072 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3287 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3073 return ret; 3288 return ret;
3074} 3289}
3075 3290
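
The new chunk_alloc flag and wait_for_alloc handshake serialize concurrent allocators without holding chunk_mutex across the should-allocate check: the first caller claims the flag under space_info->lock, while latecomers take and immediately release chunk_mutex (which the winner holds for the entire allocation) and then loop back to re-evaluate, since the chunk they wanted may already exist. A condensed sketch of the protocol; should_alloc() abbreviates the force/threshold checks and error handling is simplified.

/*
 * Condensed from do_chunk_alloc() above: one allocator at a time,
 * waiters block on chunk_mutex and re-check rather than allocating
 * a duplicate chunk.
 */
static int should_alloc(struct btrfs_space_info *info);

static int alloc_chunk_once(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *info,
			    struct btrfs_trans_handle *trans, u64 flags)
{
	int wait_for_alloc;
	int ret;

again:
	spin_lock(&info->lock);
	if (info->full || !should_alloc(info)) {
		spin_unlock(&info->lock);
		return 0;
	}
	wait_for_alloc = info->chunk_alloc;	/* allocation in flight? */
	if (!wait_for_alloc)
		info->chunk_alloc = 1;		/* claim the slot */
	spin_unlock(&info->lock);

	mutex_lock(&fs_info->chunk_mutex);
	if (wait_for_alloc) {
		/* the winner is done; what we need may now exist */
		mutex_unlock(&fs_info->chunk_mutex);
		goto again;
	}
	ret = btrfs_alloc_chunk(trans, fs_info->extent_root, flags);

	spin_lock(&info->lock);
	if (ret)
		info->full = 1;
	info->chunk_alloc = 0;			/* release the slot */
	spin_unlock(&info->lock);

	mutex_unlock(&fs_info->chunk_mutex);
	return ret;
}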
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/* 3291/*
3109 * shrink metadata reservation for delalloc 3292 * shrink metadata reservation for delalloc
3110 */ 3293 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans, 3294static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim) 3295 struct btrfs_root *root, u64 to_reclaim, int sync)
3113{ 3296{
3114 struct btrfs_block_rsv *block_rsv; 3297 struct btrfs_block_rsv *block_rsv;
3298 struct btrfs_space_info *space_info;
3115 u64 reserved; 3299 u64 reserved;
3116 u64 max_reclaim; 3300 u64 max_reclaim;
3117 u64 reclaimed = 0; 3301 u64 reclaimed = 0;
3118 int pause = 1; 3302 long time_left;
3119 int ret; 3303 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3304 int loops = 0;
3305 unsigned long progress;
3120 3306
3121 block_rsv = &root->fs_info->delalloc_block_rsv; 3307 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock); 3308 space_info = block_rsv->space_info;
3123 reserved = block_rsv->reserved; 3309
3124 spin_unlock(&block_rsv->lock); 3310 smp_mb();
3311 reserved = space_info->bytes_reserved;
3312 progress = space_info->reservation_progress;
3125 3313
3126 if (reserved == 0) 3314 if (reserved == 0)
3127 return 0; 3315 return 0;
3128 3316
3129 max_reclaim = min(reserved, to_reclaim); 3317 max_reclaim = min(reserved, to_reclaim);
3130 3318
3131 while (1) { 3319 while (loops < 1024) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); 3320 /* have the flusher threads jump in and do some IO */
3133 if (!ret) { 3321 smp_mb();
3134 __set_current_state(TASK_INTERRUPTIBLE); 3322 nr_pages = min_t(unsigned long, nr_pages,
3135 schedule_timeout(pause); 3323 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3136 pause <<= 1; 3324 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142 3325
3143 spin_lock(&block_rsv->lock); 3326 spin_lock(&space_info->lock);
3144 if (reserved > block_rsv->reserved) 3327 if (reserved > space_info->bytes_reserved)
3145 reclaimed = reserved - block_rsv->reserved; 3328 reclaimed += reserved - space_info->bytes_reserved;
3146 reserved = block_rsv->reserved; 3329 reserved = space_info->bytes_reserved;
3147 spin_unlock(&block_rsv->lock); 3330 spin_unlock(&space_info->lock);
3331
3332 loops++;
3148 3333
3149 if (reserved == 0 || reclaimed >= max_reclaim) 3334 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break; 3335 break;
3151 3336
3152 if (trans && trans->transaction->blocked) 3337 if (trans && trans->transaction->blocked)
3153 return -EAGAIN; 3338 return -EAGAIN;
3339
3340 time_left = schedule_timeout_interruptible(1);
3341
3342 /* We were interrupted, exit */
3343 if (time_left)
3344 break;
3345
3346 /* we've kicked the IO a few times; if anything has been freed,
3347 * exit. There is no sense in looping here for a long time
3348 * when we really need to commit the transaction, or there are
3349 * just too many writers without enough free space
3350 */
3351
3352 if (loops > 3) {
3353 smp_mb();
3354 if (progress != space_info->reservation_progress)
3355 break;
3356 }
3357
3154 } 3358 }
3155 return reclaimed >= to_reclaim; 3359 return reclaimed >= to_reclaim;
3156} 3360}
3157 3361
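
The rewritten shrink_delalloc() no longer pushes one delalloc inode at a time. Each pass asks the idle flusher threads to write back up to 2MB worth of pages (writeback_inodes_sb_nr_if_idle), sleeps for a jiffy, and measures progress through space_info->bytes_reserved plus the new reservation_progress counter, bailing out after a few fruitless passes instead of spinning for seconds. A sketch of the poll-or-bail loop; sample_reserved() abbreviates the spin-locked reads of bytes_reserved.

/*
 * Shape of the reclaim loop above: kick background writeback, nap
 * for a jiffy, stop once enough was reclaimed, a signal arrived, or
 * several passes made no progress at all.
 */
static u64 sample_reserved(struct btrfs_space_info *space_info);

static int reclaim_loop(struct super_block *sb,
			struct btrfs_space_info *space_info,
			u64 max_reclaim, unsigned long nr_pages)
{
	unsigned long progress = space_info->reservation_progress;
	u64 reclaimed = 0;
	int loops;

	for (loops = 0; loops < 1024; loops++) {
		writeback_inodes_sb_nr_if_idle(sb, nr_pages);

		reclaimed += sample_reserved(space_info);
		if (reclaimed >= max_reclaim)
			break;

		if (schedule_timeout_interruptible(1))
			break;		/* interrupted by a signal */

		/* no one has freed anything in a few passes: give up */
		if (loops > 3 &&
		    progress == space_info->reservation_progress)
			break;
	}
	return reclaimed >= max_reclaim;
}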
3158static int should_retry_reserve(struct btrfs_trans_handle *trans, 3362/*
3159 struct btrfs_root *root, 3363 * Retries tells us how many times we've called reserve_metadata_bytes. The
3160 struct btrfs_block_rsv *block_rsv, 3364 * idea is that if this is the first call (retries == 0) then we will add to our
3161 u64 num_bytes, int *retries) 3365 * reserved count if we can't make the allocation in order to hold our place
3366 * while we go and try and free up space. That way for retries > 1 we don't try
3367 * and add space; we just check to see if the amount of unused space is >= the
3368 * total space, meaning that our reservation is valid.
3369 *
3370 * However if we don't intend to retry this reservation, pass -1 as retries so
3371 * that it short circuits this logic.
3372 */
3373static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
3374 struct btrfs_root *root,
3375 struct btrfs_block_rsv *block_rsv,
3376 u64 orig_bytes, int flush)
3162{ 3377{
3163 struct btrfs_space_info *space_info = block_rsv->space_info; 3378 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret; 3379 u64 unused;
3380 u64 num_bytes = orig_bytes;
3381 int retries = 0;
3382 int ret = 0;
3383 bool reserved = false;
3384 bool committed = false;
3165 3385
3166 if ((*retries) > 2) 3386again:
3167 return -ENOSPC; 3387 ret = -ENOSPC;
3388 if (reserved)
3389 num_bytes = 0;
3168 3390
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); 3391 spin_lock(&space_info->lock);
3170 if (ret) 3392 unused = space_info->bytes_used + space_info->bytes_reserved +
3171 return 1; 3393 space_info->bytes_pinned + space_info->bytes_readonly +
3394 space_info->bytes_may_use;
3172 3395
3173 if (trans && trans->transaction->in_commit) 3396 /*
3174 return -ENOSPC; 3397 * The idea here is that if we've not already over-reserved the block group
3398 * then we can go ahead and save our reservation first and then start
3399 * flushing if we need to. Otherwise if we've already overcommitted,
3400 * let's start flushing stuff first and then come back and try to make
3401 * our reservation.
3402 */
3403 if (unused <= space_info->total_bytes) {
3404 unused = space_info->total_bytes - unused;
3405 if (unused >= num_bytes) {
3406 if (!reserved)
3407 space_info->bytes_reserved += orig_bytes;
3408 ret = 0;
3409 } else {
3410 /*
3411 * Ok set num_bytes to orig_bytes since we aren't
3412 * overcommitted; this way we only try and reclaim what
3413 * we need.
3414 */
3415 num_bytes = orig_bytes;
3416 }
3417 } else {
3418 /*
3419 * Ok we're over committed, set num_bytes to the overcommitted
3420 * amount plus the amount of bytes that we need for this
3421 * reservation.
3422 */
3423 num_bytes = unused - space_info->total_bytes +
3424 (orig_bytes * (retries + 1));
3425 }
3175 3426
3176 ret = shrink_delalloc(trans, root, num_bytes); 3427 /*
3177 if (ret) 3428 * Couldn't make our reservation, save our place so while we're trying
3178 return ret; 3429 * to reclaim space we can actually use it instead of somebody else
3430 * stealing it from us.
3431 */
3432 if (ret && !reserved) {
3433 space_info->bytes_reserved += orig_bytes;
3434 reserved = true;
3435 }
3179 3436
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock); 3437 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186 3438
3187 (*retries)++; 3439 if (!ret)
3188 3440 return 0;
3189 if (trans)
3190 return -EAGAIN;
3191 3441
3192 trans = btrfs_join_transaction(root, 1); 3442 if (!flush)
3193 BUG_ON(IS_ERR(trans)); 3443 goto out;
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196 3444
3197 return 1; 3445 /*
3198} 3446 * We do synchronous shrinking since we don't actually unreserve
3447 * metadata until after the IO is completed.
3448 */
3449 ret = shrink_delalloc(trans, root, num_bytes, 1);
3450 if (ret > 0)
3451 return 0;
3452 else if (ret < 0)
3453 goto out;
3199 3454
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, 3455 /*
3201 u64 num_bytes) 3456 * So if we were overcommitted it's possible that somebody else flushed
3202{ 3457 * out enough space and we simply didn't have enough space to reclaim,
3203 struct btrfs_space_info *space_info = block_rsv->space_info; 3458 * so go back around and try again.
3204 u64 unused; 3459 */
3205 int ret = -ENOSPC; 3460 if (retries < 2) {
3461 retries++;
3462 goto again;
3463 }
3206 3464
3207 spin_lock(&space_info->lock); 3465 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved + 3466 /*
3209 space_info->bytes_pinned + space_info->bytes_readonly; 3467 * Not enough space to be reclaimed; don't bother committing the
3468 * transaction.
3469 */
3470 if (space_info->bytes_pinned < orig_bytes)
3471 ret = -ENOSPC;
3472 spin_unlock(&space_info->lock);
3473 if (ret)
3474 goto out;
3210 3475
3211 if (unused < space_info->total_bytes) 3476 ret = -EAGAIN;
3212 unused = space_info->total_bytes - unused; 3477 if (trans || committed)
3213 else 3478 goto out;
3214 unused = 0;
3215 3479
3216 if (unused >= num_bytes) { 3480 ret = -ENOSPC;
3217 if (block_rsv->priority >= 10) { 3481 trans = btrfs_join_transaction(root);
3218 space_info->bytes_reserved += num_bytes; 3482 if (IS_ERR(trans))
3219 ret = 0; 3483 goto out;
3220 } else { 3484 ret = btrfs_commit_transaction(trans, root);
3221 if ((unused + block_rsv->reserved) * 3485 if (!ret) {
3222 block_rsv->priority >= 3486 trans = NULL;
3223 (num_bytes + block_rsv->reserved) * 10) { 3487 committed = true;
3224 space_info->bytes_reserved += num_bytes; 3488 goto again;
3225 ret = 0; 3489 }
3226 } 3490
3227 } 3491out:
3492 if (reserved) {
3493 spin_lock(&space_info->lock);
3494 space_info->bytes_reserved -= orig_bytes;
3495 spin_unlock(&space_info->lock);
3228 } 3496 }
3229 spin_unlock(&space_info->lock);
3230 3497
3231 return ret; 3498 return ret;
3232} 3499}
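
The consolidated reserve_metadata_bytes() follows a fixed escalation. First it tries to take the reservation outright; failing that, it still parks orig_bytes in bytes_reserved so that space it reclaims cannot be stolen by another reserver. It then flushes delalloc synchronously and retries up to twice, and commits the transaction only as a last resort, and only when bytes_pinned indicates a commit could actually free enough. A compressed sketch of the order; the helper names are placeholders and the park/unpark of orig_bytes is elided.

/*
 * Escalation order of reserve_metadata_bytes() above, compressed.
 * try_reserve(), flush_delalloc() and commit_transaction() stand in
 * for the real steps; error unwinding is omitted.
 */
static int try_reserve(struct btrfs_space_info *info, u64 bytes);
static void flush_delalloc(struct btrfs_space_info *info, u64 bytes);
static int commit_transaction(void);

static int reserve_escalate(struct btrfs_space_info *info, u64 bytes,
			    int flush)
{
	int retries;

	for (retries = 0; retries <= 2; retries++) {
		if (try_reserve(info, bytes))
			return 0;		/* fast path */
		if (!flush)
			return -ENOSPC;		/* caller said don't wait */
		flush_delalloc(info, bytes);	/* synchronous shrink */
	}

	if (info->bytes_pinned < bytes)
		return -ENOSPC;			/* a commit cannot help */

	if (commit_transaction())
		return -ENOSPC;
	return try_reserve(info, bytes) ? 0 : -ENOSPC;
}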
@@ -3273,8 +3540,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3273 spin_unlock(&block_rsv->lock); 3540 spin_unlock(&block_rsv->lock);
3274} 3541}
3275 3542
3276void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, 3543static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3277 struct btrfs_block_rsv *dest, u64 num_bytes) 3544 struct btrfs_block_rsv *dest, u64 num_bytes)
3278{ 3545{
3279 struct btrfs_space_info *space_info = block_rsv->space_info; 3546 struct btrfs_space_info *space_info = block_rsv->space_info;
3280 3547
@@ -3293,10 +3560,23 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3293 3560
3294 if (num_bytes > 0) { 3561 if (num_bytes > 0) {
3295 if (dest) { 3562 if (dest) {
3296 block_rsv_add_bytes(dest, num_bytes, 0); 3563 spin_lock(&dest->lock);
3297 } else { 3564 if (!dest->full) {
3565 u64 bytes_to_add;
3566
3567 bytes_to_add = dest->size - dest->reserved;
3568 bytes_to_add = min(num_bytes, bytes_to_add);
3569 dest->reserved += bytes_to_add;
3570 if (dest->reserved >= dest->size)
3571 dest->full = 1;
3572 num_bytes -= bytes_to_add;
3573 }
3574 spin_unlock(&dest->lock);
3575 }
3576 if (num_bytes) {
3298 spin_lock(&space_info->lock); 3577 spin_lock(&space_info->lock);
3299 space_info->bytes_reserved -= num_bytes; 3578 space_info->bytes_reserved -= num_bytes;
3579 space_info->reservation_progress++;
3300 spin_unlock(&space_info->lock); 3580 spin_unlock(&space_info->lock);
3301 } 3581 }
3302 } 3582 }
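
Released bytes are no longer dumped wholesale into the destination reservation: the destination is topped up only to its declared size, and whatever is left over goes straight back to the space_info, bumping reservation_progress so shrink_delalloc() can see it. A sketch under that reading:

/*
 * Refill dest only up to dest->size; return the remainder to the
 * pool. Mirrors the new branch in block_rsv_release_bytes() above.
 */
static void refill_block_rsv(struct btrfs_block_rsv *dest,
			     struct btrfs_space_info *space_info,
			     u64 num_bytes)
{
	spin_lock(&dest->lock);
	if (!dest->full) {
		u64 room = min(num_bytes, dest->size - dest->reserved);

		dest->reserved += room;
		if (dest->reserved >= dest->size)
			dest->full = 1;
		num_bytes -= room;
	}
	spin_unlock(&dest->lock);

	if (num_bytes) {
		spin_lock(&space_info->lock);
		space_info->bytes_reserved -= num_bytes;
		space_info->reservation_progress++;
		spin_unlock(&space_info->lock);
	}
}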
@@ -3328,18 +3608,14 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{ 3608{
3329 struct btrfs_block_rsv *block_rsv; 3609 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info; 3610 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332 3611
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 3612 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv) 3613 if (!block_rsv)
3335 return NULL; 3614 return NULL;
3336 3615
3337 btrfs_init_block_rsv(block_rsv); 3616 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info, 3617 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA); 3618 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv; 3619 return block_rsv;
3344} 3620}
3345 3621
@@ -3370,23 +3646,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3646int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root, 3647 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv, 3648 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries) 3649 u64 num_bytes)
3374{ 3650{
3375 int ret; 3651 int ret;
3376 3652
3377 if (num_bytes == 0) 3653 if (num_bytes == 0)
3378 return 0; 3654 return 0;
3379again: 3655
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes); 3656 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
3381 if (!ret) { 3657 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3658 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0; 3659 return 0;
3384 } 3660 }
3385 3661
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret; 3662 return ret;
3391} 3663}
3392 3664
@@ -3421,7 +3693,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3421 return 0; 3693 return 0;
3422 3694
3423 if (block_rsv->refill_used) { 3695 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes); 3696 ret = reserve_metadata_bytes(trans, root, block_rsv,
3697 num_bytes, 0);
3425 if (!ret) { 3698 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0); 3699 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0; 3700 return 0;
@@ -3432,17 +3705,12 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3432 if (trans) 3705 if (trans)
3433 return -EAGAIN; 3706 return -EAGAIN;
3434 3707
3435 trans = btrfs_join_transaction(root, 1); 3708 trans = btrfs_join_transaction(root);
3436 BUG_ON(IS_ERR(trans)); 3709 BUG_ON(IS_ERR(trans));
3437 ret = btrfs_commit_transaction(trans, root); 3710 ret = btrfs_commit_transaction(trans, root);
3438 return 0; 3711 return 0;
3439 } 3712 }
3440 3713
3441 WARN_ON(1);
3442 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3443 block_rsv->size, block_rsv->reserved,
3444 block_rsv->freed[0], block_rsv->freed[1]);
3445
3446 return -ENOSPC; 3714 return -ENOSPC;
3447} 3715}
3448 3716
@@ -3476,23 +3744,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3476 u64 meta_used; 3744 u64 meta_used;
3477 u64 data_used; 3745 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3746 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481 * per tree used space accounting can be inaccurate, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491 3747
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3748 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock); 3749 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used; 3750 data_used = sinfo->bytes_used;
@@ -3500,6 +3752,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3500 3752
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 3753 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock); 3754 spin_lock(&sinfo->lock);
3755 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
3756 data_used = 0;
3503 meta_used = sinfo->bytes_used; 3757 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock); 3758 spin_unlock(&sinfo->lock);
3505 3759
@@ -3527,7 +3781,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3527 block_rsv->size = num_bytes; 3781 block_rsv->size = num_bytes;
3528 3782
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 3783 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly; 3784 sinfo->bytes_reserved + sinfo->bytes_readonly +
3785 sinfo->bytes_may_use;
3531 3786
3532 if (sinfo->total_bytes > num_bytes) { 3787 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes; 3788 num_bytes = sinfo->total_bytes - num_bytes;
@@ -3538,13 +3793,11 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3538 if (block_rsv->reserved >= block_rsv->size) { 3793 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size; 3794 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes; 3795 sinfo->bytes_reserved -= num_bytes;
3796 sinfo->reservation_progress++;
3541 block_rsv->reserved = block_rsv->size; 3797 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1; 3798 block_rsv->full = 1;
3543 } 3799 }
3544#if 0 3800
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock); 3801 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock); 3802 spin_unlock(&block_rsv->lock);
3550} 3803}
@@ -3590,15 +3843,40 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3590 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 3843 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3591} 3844}
3592 3845
3593static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) 3846int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3847 struct btrfs_root *root,
3848 struct btrfs_block_rsv *rsv)
3594{ 3849{
3595 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3850 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3596 3 * num_items; 3851 u64 num_bytes;
3852 int ret;
3853
3854 /*
3855 * Truncate should be freeing data, but give us 2 items just in case it
3856 * needs to use some space. We may want to be smarter about this in the
3857 * future.
3858 */
3859 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3860
3861 /* We already have enough bytes, just return */
3862 if (rsv->reserved >= num_bytes)
3863 return 0;
3864
3865 num_bytes -= rsv->reserved;
3866
3867 /*
3868 * You should have reserved enough space beforehand to do this, so this
3869 * should not fail.
3870 */
3871 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3872 BUG_ON(ret);
3873
3874 return 0;
3597} 3875}
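
The renamed btrfs_calc_trans_metadata_size() keeps the formula visible in the removed lines above: each item is charged a worst-case COW of one leaf plus one node per remaining tree level, multiplied by three as a safety factor for the several trees a single change can touch. A sketch of the arithmetic:

/*
 * Worst-case metadata charge per item, per the formula above:
 * (leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) * 3 * num_items.
 */
static u64 calc_metadata_size(u32 leafsize, u32 nodesize, int num_items)
{
	return (u64)(leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) *
	       3 * num_items;
}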
3598 3876
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, 3877int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root, 3878 struct btrfs_root *root,
3601 int num_items, int *retries) 3879 int num_items)
3602{ 3880{
3603 u64 num_bytes; 3881 u64 num_bytes;
3604 int ret; 3882 int ret;
@@ -3606,9 +3884,9 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3606 if (num_items == 0 || root->fs_info->chunk_root == root) 3884 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0; 3885 return 0;
3608 3886
3609 num_bytes = calc_trans_metadata_size(root, num_items); 3887 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, 3888 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries); 3889 num_bytes);
3612 if (!ret) { 3890 if (!ret) {
3613 trans->bytes_reserved += num_bytes; 3891 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv; 3892 trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3636,23 +3914,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 3914 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637 3915
3638 /* 3916 /*
3639 * one for deleting orphan item, one for updating inode and 3917 * added it, so this takes the reservation, which we can release later
3640 * two for calling btrfs_truncate_inode_items. 3918 * added it, so this takes the reservation so we can release it later
3641 * 3919 * when we are truly done with the orphan item.
3642 * btrfs_truncate_inode_items is a delete operation, it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
3645 * If all of the metadata space is used, we can commit
3646 * transaction and use space it freed.
3647 */ 3920 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4); 3921 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 3922 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650} 3923}
3651 3924
3652void btrfs_orphan_release_metadata(struct inode *inode) 3925void btrfs_orphan_release_metadata(struct inode *inode)
3653{ 3926{
3654 struct btrfs_root *root = BTRFS_I(inode)->root; 3927 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4); 3928 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 3929 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657} 3930}
3658 3931
@@ -3666,7 +3939,7 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3666 * two for root back/forward refs, two for directory entries 3939 * two for root back/forward refs, two for directory entries
3667 * and one for root of the snapshot. 3940 * and one for root of the snapshot.
3668 */ 3941 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5); 3942 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info; 3943 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 3944 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672} 3945}
@@ -3682,43 +3955,37 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 3955 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve; 3956 u64 to_reserve;
3684 int nr_extents; 3957 int nr_extents;
3685 int retries = 0; 3958 int reserved_extents;
3686 int ret; 3959 int ret;
3687 3960
3688 if (btrfs_transaction_in_commit(root->fs_info)) 3961 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1); 3962 schedule_timeout(1);
3690 3963
3691 num_bytes = ALIGN(num_bytes, root->sectorsize); 3964 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again: 3965
3693 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 3966 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) { 3967 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
3696 nr_extents -= BTRFS_I(inode)->reserved_extents; 3968
3697 to_reserve = calc_trans_metadata_size(root, nr_extents); 3969 if (nr_extents > reserved_extents) {
3970 nr_extents -= reserved_extents;
3971 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
3698 } else { 3972 } else {
3699 nr_extents = 0; 3973 nr_extents = 0;
3700 to_reserve = 0; 3974 to_reserve = 0;
3701 } 3975 }
3702 3976
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes); 3977 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve); 3978 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3705 if (ret) { 3979 if (ret)
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret; 3980 return ret;
3712 }
3713 3981
3714 BTRFS_I(inode)->reserved_extents += nr_extents; 3982 atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 3983 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3717 3984
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1); 3985 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719 3986
3720 if (block_rsv->size > 512 * 1024 * 1024) 3987 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve); 3988 shrink_delalloc(NULL, root, to_reserve, 0);
3722 3989
3723 return 0; 3990 return 0;
3724} 3991}
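
btrfs_delalloc_reserve_metadata() now works without accounting_lock: it charges metadata only for the extents not already covered (outstanding + 1 minus reserved_extents) plus checksum space for the data range, then bumps the counters with atomics. A sketch of the sizing decision; per_extent_bytes and csum_bytes stand in for the results of btrfs_calc_trans_metadata_size() and calc_csum_metadata_size().

/*
 * How much to reserve for a delalloc write, per the logic above:
 * only extents beyond what is already reserved cost metadata, and
 * checksum space is always added for the data range itself.
 */
static u64 delalloc_to_reserve(int outstanding, int reserved,
			       u64 per_extent_bytes, u64 csum_bytes)
{
	int nr = outstanding + 1;	/* the write we are about to do */

	if (nr <= reserved)
		return csum_bytes;	/* extents already paid for */
	return (u64)(nr - reserved) * per_extent_bytes + csum_bytes;
}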
@@ -3728,23 +3995,34 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3728 struct btrfs_root *root = BTRFS_I(inode)->root; 3995 struct btrfs_root *root = BTRFS_I(inode)->root;
3729 u64 to_free; 3996 u64 to_free;
3730 int nr_extents; 3997 int nr_extents;
3998 int reserved_extents;
3731 3999
3732 num_bytes = ALIGN(num_bytes, root->sectorsize); 4000 num_bytes = ALIGN(num_bytes, root->sectorsize);
3733 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4001 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
4002 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
3734 4003
3735 spin_lock(&BTRFS_I(inode)->accounting_lock); 4004 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
3736 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); 4005 do {
3737 if (nr_extents < BTRFS_I(inode)->reserved_extents) { 4006 int old, new;
3738 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; 4007
3739 BTRFS_I(inode)->reserved_extents -= nr_extents; 4008 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3740 } else { 4009 if (nr_extents >= reserved_extents) {
3741 nr_extents = 0; 4010 nr_extents = 0;
3742 } 4011 break;
3743 spin_unlock(&BTRFS_I(inode)->accounting_lock); 4012 }
4013 old = reserved_extents;
4014 nr_extents = reserved_extents - nr_extents;
4015 new = reserved_extents - nr_extents;
4016 old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
4017 reserved_extents, new);
4018 if (likely(old == reserved_extents))
4019 break;
4020 reserved_extents = old;
4021 } while (1);
3744 4022
3745 to_free = calc_csum_metadata_size(inode, num_bytes); 4023 to_free = calc_csum_metadata_size(inode, num_bytes);
3746 if (nr_extents > 0) 4024 if (nr_extents > 0)
3747 to_free += calc_trans_metadata_size(root, nr_extents); 4025 to_free += btrfs_calc_trans_metadata_size(root, nr_extents);
3748 4026
3749 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4027 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3750 to_free); 4028 to_free);
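
With accounting_lock gone, shrinking reserved_extents races against concurrent reservations, so the release path uses the classic atomic_cmpxchg() retry loop: take a snapshot, compute the new value from it, and install it only if the counter has not moved in the meantime. A standalone sketch of the idiom:

/*
 * Lock-free release of excess reserved extents, as in
 * btrfs_delalloc_release_metadata() above: retry the cmpxchg until
 * our snapshot of the counter is still current at swap time.
 */
static int release_excess_extents(atomic_t *reserved, int outstanding)
{
	int old = atomic_read(reserved);

	while (old > outstanding) {
		int seen = atomic_cmpxchg(reserved, old, outstanding);

		if (seen == old)
			return old - outstanding; /* extents released */
		old = seen;	/* lost a race: retry with fresh value */
	}
	return 0;	/* nothing in excess, nothing to release */
}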
@@ -3777,12 +4055,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3777 struct btrfs_root *root, 4055 struct btrfs_root *root,
3778 u64 bytenr, u64 num_bytes, int alloc) 4056 u64 bytenr, u64 num_bytes, int alloc)
3779{ 4057{
3780 struct btrfs_block_group_cache *cache; 4058 struct btrfs_block_group_cache *cache = NULL;
3781 struct btrfs_fs_info *info = root->fs_info; 4059 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3783 u64 total = num_bytes; 4060 u64 total = num_bytes;
3784 u64 old_val; 4061 u64 old_val;
3785 u64 byte_in_group; 4062 u64 byte_in_group;
4063 int factor;
3786 4064
3787 /* block accounting for super block */ 4065 /* block accounting for super block */
3788 spin_lock(&info->delalloc_lock); 4066 spin_lock(&info->delalloc_lock);
@@ -3804,11 +4082,25 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3804 factor = 2; 4082 factor = 2;
3805 else 4083 else
3806 factor = 1; 4084 factor = 1;
4085 /*
4086 * If this block group has free space cache written out, we
4087 * need to make sure to load it if we are removing space. This
4088 * is because we need the unpinning stage to actually add the
4089 * space back to the block group; otherwise we will leak space.
4090 */
4091 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4092 cache_block_group(cache, trans, NULL, 1);
4093
3807 byte_in_group = bytenr - cache->key.objectid; 4094 byte_in_group = bytenr - cache->key.objectid;
3808 WARN_ON(byte_in_group > cache->key.offset); 4095 WARN_ON(byte_in_group > cache->key.offset);
3809 4096
3810 spin_lock(&cache->space_info->lock); 4097 spin_lock(&cache->space_info->lock);
3811 spin_lock(&cache->lock); 4098 spin_lock(&cache->lock);
4099
4100 if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
4101 cache->disk_cache_state < BTRFS_DC_CLEAR)
4102 cache->disk_cache_state = BTRFS_DC_CLEAR;
4103
3812 cache->dirty = 1; 4104 cache->dirty = 1;
3813 old_val = btrfs_block_group_used(&cache->item); 4105 old_val = btrfs_block_group_used(&cache->item);
3814 num_bytes = min(total, cache->key.offset - byte_in_group); 4106 num_bytes = min(total, cache->key.offset - byte_in_group);
@@ -3817,6 +4109,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3817 btrfs_set_block_group_used(&cache->item, old_val); 4109 btrfs_set_block_group_used(&cache->item, old_val);
3818 cache->reserved -= num_bytes; 4110 cache->reserved -= num_bytes;
3819 cache->space_info->bytes_reserved -= num_bytes; 4111 cache->space_info->bytes_reserved -= num_bytes;
4112 cache->space_info->reservation_progress++;
3820 cache->space_info->bytes_used += num_bytes; 4113 cache->space_info->bytes_used += num_bytes;
3821 cache->space_info->disk_used += num_bytes * factor; 4114 cache->space_info->disk_used += num_bytes * factor;
3822 spin_unlock(&cache->lock); 4115 spin_unlock(&cache->lock);
@@ -3868,6 +4161,7 @@ static int pin_down_extent(struct btrfs_root *root,
3868 if (reserved) { 4161 if (reserved) {
3869 cache->reserved -= num_bytes; 4162 cache->reserved -= num_bytes;
3870 cache->space_info->bytes_reserved -= num_bytes; 4163 cache->space_info->bytes_reserved -= num_bytes;
4164 cache->space_info->reservation_progress++;
3871 } 4165 }
3872 spin_unlock(&cache->lock); 4166 spin_unlock(&cache->lock);
3873 spin_unlock(&cache->space_info->lock); 4167 spin_unlock(&cache->space_info->lock);
@@ -3898,8 +4192,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
3898 * update size of reserved extents. this function may return -EAGAIN 4192 * update size of reserved extents. this function may return -EAGAIN
3899 * if 'reserve' is true or 'sinfo' is false. 4193 * if 'reserve' is true or 'sinfo' is false.
3900 */ 4194 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache, 4195int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo) 4196 u64 num_bytes, int reserve, int sinfo)
3903{ 4197{
3904 int ret = 0; 4198 int ret = 0;
3905 if (sinfo) { 4199 if (sinfo) {
@@ -3918,6 +4212,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3918 space_info->bytes_readonly += num_bytes; 4212 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes; 4213 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes; 4214 space_info->bytes_reserved -= num_bytes;
4215 space_info->reservation_progress++;
3921 } 4216 }
3922 spin_unlock(&cache->lock); 4217 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock); 4218 spin_unlock(&space_info->lock);
@@ -4037,7 +4332,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4037 if (ret) 4332 if (ret)
4038 break; 4333 break;
4039 4334
4040 ret = btrfs_discard_extent(root, start, end + 1 - start); 4335 if (btrfs_test_opt(root, DISCARD))
4336 ret = btrfs_discard_extent(root, start,
4337 end + 1 - start, NULL);
4041 4338
4042 clear_extent_dirty(unpin, start, end, GFP_NOFS); 4339 clear_extent_dirty(unpin, start, end, GFP_NOFS);
4043 unpin_extent_range(root, start, end); 4340 unpin_extent_range(root, start, end);
@@ -4134,7 +4431,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4134 NULL, refs_to_drop, 4431 NULL, refs_to_drop,
4135 is_data); 4432 is_data);
4136 BUG_ON(ret); 4433 BUG_ON(ret);
4137 btrfs_release_path(extent_root, path); 4434 btrfs_release_path(path);
4138 path->leave_spinning = 1; 4435 path->leave_spinning = 1;
4139 4436
4140 key.objectid = bytenr; 4437 key.objectid = bytenr;
@@ -4173,7 +4470,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4173 owner_objectid, 0); 4470 owner_objectid, 0);
4174 BUG_ON(ret < 0); 4471 BUG_ON(ret < 0);
4175 4472
4176 btrfs_release_path(extent_root, path); 4473 btrfs_release_path(path);
4177 path->leave_spinning = 1; 4474 path->leave_spinning = 1;
4178 4475
4179 key.objectid = bytenr; 4476 key.objectid = bytenr;
@@ -4243,7 +4540,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4540 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
4244 num_to_del); 4541 num_to_del);
4245 BUG_ON(ret); 4542 BUG_ON(ret);
4246 btrfs_release_path(extent_root, path); 4543 btrfs_release_path(path);
4247 4544
4248 if (is_data) { 4545 if (is_data) {
4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4546 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
@@ -4378,10 +4675,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4378 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4675 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4379 4676
4380 btrfs_add_free_space(cache, buf->start, buf->len); 4677 btrfs_add_free_space(cache, buf->start, buf->len);
4381 ret = update_reserved_bytes(cache, buf->len, 0, 0); 4678 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
4382 if (ret == -EAGAIN) { 4679 if (ret == -EAGAIN) {
4383 /* block group became read-only */ 4680 /* block group became read-only */
4384 update_reserved_bytes(cache, buf->len, 0, 1); 4681 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4385 goto out; 4682 goto out;
4386 } 4683 }
4387 4684
@@ -4396,6 +4693,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4396 if (ret) { 4693 if (ret) {
4397 spin_lock(&cache->space_info->lock); 4694 spin_lock(&cache->space_info->lock);
4398 cache->space_info->bytes_reserved -= buf->len; 4695 cache->space_info->bytes_reserved -= buf->len;
4696 cache->space_info->reservation_progress++;
4399 spin_unlock(&cache->space_info->lock); 4697 spin_unlock(&cache->space_info->lock);
4400 } 4698 }
4401 goto out; 4699 goto out;
@@ -4417,6 +4715,11 @@ pin:
4417 } 4715 }
4418 } 4716 }
4419out: 4717out:
4718 /*
4719 * Deleting the buffer, clear the corrupt flag since it doesn't matter
4720 * anymore.
4721 */
4722 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
4420 btrfs_put_block_group(cache); 4723 btrfs_put_block_group(cache);
4421} 4724}
4422 4725
@@ -4480,7 +4783,7 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
4480 return 0; 4783 return 0;
4481 4784
4482 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 4785 wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
4483 (cache->free_space >= num_bytes)); 4786 (cache->free_space_ctl->free_space >= num_bytes));
4484 4787
4485 put_caching_control(caching_ctl); 4788 put_caching_control(caching_ctl);
4486 return 0; 4789 return 0;
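Note: the wait condition now reaches the free-space counter through block_group->free_space_ctl rather than a field on the block group itself, matching the free-space-cache rework elsewhere in this series, where the bookkeeping moves into a shared control structure. A sketch of the indirection, names hypothetical:

#include <stdio.h>

/* The free-space bookkeeping lives in its own control structure so
 * the same code can serve more than one kind of cache. */
struct free_space_ctl {
        unsigned long long free_space;
};

struct block_group {
        struct free_space_ctl *free_space_ctl;
};

static int has_room(const struct block_group *bg, unsigned long long need)
{
        return bg->free_space_ctl->free_space >= need;
}

int main(void)
{
        struct free_space_ctl ctl = { .free_space = 1 << 20 };
        struct block_group bg = { .free_space_ctl = &ctl };

        printf("fits: %d\n", has_room(&bg, 4096));
        return 0;
}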
@@ -4539,7 +4842,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4539 u64 num_bytes, u64 empty_size, 4842 u64 num_bytes, u64 empty_size,
4540 u64 search_start, u64 search_end, 4843 u64 search_start, u64 search_end,
4541 u64 hint_byte, struct btrfs_key *ins, 4844 u64 hint_byte, struct btrfs_key *ins,
4542 int data) 4845 u64 data)
4543{ 4846{
4544 int ret = 0; 4847 int ret = 0;
4545 struct btrfs_root *root = orig_root->fs_info->extent_root; 4848 struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -4555,6 +4858,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4555 bool found_uncached_bg = false; 4858 bool found_uncached_bg = false;
4556 bool failed_cluster_refill = false; 4859 bool failed_cluster_refill = false;
4557 bool failed_alloc = false; 4860 bool failed_alloc = false;
4861 bool use_cluster = true;
4558 u64 ideal_cache_percent = 0; 4862 u64 ideal_cache_percent = 0;
4559 u64 ideal_cache_offset = 0; 4863 u64 ideal_cache_offset = 0;
4560 4864
@@ -4565,20 +4869,28 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4565 4869
4566 space_info = __find_space_info(root->fs_info, data); 4870 space_info = __find_space_info(root->fs_info, data);
4567 if (!space_info) { 4871 if (!space_info) {
4568 printk(KERN_ERR "No space info for %d\n", data); 4872 printk(KERN_ERR "No space info for %llu\n", data);
4569 return -ENOSPC; 4873 return -ENOSPC;
4570 } 4874 }
4571 4875
4876 /*
4877 * If the space info is for both data and metadata it means we have a
4878 * small filesystem and we can't use the clustering stuff.
4879 */
4880 if (btrfs_mixed_space_info(space_info))
4881 use_cluster = false;
4882
4572 if (orig_root->ref_cows || empty_size) 4883 if (orig_root->ref_cows || empty_size)
4573 allowed_chunk_alloc = 1; 4884 allowed_chunk_alloc = 1;
4574 4885
4575 if (data & BTRFS_BLOCK_GROUP_METADATA) { 4886 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
4576 last_ptr = &root->fs_info->meta_alloc_cluster; 4887 last_ptr = &root->fs_info->meta_alloc_cluster;
4577 if (!btrfs_test_opt(root, SSD)) 4888 if (!btrfs_test_opt(root, SSD))
4578 empty_cluster = 64 * 1024; 4889 empty_cluster = 64 * 1024;
4579 } 4890 }
4580 4891
4581 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { 4892 if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
4893 btrfs_test_opt(root, SSD)) {
4582 last_ptr = &root->fs_info->data_alloc_cluster; 4894 last_ptr = &root->fs_info->data_alloc_cluster;
4583 } 4895 }
4584 4896
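Note: the comment in the hunk explains the new use_cluster flag: on a small filesystem where one space info backs both data and metadata, the cluster allocators are skipped. A sketch of the kind of flag test this relies on, with the bit values invented:

#include <stdio.h>

#define BG_DATA     (1ULL << 0)         /* hypothetical profile bits */
#define BG_METADATA (1ULL << 1)

struct space_info { unsigned long long flags; };

/* Mixed = the same space info backs both data and metadata. */
static int mixed_space_info(const struct space_info *si)
{
        return (si->flags & BG_DATA) && (si->flags & BG_METADATA);
}

int main(void)
{
        struct space_info si = { .flags = BG_DATA | BG_METADATA };
        int use_cluster = !mixed_space_info(&si);

        printf("use_cluster = %d\n", use_cluster);
        return 0;
}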
@@ -4638,10 +4950,34 @@ search:
4638 btrfs_get_block_group(block_group); 4950 btrfs_get_block_group(block_group);
4639 search_start = block_group->key.objectid; 4951 search_start = block_group->key.objectid;
4640 4952
4953 /*
4954 * this can happen if we end up cycling through all the
4955 * raid types, but we want to make sure we only allocate
4956 * for the proper type.
4957 */
4958 if (!block_group_bits(block_group, data)) {
4959 u64 extra = BTRFS_BLOCK_GROUP_DUP |
4960 BTRFS_BLOCK_GROUP_RAID1 |
4961 BTRFS_BLOCK_GROUP_RAID10;
4962
4963 /*
4964 * if they asked for extra copies and this block group
4965 * doesn't provide them, bail. This does allow us to
4966 * fill raid0 from raid1.
4967 */
4968 if ((data & extra) && !(block_group->flags & extra))
4969 goto loop;
4970 }
4971
4641have_block_group: 4972have_block_group:
4642 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4973 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4643 u64 free_percent; 4974 u64 free_percent;
4644 4975
4976 ret = cache_block_group(block_group, trans,
4977 orig_root, 1);
4978 if (block_group->cached == BTRFS_CACHE_FINISHED)
4979 goto have_block_group;
4980
4645 free_percent = btrfs_block_group_used(&block_group->item); 4981 free_percent = btrfs_block_group_used(&block_group->item);
4646 free_percent *= 100; 4982 free_percent *= 100;
4647 free_percent = div64_u64(free_percent, 4983 free_percent = div64_u64(free_percent,
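Note: the new block_group_bits() check lets the allocator cycle through raid types without handing out the wrong redundancy. Filling a raid0 request from a raid1 group is fine, but a request that asked for extra copies (DUP/RAID1/RAID10) must skip groups that cannot provide them. A condensed sketch of that skip decision, with the profile bits invented:

#include <stdio.h>

#define BG_DUP    (1ULL << 0)           /* hypothetical profile bits */
#define BG_RAID1  (1ULL << 1)
#define BG_RAID10 (1ULL << 2)

/* Return 0 when the request wants extra copies that this block
 * group's profile does not provide. */
static int group_usable(unsigned long long group_flags,
                        unsigned long long wanted)
{
        unsigned long long extra = BG_DUP | BG_RAID1 | BG_RAID10;

        if ((wanted & extra) && !(group_flags & extra))
                return 0;       /* asked for redundancy, group has none */
        return 1;               /* ok: also fills raid0 from raid1 */
}

int main(void)
{
        printf("%d\n", group_usable(0, BG_RAID1));      /* 0: skip  */
        printf("%d\n", group_usable(BG_RAID1, 0));      /* 1: allow */
        return 0;
}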
@@ -4662,7 +4998,8 @@ have_block_group:
4662 if (loop > LOOP_CACHING_NOWAIT || 4998 if (loop > LOOP_CACHING_NOWAIT ||
4663 (loop > LOOP_FIND_IDEAL && 4999 (loop > LOOP_FIND_IDEAL &&
4664 atomic_read(&space_info->caching_threads) < 2)) { 5000 atomic_read(&space_info->caching_threads) < 2)) {
4665 ret = cache_block_group(block_group); 5001 ret = cache_block_group(block_group, trans,
5002 orig_root, 0);
4666 BUG_ON(ret); 5003 BUG_ON(ret);
4667 } 5004 }
4668 found_uncached_bg = true; 5005 found_uncached_bg = true;
@@ -4682,6 +5019,15 @@ have_block_group:
4682 if (unlikely(block_group->ro)) 5019 if (unlikely(block_group->ro))
4683 goto loop; 5020 goto loop;
4684 5021
5022 spin_lock(&block_group->free_space_ctl->tree_lock);
5023 if (cached &&
5024 block_group->free_space_ctl->free_space <
5025 num_bytes + empty_size) {
5026 spin_unlock(&block_group->free_space_ctl->tree_lock);
5027 goto loop;
5028 }
5029 spin_unlock(&block_group->free_space_ctl->tree_lock);
5030
4685 /* 5031 /*
4686 * Ok we want to try and use the cluster allocator, so lets look 5032 * Ok we want to try and use the cluster allocator, so lets look
4687 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will 5033 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
@@ -4830,7 +5176,7 @@ checks:
4830 search_start - offset); 5176 search_start - offset);
4831 BUG_ON(offset > search_start); 5177 BUG_ON(offset > search_start);
4832 5178
4833 ret = update_reserved_bytes(block_group, num_bytes, 1, 5179 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
4834 (data & BTRFS_BLOCK_GROUP_DATA)); 5180 (data & BTRFS_BLOCK_GROUP_DATA));
4835 if (ret == -EAGAIN) { 5181 if (ret == -EAGAIN) {
4836 btrfs_add_free_space(block_group, offset, num_bytes); 5182 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -4845,6 +5191,7 @@ checks:
4845 btrfs_add_free_space(block_group, offset, 5191 btrfs_add_free_space(block_group, offset,
4846 search_start - offset); 5192 search_start - offset);
4847 BUG_ON(offset > search_start); 5193 BUG_ON(offset > search_start);
5194 btrfs_put_block_group(block_group);
4848 break; 5195 break;
4849loop: 5196loop:
4850 failed_cluster_refill = false; 5197 failed_cluster_refill = false;
@@ -4867,9 +5214,7 @@ loop:
4867 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 5214 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
4868 * again 5215 * again
4869 */ 5216 */
4870 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 5217 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
4871 (found_uncached_bg || empty_size || empty_cluster ||
4872 allowed_chunk_alloc)) {
4873 index = 0; 5218 index = 0;
4874 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 5219 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4875 found_uncached_bg = false; 5220 found_uncached_bg = false;
@@ -4909,40 +5254,39 @@ loop:
4909 goto search; 5254 goto search;
4910 } 5255 }
4911 5256
4912 if (loop < LOOP_CACHING_WAIT) { 5257 loop++;
4913 loop++;
4914 goto search;
4915 }
4916 5258
4917 if (loop == LOOP_ALLOC_CHUNK) { 5259 if (loop == LOOP_ALLOC_CHUNK) {
4918 empty_size = 0; 5260 if (allowed_chunk_alloc) {
4919 empty_cluster = 0; 5261 ret = do_chunk_alloc(trans, root, num_bytes +
4920 } 5262 2 * 1024 * 1024, data,
5263 CHUNK_ALLOC_LIMITED);
5264 allowed_chunk_alloc = 0;
5265 if (ret == 1)
5266 done_chunk_alloc = 1;
5267 } else if (!done_chunk_alloc &&
5268 space_info->force_alloc ==
5269 CHUNK_ALLOC_NO_FORCE) {
5270 space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5271 }
4921 5272
4922 if (allowed_chunk_alloc) { 5273 /*
4923 ret = do_chunk_alloc(trans, root, num_bytes + 5274 * We didn't allocate a chunk, go ahead and drop the
4924 2 * 1024 * 1024, data, 1); 5275 * empty size and loop again.
4925 allowed_chunk_alloc = 0; 5276 */
4926 done_chunk_alloc = 1; 5277 if (!done_chunk_alloc)
4927 } else if (!done_chunk_alloc) { 5278 loop = LOOP_NO_EMPTY_SIZE;
4928 space_info->force_alloc = 1;
4929 } 5279 }
4930 5280
4931 if (loop < LOOP_NO_EMPTY_SIZE) { 5281 if (loop == LOOP_NO_EMPTY_SIZE) {
4932 loop++; 5282 empty_size = 0;
4933 goto search; 5283 empty_cluster = 0;
4934 } 5284 }
4935 ret = -ENOSPC; 5285
5286 goto search;
4936 } else if (!ins->objectid) { 5287 } else if (!ins->objectid) {
4937 ret = -ENOSPC; 5288 ret = -ENOSPC;
4938 } 5289 } else if (ins->objectid) {
4939
4940 /* we found what we needed */
4941 if (ins->objectid) {
4942 if (!(data & BTRFS_BLOCK_GROUP_DATA))
4943 trans->block_group = block_group->key.objectid;
4944
4945 btrfs_put_block_group(block_group);
4946 ret = 0; 5290 ret = 0;
4947 } 5291 }
4948 5292
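Note: the rewritten tail of find_free_extent() replaces the old chain of loop < ... special cases with one unconditional loop++ and per-state handling: LOOP_ALLOC_CHUNK tries (or arms) a chunk allocation, and LOOP_NO_EMPTY_SIZE drops the clustering slack for a final pass. The state names below come from the surrounding hunks; everything else is an illustrative sketch:

#include <stdio.h>

enum loop_state {                       /* names as used above */
        LOOP_FIND_IDEAL,
        LOOP_CACHING_NOWAIT,
        LOOP_CACHING_WAIT,
        LOOP_ALLOC_CHUNK,
        LOOP_NO_EMPTY_SIZE,
};

int main(void)
{
        int loop = LOOP_FIND_IDEAL;
        int found = 0;                  /* never set in this sketch */
        int empty_size = 4096;

        while (!found && loop <= LOOP_NO_EMPTY_SIZE) {
                /* ... scan all block groups at the current strictness ... */
                loop++;
                if (loop == LOOP_ALLOC_CHUNK)
                        printf("try to allocate a fresh chunk\n");
                if (loop == LOOP_NO_EMPTY_SIZE) {
                        empty_size = 0; /* last chance: no cluster slack */
                        printf("retry with empty_size=%d\n", empty_size);
                }
        }
        return found ? 0 : 1;           /* 1 ~ -ENOSPC */
}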
@@ -5011,7 +5355,8 @@ again:
5011 */ 5355 */
5012 if (empty_size || root->ref_cows) 5356 if (empty_size || root->ref_cows)
5013 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 5357 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5014 num_bytes + 2 * 1024 * 1024, data, 0); 5358 num_bytes + 2 * 1024 * 1024, data,
5359 CHUNK_ALLOC_NO_FORCE);
5015 5360
5016 WARN_ON(num_bytes < root->sectorsize); 5361 WARN_ON(num_bytes < root->sectorsize);
5017 ret = find_free_extent(trans, root, num_bytes, empty_size, 5362 ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -5023,10 +5368,10 @@ again:
5023 num_bytes = num_bytes & ~(root->sectorsize - 1); 5368 num_bytes = num_bytes & ~(root->sectorsize - 1);
5024 num_bytes = max(num_bytes, min_alloc_size); 5369 num_bytes = max(num_bytes, min_alloc_size);
5025 do_chunk_alloc(trans, root->fs_info->extent_root, 5370 do_chunk_alloc(trans, root->fs_info->extent_root,
5026 num_bytes, data, 1); 5371 num_bytes, data, CHUNK_ALLOC_FORCE);
5027 goto again; 5372 goto again;
5028 } 5373 }
5029 if (ret == -ENOSPC) { 5374 if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5030 struct btrfs_space_info *sinfo; 5375 struct btrfs_space_info *sinfo;
5031 5376
5032 sinfo = __find_space_info(root->fs_info, data); 5377 sinfo = __find_space_info(root->fs_info, data);
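Note: do_chunk_alloc() now takes a named force level instead of a bare 0/1: CHUNK_ALLOC_NO_FORCE for opportunistic pre-allocation, CHUNK_ALLOC_LIMITED for the allocator's own retry, and CHUNK_ALLOC_FORCE when a chunk is mandatory. A sketch of how such a policy argument might feed a decision helper; the thresholds are invented for illustration:

#include <stdio.h>

enum chunk_alloc_enum {                 /* names as used in the hunks above */
        CHUNK_ALLOC_NO_FORCE,           /* only if space is nearly full */
        CHUNK_ALLOC_LIMITED,            /* allocator retry: more willing */
        CHUNK_ALLOC_FORCE,              /* caller insists on a new chunk */
};

static int should_alloc_chunk(unsigned long long used,
                              unsigned long long total, int force)
{
        if (force == CHUNK_ALLOC_FORCE)
                return 1;
        if (force == CHUNK_ALLOC_LIMITED)
                return used * 10 >= total * 7;  /* >= 70% used (invented) */
        return used * 10 >= total * 9;          /* >= 90% used (invented) */
}

int main(void)
{
        printf("%d\n", should_alloc_chunk(80, 100, CHUNK_ALLOC_NO_FORCE));
        printf("%d\n", should_alloc_chunk(80, 100, CHUNK_ALLOC_LIMITED));
        return 0;
}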
@@ -5036,6 +5381,8 @@ again:
5036 dump_space_info(sinfo, num_bytes, 1); 5381 dump_space_info(sinfo, num_bytes, 1);
5037 } 5382 }
5038 5383
5384 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5385
5039 return ret; 5386 return ret;
5040} 5387}
5041 5388
@@ -5051,12 +5398,15 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5051 return -ENOSPC; 5398 return -ENOSPC;
5052 } 5399 }
5053 5400
5054 ret = btrfs_discard_extent(root, start, len); 5401 if (btrfs_test_opt(root, DISCARD))
5402 ret = btrfs_discard_extent(root, start, len, NULL);
5055 5403
5056 btrfs_add_free_space(cache, start, len); 5404 btrfs_add_free_space(cache, start, len);
5057 update_reserved_bytes(cache, len, 0, 1); 5405 btrfs_update_reserved_bytes(cache, len, 0, 1);
5058 btrfs_put_block_group(cache); 5406 btrfs_put_block_group(cache);
5059 5407
5408 trace_btrfs_reserved_extent_free(root, start, len);
5409
5060 return ret; 5410 return ret;
5061} 5411}
5062 5412
@@ -5083,7 +5433,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5083 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 5433 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5084 5434
5085 path = btrfs_alloc_path(); 5435 path = btrfs_alloc_path();
5086 BUG_ON(!path); 5436 if (!path)
5437 return -ENOMEM;
5087 5438
5088 path->leave_spinning = 1; 5439 path->leave_spinning = 1;
5089 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 5440 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -5219,7 +5570,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5219 u64 num_bytes = ins->offset; 5570 u64 num_bytes = ins->offset;
5220 5571
5221 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 5572 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5222 cache_block_group(block_group); 5573 cache_block_group(block_group, trans, NULL, 0);
5223 caching_ctl = get_caching_control(block_group); 5574 caching_ctl = get_caching_control(block_group);
5224 5575
5225 if (!caching_ctl) { 5576 if (!caching_ctl) {
@@ -5253,7 +5604,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5253 put_caching_control(caching_ctl); 5604 put_caching_control(caching_ctl);
5254 } 5605 }
5255 5606
5256 ret = update_reserved_bytes(block_group, ins->offset, 1, 1); 5607 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
5257 BUG_ON(ret); 5608 BUG_ON(ret);
5258 btrfs_put_block_group(block_group); 5609 btrfs_put_block_group(block_group);
5259 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5610 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5304,25 +5655,47 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5304 struct btrfs_root *root, u32 blocksize) 5655 struct btrfs_root *root, u32 blocksize)
5305{ 5656{
5306 struct btrfs_block_rsv *block_rsv; 5657 struct btrfs_block_rsv *block_rsv;
5658 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5307 int ret; 5659 int ret;
5308 5660
5309 block_rsv = get_block_rsv(trans, root); 5661 block_rsv = get_block_rsv(trans, root);
5310 5662
5311 if (block_rsv->size == 0) { 5663 if (block_rsv->size == 0) {
5312 ret = reserve_metadata_bytes(block_rsv, blocksize); 5664 ret = reserve_metadata_bytes(trans, root, block_rsv,
5313 if (ret) 5665 blocksize, 0);
5666 /*
5667 * If we couldn't reserve metadata bytes, try to use some from
5668 * the global reserve.
5669 */
5670 if (ret && block_rsv != global_rsv) {
5671 ret = block_rsv_use_bytes(global_rsv, blocksize);
5672 if (!ret)
5673 return global_rsv;
5314 return ERR_PTR(ret); 5674 return ERR_PTR(ret);
5675 } else if (ret) {
5676 return ERR_PTR(ret);
5677 }
5315 return block_rsv; 5678 return block_rsv;
5316 } 5679 }
5317 5680
5318 ret = block_rsv_use_bytes(block_rsv, blocksize); 5681 ret = block_rsv_use_bytes(block_rsv, blocksize);
5319 if (!ret) 5682 if (!ret)
5320 return block_rsv; 5683 return block_rsv;
5321 5684 if (ret) {
5322 WARN_ON(1); 5685 WARN_ON(1);
5323 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", 5686 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
5324 block_rsv->size, block_rsv->reserved, 5687 0);
5325 block_rsv->freed[0], block_rsv->freed[1]); 5688 if (!ret) {
5689 spin_lock(&block_rsv->lock);
5690 block_rsv->size += blocksize;
5691 spin_unlock(&block_rsv->lock);
5692 return block_rsv;
5693 } else if (ret && block_rsv != global_rsv) {
5694 ret = block_rsv_use_bytes(global_rsv, blocksize);
5695 if (!ret)
5696 return global_rsv;
5697 }
5698 }
5326 5699
5327 return ERR_PTR(-ENOSPC); 5700 return ERR_PTR(-ENOSPC);
5328} 5701}
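Note: the reworked use_block_rsv() gains a fallback ladder: use the transaction's reserve if it has room, otherwise retry the reservation, and only then borrow from the new global_rsv before returning ERR_PTR(-ENOSPC). A condensed sketch of that ordering, with all of the helpers reduced to hypothetical miniatures:

#include <errno.h>
#include <stdio.h>

struct rsv { long long reserved; };

/* Take bytes from a reserve; fail without side effects if short. */
static int rsv_use_bytes(struct rsv *r, long long n)
{
        if (r->reserved < n)
                return -ENOSPC;
        r->reserved -= n;
        return 0;
}

static struct rsv *pick_rsv(struct rsv *mine, struct rsv *global,
                            long long blocksize)
{
        if (rsv_use_bytes(mine, blocksize) == 0)
                return mine;            /* normal case */
        /* ... a real version would retry the reservation here ... */
        if (rsv_use_bytes(global, blocksize) == 0)
                return global;          /* borrow from the global pool */
        return NULL;                    /* ~ ERR_PTR(-ENOSPC) */
}

int main(void)
{
        struct rsv mine = { 0 }, global = { 8192 };

        printf("%s\n", pick_rsv(&mine, &global, 4096) == &global
                       ? "borrowed from global" : "other");
        return 0;
}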
@@ -5422,7 +5795,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5422 u64 generation; 5795 u64 generation;
5423 u64 refs; 5796 u64 refs;
5424 u64 flags; 5797 u64 flags;
5425 u64 last = 0;
5426 u32 nritems; 5798 u32 nritems;
5427 u32 blocksize; 5799 u32 blocksize;
5428 struct btrfs_key key; 5800 struct btrfs_key key;
@@ -5490,7 +5862,6 @@ reada:
5490 generation); 5862 generation);
5491 if (ret) 5863 if (ret)
5492 break; 5864 break;
5493 last = bytenr + blocksize;
5494 nread++; 5865 nread++;
5495 } 5866 }
5496 wc->reada_slot = slot; 5867 wc->reada_slot = slot;
@@ -5666,6 +6037,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5666 if (reada && level == 1) 6037 if (reada && level == 1)
5667 reada_walk_down(trans, root, wc, path); 6038 reada_walk_down(trans, root, wc, path);
5668 next = read_tree_block(root, bytenr, blocksize, generation); 6039 next = read_tree_block(root, bytenr, blocksize, generation);
6040 if (!next)
6041 return -EIO;
5669 btrfs_tree_lock(next); 6042 btrfs_tree_lock(next);
5670 btrfs_set_lock_blocking(next); 6043 btrfs_set_lock_blocking(next);
5671 } 6044 }
@@ -5898,6 +6271,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
5898 BUG_ON(!wc); 6271 BUG_ON(!wc);
5899 6272
5900 trans = btrfs_start_transaction(tree_root, 0); 6273 trans = btrfs_start_transaction(tree_root, 0);
6274 BUG_ON(IS_ERR(trans));
6275
5901 if (block_rsv) 6276 if (block_rsv)
5902 trans->block_rsv = block_rsv; 6277 trans->block_rsv = block_rsv;
5903 6278
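Note: btrfs_start_transaction() reports failure as an ERR_PTR rather than NULL, so this hunk and the next one assert with IS_ERR() instead of assuming success. For readers unfamiliar with the convention, a userspace imitation (the MAX_ERRNO value matches the kernel's; everything else is invented):

#include <errno.h>
#include <stdio.h>

/* Small negative errno values are smuggled inside the pointer. */
#define MAX_ERRNO 4095

static void *ERR_PTR(long err) { return (void *)err; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p)
{
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *start_transaction(int fail)
{
        static int handle;              /* stand-in for a real handle */
        return fail ? ERR_PTR(-ENOMEM) : &handle;
}

int main(void)
{
        void *trans = start_transaction(1);

        if (IS_ERR(trans))
                printf("start_transaction failed: %ld\n", PTR_ERR(trans));
        return 0;
}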
@@ -5995,11 +6370,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
5995 6370
5996 btrfs_end_transaction_throttle(trans, tree_root); 6371 btrfs_end_transaction_throttle(trans, tree_root);
5997 trans = btrfs_start_transaction(tree_root, 0); 6372 trans = btrfs_start_transaction(tree_root, 0);
6373 BUG_ON(IS_ERR(trans));
5998 if (block_rsv) 6374 if (block_rsv)
5999 trans->block_rsv = block_rsv; 6375 trans->block_rsv = block_rsv;
6000 } 6376 }
6001 } 6377 }
6002 btrfs_release_path(root, path); 6378 btrfs_release_path(path);
6003 BUG_ON(err); 6379 BUG_ON(err);
6004 6380
6005 ret = btrfs_del_root(trans, tree_root, &root->root_key); 6381 ret = btrfs_del_root(trans, tree_root, &root->root_key);
@@ -6010,9 +6386,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6010 NULL, NULL); 6386 NULL, NULL);
6011 BUG_ON(ret < 0); 6387 BUG_ON(ret < 0);
6012 if (ret > 0) { 6388 if (ret > 0) {
6013 ret = btrfs_del_orphan_item(trans, tree_root, 6389 /* if we fail to delete the orphan item this time
6014 root->root_key.objectid); 6390 * around, it'll get picked up the next time.
6015 BUG_ON(ret); 6391 *
6392 * The most common failure here is just -ENOENT.
6393 */
6394 btrfs_del_orphan_item(trans, tree_root,
6395 root->root_key.objectid);
6016 } 6396 }
6017 } 6397 }
6018 6398
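Note: the hunk above downgrades a BUG_ON() to best-effort cleanup: the orphan item delete is allowed to fail because the next cleanup pass will find the item again, and the usual failure is a harmless -ENOENT. A tiny sketch of that tolerance, with the helper hypothetical:

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in: deleting an item that is already gone
 * reports -ENOENT, which is harmless for orphan cleanup. */
static int del_orphan_item(int present)
{
        return present ? 0 : -ENOENT;
}

int main(void)
{
        int ret = del_orphan_item(0);

        if (ret && ret != -ENOENT)
                fprintf(stderr, "real failure: %d\n", ret);
        /* -ENOENT falls through: a later cleanup pass would simply
         * find nothing left to do. */
        return 0;
}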
@@ -6050,10 +6430,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6050 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 6430 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6051 6431
6052 path = btrfs_alloc_path(); 6432 path = btrfs_alloc_path();
6053 BUG_ON(!path); 6433 if (!path)
6434 return -ENOMEM;
6054 6435
6055 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6436 wc = kzalloc(sizeof(*wc), GFP_NOFS);
6056 BUG_ON(!wc); 6437 if (!wc) {
6438 btrfs_free_path(path);
6439 return -ENOMEM;
6440 }
6057 6441
6058 btrfs_assert_tree_locked(parent); 6442 btrfs_assert_tree_locked(parent);
6059 parent_level = btrfs_header_level(parent); 6443 parent_level = btrfs_header_level(parent);
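Note: two allocation BUG_ON()s become real error paths here, and the second failure now unwinds the first allocation before returning -ENOMEM. The same unwinding shape in miniature; the function and type names are invented:

#include <stdlib.h>

struct path { int dummy; };
struct walk_control { int dummy; };

/* The pattern the hunk adopts: each allocation failure releases the
 * allocations that already succeeded instead of crashing. */
static int drop_subtree_prologue(struct path **pathp,
                                 struct walk_control **wcp)
{
        struct path *path = calloc(1, sizeof(*path));
        struct walk_control *wc;

        if (!path)
                return -1;              /* ~ -ENOMEM */

        wc = calloc(1, sizeof(*wc));
        if (!wc) {
                free(path);             /* release what we already hold */
                return -1;
        }
        *pathp = path;
        *wcp = wc;
        return 0;
}

int main(void)
{
        struct path *p;
        struct walk_control *wc;

        if (drop_subtree_prologue(&p, &wc) == 0) {
                free(wc);
                free(p);
        }
        return 0;
}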
@@ -6095,1500 +6479,20 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6095 return ret; 6479 return ret;
6096} 6480}
6097 6481
6098#if 0
6099static unsigned long calc_ra(unsigned long start, unsigned long last,
6100 unsigned long nr)
6101{
6102 return min(last, start + nr - 1);
6103}
6104
6105static noinline int relocate_inode_pages(struct inode *inode, u64 start,
6106 u64 len)
6107{
6108 u64 page_start;
6109 u64 page_end;
6110 unsigned long first_index;
6111 unsigned long last_index;
6112 unsigned long i;
6113 struct page *page;
6114 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6115 struct file_ra_state *ra;
6116 struct btrfs_ordered_extent *ordered;
6117 unsigned int total_read = 0;
6118 unsigned int total_dirty = 0;
6119 int ret = 0;
6120
6121 ra = kzalloc(sizeof(*ra), GFP_NOFS);
6122
6123 mutex_lock(&inode->i_mutex);
6124 first_index = start >> PAGE_CACHE_SHIFT;
6125 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
6126
6127 /* make sure the dirty trick played by the caller works */
6128 ret = invalidate_inode_pages2_range(inode->i_mapping,
6129 first_index, last_index);
6130 if (ret)
6131 goto out_unlock;
6132
6133 file_ra_state_init(ra, inode->i_mapping);
6134
6135 for (i = first_index ; i <= last_index; i++) {
6136 if (total_read % ra->ra_pages == 0) {
6137 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
6138 calc_ra(i, last_index, ra->ra_pages));
6139 }
6140 total_read++;
6141again:
6142 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
6143 BUG_ON(1);
6144 page = grab_cache_page(inode->i_mapping, i);
6145 if (!page) {
6146 ret = -ENOMEM;
6147 goto out_unlock;
6148 }
6149 if (!PageUptodate(page)) {
6150 btrfs_readpage(NULL, page);
6151 lock_page(page);
6152 if (!PageUptodate(page)) {
6153 unlock_page(page);
6154 page_cache_release(page);
6155 ret = -EIO;
6156 goto out_unlock;
6157 }
6158 }
6159 wait_on_page_writeback(page);
6160
6161 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
6162 page_end = page_start + PAGE_CACHE_SIZE - 1;
6163 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
6164
6165 ordered = btrfs_lookup_ordered_extent(inode, page_start);
6166 if (ordered) {
6167 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
6168 unlock_page(page);
6169 page_cache_release(page);
6170 btrfs_start_ordered_extent(inode, ordered, 1);
6171 btrfs_put_ordered_extent(ordered);
6172 goto again;
6173 }
6174 set_page_extent_mapped(page);
6175
6176 if (i == first_index)
6177 set_extent_bits(io_tree, page_start, page_end,
6178 EXTENT_BOUNDARY, GFP_NOFS);
6179 btrfs_set_extent_delalloc(inode, page_start, page_end);
6180
6181 set_page_dirty(page);
6182 total_dirty++;
6183
6184 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
6185 unlock_page(page);
6186 page_cache_release(page);
6187 }
6188
6189out_unlock:
6190 kfree(ra);
6191 mutex_unlock(&inode->i_mutex);
6192 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
6193 return ret;
6194}
6195
6196static noinline int relocate_data_extent(struct inode *reloc_inode,
6197 struct btrfs_key *extent_key,
6198 u64 offset)
6199{
6200 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6201 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
6202 struct extent_map *em;
6203 u64 start = extent_key->objectid - offset;
6204 u64 end = start + extent_key->offset - 1;
6205
6206 em = alloc_extent_map(GFP_NOFS);
6207 BUG_ON(!em || IS_ERR(em));
6208
6209 em->start = start;
6210 em->len = extent_key->offset;
6211 em->block_len = extent_key->offset;
6212 em->block_start = extent_key->objectid;
6213 em->bdev = root->fs_info->fs_devices->latest_bdev;
6214 set_bit(EXTENT_FLAG_PINNED, &em->flags);
6215
6216 /* setup extent map to cheat btrfs_readpage */
6217 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
6218 while (1) {
6219 int ret;
6220 write_lock(&em_tree->lock);
6221 ret = add_extent_mapping(em_tree, em);
6222 write_unlock(&em_tree->lock);
6223 if (ret != -EEXIST) {
6224 free_extent_map(em);
6225 break;
6226 }
6227 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
6228 }
6229 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
6230
6231 return relocate_inode_pages(reloc_inode, start, extent_key->offset);
6232}
6233
6234struct btrfs_ref_path {
6235 u64 extent_start;
6236 u64 nodes[BTRFS_MAX_LEVEL];
6237 u64 root_objectid;
6238 u64 root_generation;
6239 u64 owner_objectid;
6240 u32 num_refs;
6241 int lowest_level;
6242 int current_level;
6243 int shared_level;
6244
6245 struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
6246 u64 new_nodes[BTRFS_MAX_LEVEL];
6247};
6248
6249struct disk_extent {
6250 u64 ram_bytes;
6251 u64 disk_bytenr;
6252 u64 disk_num_bytes;
6253 u64 offset;
6254 u64 num_bytes;
6255 u8 compression;
6256 u8 encryption;
6257 u16 other_encoding;
6258};
6259
6260static int is_cowonly_root(u64 root_objectid)
6261{
6262 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
6263 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
6264 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
6265 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
6266 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
6267 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
6268 return 1;
6269 return 0;
6270}
6271
6272static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
6273 struct btrfs_root *extent_root,
6274 struct btrfs_ref_path *ref_path,
6275 int first_time)
6276{
6277 struct extent_buffer *leaf;
6278 struct btrfs_path *path;
6279 struct btrfs_extent_ref *ref;
6280 struct btrfs_key key;
6281 struct btrfs_key found_key;
6282 u64 bytenr;
6283 u32 nritems;
6284 int level;
6285 int ret = 1;
6286
6287 path = btrfs_alloc_path();
6288 if (!path)
6289 return -ENOMEM;
6290
6291 if (first_time) {
6292 ref_path->lowest_level = -1;
6293 ref_path->current_level = -1;
6294 ref_path->shared_level = -1;
6295 goto walk_up;
6296 }
6297walk_down:
6298 level = ref_path->current_level - 1;
6299 while (level >= -1) {
6300 u64 parent;
6301 if (level < ref_path->lowest_level)
6302 break;
6303
6304 if (level >= 0)
6305 bytenr = ref_path->nodes[level];
6306 else
6307 bytenr = ref_path->extent_start;
6308 BUG_ON(bytenr == 0);
6309
6310 parent = ref_path->nodes[level + 1];
6311 ref_path->nodes[level + 1] = 0;
6312 ref_path->current_level = level;
6313 BUG_ON(parent == 0);
6314
6315 key.objectid = bytenr;
6316 key.offset = parent + 1;
6317 key.type = BTRFS_EXTENT_REF_KEY;
6318
6319 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6320 if (ret < 0)
6321 goto out;
6322 BUG_ON(ret == 0);
6323
6324 leaf = path->nodes[0];
6325 nritems = btrfs_header_nritems(leaf);
6326 if (path->slots[0] >= nritems) {
6327 ret = btrfs_next_leaf(extent_root, path);
6328 if (ret < 0)
6329 goto out;
6330 if (ret > 0)
6331 goto next;
6332 leaf = path->nodes[0];
6333 }
6334
6335 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6336 if (found_key.objectid == bytenr &&
6337 found_key.type == BTRFS_EXTENT_REF_KEY) {
6338 if (level < ref_path->shared_level)
6339 ref_path->shared_level = level;
6340 goto found;
6341 }
6342next:
6343 level--;
6344 btrfs_release_path(extent_root, path);
6345 cond_resched();
6346 }
6347 /* reached lowest level */
6348 ret = 1;
6349 goto out;
6350walk_up:
6351 level = ref_path->current_level;
6352 while (level < BTRFS_MAX_LEVEL - 1) {
6353 u64 ref_objectid;
6354
6355 if (level >= 0)
6356 bytenr = ref_path->nodes[level];
6357 else
6358 bytenr = ref_path->extent_start;
6359
6360 BUG_ON(bytenr == 0);
6361
6362 key.objectid = bytenr;
6363 key.offset = 0;
6364 key.type = BTRFS_EXTENT_REF_KEY;
6365
6366 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6367 if (ret < 0)
6368 goto out;
6369
6370 leaf = path->nodes[0];
6371 nritems = btrfs_header_nritems(leaf);
6372 if (path->slots[0] >= nritems) {
6373 ret = btrfs_next_leaf(extent_root, path);
6374 if (ret < 0)
6375 goto out;
6376 if (ret > 0) {
6377 /* the extent was freed by someone */
6378 if (ref_path->lowest_level == level)
6379 goto out;
6380 btrfs_release_path(extent_root, path);
6381 goto walk_down;
6382 }
6383 leaf = path->nodes[0];
6384 }
6385
6386 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6387 if (found_key.objectid != bytenr ||
6388 found_key.type != BTRFS_EXTENT_REF_KEY) {
6389 /* the extent was freed by someone */
6390 if (ref_path->lowest_level == level) {
6391 ret = 1;
6392 goto out;
6393 }
6394 btrfs_release_path(extent_root, path);
6395 goto walk_down;
6396 }
6397found:
6398 ref = btrfs_item_ptr(leaf, path->slots[0],
6399 struct btrfs_extent_ref);
6400 ref_objectid = btrfs_ref_objectid(leaf, ref);
6401 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
6402 if (first_time) {
6403 level = (int)ref_objectid;
6404 BUG_ON(level >= BTRFS_MAX_LEVEL);
6405 ref_path->lowest_level = level;
6406 ref_path->current_level = level;
6407 ref_path->nodes[level] = bytenr;
6408 } else {
6409 WARN_ON(ref_objectid != level);
6410 }
6411 } else {
6412 WARN_ON(level != -1);
6413 }
6414 first_time = 0;
6415
6416 if (ref_path->lowest_level == level) {
6417 ref_path->owner_objectid = ref_objectid;
6418 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
6419 }
6420
6421 /*
6422 * the block is tree root or the block isn't in reference
6423 * counted tree.
6424 */
6425 if (found_key.objectid == found_key.offset ||
6426 is_cowonly_root(btrfs_ref_root(leaf, ref))) {
6427 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6428 ref_path->root_generation =
6429 btrfs_ref_generation(leaf, ref);
6430 if (level < 0) {
6431 /* special reference from the tree log */
6432 ref_path->nodes[0] = found_key.offset;
6433 ref_path->current_level = 0;
6434 }
6435 ret = 0;
6436 goto out;
6437 }
6438
6439 level++;
6440 BUG_ON(ref_path->nodes[level] != 0);
6441 ref_path->nodes[level] = found_key.offset;
6442 ref_path->current_level = level;
6443
6444 /*
6445 * the reference was created in the running transaction,
6446 * no need to continue walking up.
6447 */
6448 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
6449 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6450 ref_path->root_generation =
6451 btrfs_ref_generation(leaf, ref);
6452 ret = 0;
6453 goto out;
6454 }
6455
6456 btrfs_release_path(extent_root, path);
6457 cond_resched();
6458 }
6459 /* reached max tree level, but no tree root found. */
6460 BUG();
6461out:
6462 btrfs_free_path(path);
6463 return ret;
6464}
6465
6466static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
6467 struct btrfs_root *extent_root,
6468 struct btrfs_ref_path *ref_path,
6469 u64 extent_start)
6470{
6471 memset(ref_path, 0, sizeof(*ref_path));
6472 ref_path->extent_start = extent_start;
6473
6474 return __next_ref_path(trans, extent_root, ref_path, 1);
6475}
6476
6477static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
6478 struct btrfs_root *extent_root,
6479 struct btrfs_ref_path *ref_path)
6480{
6481 return __next_ref_path(trans, extent_root, ref_path, 0);
6482}
6483
6484static noinline int get_new_locations(struct inode *reloc_inode,
6485 struct btrfs_key *extent_key,
6486 u64 offset, int no_fragment,
6487 struct disk_extent **extents,
6488 int *nr_extents)
6489{
6490 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6491 struct btrfs_path *path;
6492 struct btrfs_file_extent_item *fi;
6493 struct extent_buffer *leaf;
6494 struct disk_extent *exts = *extents;
6495 struct btrfs_key found_key;
6496 u64 cur_pos;
6497 u64 last_byte;
6498 u32 nritems;
6499 int nr = 0;
6500 int max = *nr_extents;
6501 int ret;
6502
6503 WARN_ON(!no_fragment && *extents);
6504 if (!exts) {
6505 max = 1;
6506 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
6507 if (!exts)
6508 return -ENOMEM;
6509 }
6510
6511 path = btrfs_alloc_path();
6512 BUG_ON(!path);
6513
6514 cur_pos = extent_key->objectid - offset;
6515 last_byte = extent_key->objectid + extent_key->offset;
6516 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
6517 cur_pos, 0);
6518 if (ret < 0)
6519 goto out;
6520 if (ret > 0) {
6521 ret = -ENOENT;
6522 goto out;
6523 }
6524
6525 while (1) {
6526 leaf = path->nodes[0];
6527 nritems = btrfs_header_nritems(leaf);
6528 if (path->slots[0] >= nritems) {
6529 ret = btrfs_next_leaf(root, path);
6530 if (ret < 0)
6531 goto out;
6532 if (ret > 0)
6533 break;
6534 leaf = path->nodes[0];
6535 }
6536
6537 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6538 if (found_key.offset != cur_pos ||
6539 found_key.type != BTRFS_EXTENT_DATA_KEY ||
6540 found_key.objectid != reloc_inode->i_ino)
6541 break;
6542
6543 fi = btrfs_item_ptr(leaf, path->slots[0],
6544 struct btrfs_file_extent_item);
6545 if (btrfs_file_extent_type(leaf, fi) !=
6546 BTRFS_FILE_EXTENT_REG ||
6547 btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6548 break;
6549
6550 if (nr == max) {
6551 struct disk_extent *old = exts;
6552 max *= 2;
6553 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
6554 memcpy(exts, old, sizeof(*exts) * nr);
6555 if (old != *extents)
6556 kfree(old);
6557 }
6558
6559 exts[nr].disk_bytenr =
6560 btrfs_file_extent_disk_bytenr(leaf, fi);
6561 exts[nr].disk_num_bytes =
6562 btrfs_file_extent_disk_num_bytes(leaf, fi);
6563 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
6564 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6565 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6566 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
6567 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
6568 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
6569 fi);
6570 BUG_ON(exts[nr].offset > 0);
6571 BUG_ON(exts[nr].compression || exts[nr].encryption);
6572 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
6573
6574 cur_pos += exts[nr].num_bytes;
6575 nr++;
6576
6577 if (cur_pos + offset >= last_byte)
6578 break;
6579
6580 if (no_fragment) {
6581 ret = 1;
6582 goto out;
6583 }
6584 path->slots[0]++;
6585 }
6586
6587 BUG_ON(cur_pos + offset > last_byte);
6588 if (cur_pos + offset < last_byte) {
6589 ret = -ENOENT;
6590 goto out;
6591 }
6592 ret = 0;
6593out:
6594 btrfs_free_path(path);
6595 if (ret) {
6596 if (exts != *extents)
6597 kfree(exts);
6598 } else {
6599 *extents = exts;
6600 *nr_extents = nr;
6601 }
6602 return ret;
6603}
6604
6605static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
6606 struct btrfs_root *root,
6607 struct btrfs_path *path,
6608 struct btrfs_key *extent_key,
6609 struct btrfs_key *leaf_key,
6610 struct btrfs_ref_path *ref_path,
6611 struct disk_extent *new_extents,
6612 int nr_extents)
6613{
6614 struct extent_buffer *leaf;
6615 struct btrfs_file_extent_item *fi;
6616 struct inode *inode = NULL;
6617 struct btrfs_key key;
6618 u64 lock_start = 0;
6619 u64 lock_end = 0;
6620 u64 num_bytes;
6621 u64 ext_offset;
6622 u64 search_end = (u64)-1;
6623 u32 nritems;
6624 int nr_scaned = 0;
6625 int extent_locked = 0;
6626 int extent_type;
6627 int ret;
6628
6629 memcpy(&key, leaf_key, sizeof(key));
6630 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6631 if (key.objectid < ref_path->owner_objectid ||
6632 (key.objectid == ref_path->owner_objectid &&
6633 key.type < BTRFS_EXTENT_DATA_KEY)) {
6634 key.objectid = ref_path->owner_objectid;
6635 key.type = BTRFS_EXTENT_DATA_KEY;
6636 key.offset = 0;
6637 }
6638 }
6639
6640 while (1) {
6641 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
6642 if (ret < 0)
6643 goto out;
6644
6645 leaf = path->nodes[0];
6646 nritems = btrfs_header_nritems(leaf);
6647next:
6648 if (extent_locked && ret > 0) {
6649 /*
6650 * the file extent item was modified by someone
6651 * before the extent got locked.
6652 */
6653 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6654 lock_end, GFP_NOFS);
6655 extent_locked = 0;
6656 }
6657
6658 if (path->slots[0] >= nritems) {
6659 if (++nr_scaned > 2)
6660 break;
6661
6662 BUG_ON(extent_locked);
6663 ret = btrfs_next_leaf(root, path);
6664 if (ret < 0)
6665 goto out;
6666 if (ret > 0)
6667 break;
6668 leaf = path->nodes[0];
6669 nritems = btrfs_header_nritems(leaf);
6670 }
6671
6672 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6673
6674 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6675 if ((key.objectid > ref_path->owner_objectid) ||
6676 (key.objectid == ref_path->owner_objectid &&
6677 key.type > BTRFS_EXTENT_DATA_KEY) ||
6678 key.offset >= search_end)
6679 break;
6680 }
6681
6682 if (inode && key.objectid != inode->i_ino) {
6683 BUG_ON(extent_locked);
6684 btrfs_release_path(root, path);
6685 mutex_unlock(&inode->i_mutex);
6686 iput(inode);
6687 inode = NULL;
6688 continue;
6689 }
6690
6691 if (key.type != BTRFS_EXTENT_DATA_KEY) {
6692 path->slots[0]++;
6693 ret = 1;
6694 goto next;
6695 }
6696 fi = btrfs_item_ptr(leaf, path->slots[0],
6697 struct btrfs_file_extent_item);
6698 extent_type = btrfs_file_extent_type(leaf, fi);
6699 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
6700 extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
6701 (btrfs_file_extent_disk_bytenr(leaf, fi) !=
6702 extent_key->objectid)) {
6703 path->slots[0]++;
6704 ret = 1;
6705 goto next;
6706 }
6707
6708 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6709 ext_offset = btrfs_file_extent_offset(leaf, fi);
6710
6711 if (search_end == (u64)-1) {
6712 search_end = key.offset - ext_offset +
6713 btrfs_file_extent_ram_bytes(leaf, fi);
6714 }
6715
6716 if (!extent_locked) {
6717 lock_start = key.offset;
6718 lock_end = lock_start + num_bytes - 1;
6719 } else {
6720 if (lock_start > key.offset ||
6721 lock_end + 1 < key.offset + num_bytes) {
6722 unlock_extent(&BTRFS_I(inode)->io_tree,
6723 lock_start, lock_end, GFP_NOFS);
6724 extent_locked = 0;
6725 }
6726 }
6727
6728 if (!inode) {
6729 btrfs_release_path(root, path);
6730
6731 inode = btrfs_iget_locked(root->fs_info->sb,
6732 key.objectid, root);
6733 if (inode->i_state & I_NEW) {
6734 BTRFS_I(inode)->root = root;
6735 BTRFS_I(inode)->location.objectid =
6736 key.objectid;
6737 BTRFS_I(inode)->location.type =
6738 BTRFS_INODE_ITEM_KEY;
6739 BTRFS_I(inode)->location.offset = 0;
6740 btrfs_read_locked_inode(inode);
6741 unlock_new_inode(inode);
6742 }
6743 /*
6744 * some code calls btrfs_commit_transaction while
6745 * holding the i_mutex, so we can't use mutex_lock
6746 * here.
6747 */
6748 if (is_bad_inode(inode) ||
6749 !mutex_trylock(&inode->i_mutex)) {
6750 iput(inode);
6751 inode = NULL;
6752 key.offset = (u64)-1;
6753 goto skip;
6754 }
6755 }
6756
6757 if (!extent_locked) {
6758 struct btrfs_ordered_extent *ordered;
6759
6760 btrfs_release_path(root, path);
6761
6762 lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6763 lock_end, GFP_NOFS);
6764 ordered = btrfs_lookup_first_ordered_extent(inode,
6765 lock_end);
6766 if (ordered &&
6767 ordered->file_offset <= lock_end &&
6768 ordered->file_offset + ordered->len > lock_start) {
6769 unlock_extent(&BTRFS_I(inode)->io_tree,
6770 lock_start, lock_end, GFP_NOFS);
6771 btrfs_start_ordered_extent(inode, ordered, 1);
6772 btrfs_put_ordered_extent(ordered);
6773 key.offset += num_bytes;
6774 goto skip;
6775 }
6776 if (ordered)
6777 btrfs_put_ordered_extent(ordered);
6778
6779 extent_locked = 1;
6780 continue;
6781 }
6782
6783 if (nr_extents == 1) {
6784 /* update extent pointer in place */
6785 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6786 new_extents[0].disk_bytenr);
6787 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6788 new_extents[0].disk_num_bytes);
6789 btrfs_mark_buffer_dirty(leaf);
6790
6791 btrfs_drop_extent_cache(inode, key.offset,
6792 key.offset + num_bytes - 1, 0);
6793
6794 ret = btrfs_inc_extent_ref(trans, root,
6795 new_extents[0].disk_bytenr,
6796 new_extents[0].disk_num_bytes,
6797 leaf->start,
6798 root->root_key.objectid,
6799 trans->transid,
6800 key.objectid);
6801 BUG_ON(ret);
6802
6803 ret = btrfs_free_extent(trans, root,
6804 extent_key->objectid,
6805 extent_key->offset,
6806 leaf->start,
6807 btrfs_header_owner(leaf),
6808 btrfs_header_generation(leaf),
6809 key.objectid, 0);
6810 BUG_ON(ret);
6811
6812 btrfs_release_path(root, path);
6813 key.offset += num_bytes;
6814 } else {
6815 BUG_ON(1);
6816#if 0
6817 u64 alloc_hint;
6818 u64 extent_len;
6819 int i;
6820 /*
6821 * drop the old extent pointer first, then insert the
6822 * new pointers one by one
6823 */
6824 btrfs_release_path(root, path);
6825 ret = btrfs_drop_extents(trans, root, inode, key.offset,
6826 key.offset + num_bytes,
6827 key.offset, &alloc_hint);
6828 BUG_ON(ret);
6829
6830 for (i = 0; i < nr_extents; i++) {
6831 if (ext_offset >= new_extents[i].num_bytes) {
6832 ext_offset -= new_extents[i].num_bytes;
6833 continue;
6834 }
6835 extent_len = min(new_extents[i].num_bytes -
6836 ext_offset, num_bytes);
6837
6838 ret = btrfs_insert_empty_item(trans, root,
6839 path, &key,
6840 sizeof(*fi));
6841 BUG_ON(ret);
6842
6843 leaf = path->nodes[0];
6844 fi = btrfs_item_ptr(leaf, path->slots[0],
6845 struct btrfs_file_extent_item);
6846 btrfs_set_file_extent_generation(leaf, fi,
6847 trans->transid);
6848 btrfs_set_file_extent_type(leaf, fi,
6849 BTRFS_FILE_EXTENT_REG);
6850 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6851 new_extents[i].disk_bytenr);
6852 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6853 new_extents[i].disk_num_bytes);
6854 btrfs_set_file_extent_ram_bytes(leaf, fi,
6855 new_extents[i].ram_bytes);
6856
6857 btrfs_set_file_extent_compression(leaf, fi,
6858 new_extents[i].compression);
6859 btrfs_set_file_extent_encryption(leaf, fi,
6860 new_extents[i].encryption);
6861 btrfs_set_file_extent_other_encoding(leaf, fi,
6862 new_extents[i].other_encoding);
6863
6864 btrfs_set_file_extent_num_bytes(leaf, fi,
6865 extent_len);
6866 ext_offset += new_extents[i].offset;
6867 btrfs_set_file_extent_offset(leaf, fi,
6868 ext_offset);
6869 btrfs_mark_buffer_dirty(leaf);
6870
6871 btrfs_drop_extent_cache(inode, key.offset,
6872 key.offset + extent_len - 1, 0);
6873
6874 ret = btrfs_inc_extent_ref(trans, root,
6875 new_extents[i].disk_bytenr,
6876 new_extents[i].disk_num_bytes,
6877 leaf->start,
6878 root->root_key.objectid,
6879 trans->transid, key.objectid);
6880 BUG_ON(ret);
6881 btrfs_release_path(root, path);
6882
6883 inode_add_bytes(inode, extent_len);
6884
6885 ext_offset = 0;
6886 num_bytes -= extent_len;
6887 key.offset += extent_len;
6888
6889 if (num_bytes == 0)
6890 break;
6891 }
6892 BUG_ON(i >= nr_extents);
6893#endif
6894 }
6895
6896 if (extent_locked) {
6897 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6898 lock_end, GFP_NOFS);
6899 extent_locked = 0;
6900 }
6901skip:
6902 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
6903 key.offset >= search_end)
6904 break;
6905
6906 cond_resched();
6907 }
6908 ret = 0;
6909out:
6910 btrfs_release_path(root, path);
6911 if (inode) {
6912 mutex_unlock(&inode->i_mutex);
6913 if (extent_locked) {
6914 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6915 lock_end, GFP_NOFS);
6916 }
6917 iput(inode);
6918 }
6919 return ret;
6920}
6921
6922int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
6923 struct btrfs_root *root,
6924 struct extent_buffer *buf, u64 orig_start)
6925{
6926 int level;
6927 int ret;
6928
6929 BUG_ON(btrfs_header_generation(buf) != trans->transid);
6930 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6931
6932 level = btrfs_header_level(buf);
6933 if (level == 0) {
6934 struct btrfs_leaf_ref *ref;
6935 struct btrfs_leaf_ref *orig_ref;
6936
6937 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
6938 if (!orig_ref)
6939 return -ENOENT;
6940
6941 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
6942 if (!ref) {
6943 btrfs_free_leaf_ref(root, orig_ref);
6944 return -ENOMEM;
6945 }
6946
6947 ref->nritems = orig_ref->nritems;
6948 memcpy(ref->extents, orig_ref->extents,
6949 sizeof(ref->extents[0]) * ref->nritems);
6950
6951 btrfs_free_leaf_ref(root, orig_ref);
6952
6953 ref->root_gen = trans->transid;
6954 ref->bytenr = buf->start;
6955 ref->owner = btrfs_header_owner(buf);
6956 ref->generation = btrfs_header_generation(buf);
6957
6958 ret = btrfs_add_leaf_ref(root, ref, 0);
6959 WARN_ON(ret);
6960 btrfs_free_leaf_ref(root, ref);
6961 }
6962 return 0;
6963}
6964
6965static noinline int invalidate_extent_cache(struct btrfs_root *root,
6966 struct extent_buffer *leaf,
6967 struct btrfs_block_group_cache *group,
6968 struct btrfs_root *target_root)
6969{
6970 struct btrfs_key key;
6971 struct inode *inode = NULL;
6972 struct btrfs_file_extent_item *fi;
6973 struct extent_state *cached_state = NULL;
6974 u64 num_bytes;
6975 u64 skip_objectid = 0;
6976 u32 nritems;
6977 u32 i;
6978
6979 nritems = btrfs_header_nritems(leaf);
6980 for (i = 0; i < nritems; i++) {
6981 btrfs_item_key_to_cpu(leaf, &key, i);
6982 if (key.objectid == skip_objectid ||
6983 key.type != BTRFS_EXTENT_DATA_KEY)
6984 continue;
6985 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6986 if (btrfs_file_extent_type(leaf, fi) ==
6987 BTRFS_FILE_EXTENT_INLINE)
6988 continue;
6989 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6990 continue;
6991 if (!inode || inode->i_ino != key.objectid) {
6992 iput(inode);
6993 inode = btrfs_ilookup(target_root->fs_info->sb,
6994 key.objectid, target_root, 1);
6995 }
6996 if (!inode) {
6997 skip_objectid = key.objectid;
6998 continue;
6999 }
7000 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
7001
7002 lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
7003 key.offset + num_bytes - 1, 0, &cached_state,
7004 GFP_NOFS);
7005 btrfs_drop_extent_cache(inode, key.offset,
7006 key.offset + num_bytes - 1, 1);
7007 unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
7008 key.offset + num_bytes - 1, &cached_state,
7009 GFP_NOFS);
7010 cond_resched();
7011 }
7012 iput(inode);
7013 return 0;
7014}
7015
7016static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
7017 struct btrfs_root *root,
7018 struct extent_buffer *leaf,
7019 struct btrfs_block_group_cache *group,
7020 struct inode *reloc_inode)
7021{
7022 struct btrfs_key key;
7023 struct btrfs_key extent_key;
7024 struct btrfs_file_extent_item *fi;
7025 struct btrfs_leaf_ref *ref;
7026 struct disk_extent *new_extent;
7027 u64 bytenr;
7028 u64 num_bytes;
7029 u32 nritems;
7030 u32 i;
7031 int ext_index;
7032 int nr_extent;
7033 int ret;
7034
7035 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
7036 BUG_ON(!new_extent);
7037
7038 ref = btrfs_lookup_leaf_ref(root, leaf->start);
7039 BUG_ON(!ref);
7040
7041 ext_index = -1;
7042 nritems = btrfs_header_nritems(leaf);
7043 for (i = 0; i < nritems; i++) {
7044 btrfs_item_key_to_cpu(leaf, &key, i);
7045 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
7046 continue;
7047 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
7048 if (btrfs_file_extent_type(leaf, fi) ==
7049 BTRFS_FILE_EXTENT_INLINE)
7050 continue;
7051 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7052 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
7053 if (bytenr == 0)
7054 continue;
7055
7056 ext_index++;
7057 if (bytenr >= group->key.objectid + group->key.offset ||
7058 bytenr + num_bytes <= group->key.objectid)
7059 continue;
7060
7061 extent_key.objectid = bytenr;
7062 extent_key.offset = num_bytes;
7063 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
7064 nr_extent = 1;
7065 ret = get_new_locations(reloc_inode, &extent_key,
7066 group->key.objectid, 1,
7067 &new_extent, &nr_extent);
7068 if (ret > 0)
7069 continue;
7070 BUG_ON(ret < 0);
7071
7072 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
7073 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
7074 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
7075 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
7076
7077 btrfs_set_file_extent_disk_bytenr(leaf, fi,
7078 new_extent->disk_bytenr);
7079 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
7080 new_extent->disk_num_bytes);
7081 btrfs_mark_buffer_dirty(leaf);
7082
7083 ret = btrfs_inc_extent_ref(trans, root,
7084 new_extent->disk_bytenr,
7085 new_extent->disk_num_bytes,
7086 leaf->start,
7087 root->root_key.objectid,
7088 trans->transid, key.objectid);
7089 BUG_ON(ret);
7090
7091 ret = btrfs_free_extent(trans, root,
7092 bytenr, num_bytes, leaf->start,
7093 btrfs_header_owner(leaf),
7094 btrfs_header_generation(leaf),
7095 key.objectid, 0);
7096 BUG_ON(ret);
7097 cond_resched();
7098 }
7099 kfree(new_extent);
7100 BUG_ON(ext_index + 1 != ref->nritems);
7101 btrfs_free_leaf_ref(root, ref);
7102 return 0;
7103}
7104
7105int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
7106 struct btrfs_root *root)
7107{
7108 struct btrfs_root *reloc_root;
7109 int ret;
7110
7111 if (root->reloc_root) {
7112 reloc_root = root->reloc_root;
7113 root->reloc_root = NULL;
7114 list_add(&reloc_root->dead_list,
7115 &root->fs_info->dead_reloc_roots);
7116
7117 btrfs_set_root_bytenr(&reloc_root->root_item,
7118 reloc_root->node->start);
7119 btrfs_set_root_level(&root->root_item,
7120 btrfs_header_level(reloc_root->node));
7121 memset(&reloc_root->root_item.drop_progress, 0,
7122 sizeof(struct btrfs_disk_key));
7123 reloc_root->root_item.drop_level = 0;
7124
7125 ret = btrfs_update_root(trans, root->fs_info->tree_root,
7126 &reloc_root->root_key,
7127 &reloc_root->root_item);
7128 BUG_ON(ret);
7129 }
7130 return 0;
7131}
7132
7133int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
7134{
7135 struct btrfs_trans_handle *trans;
7136 struct btrfs_root *reloc_root;
7137 struct btrfs_root *prev_root = NULL;
7138 struct list_head dead_roots;
7139 int ret;
7140 unsigned long nr;
7141
7142 INIT_LIST_HEAD(&dead_roots);
7143 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
7144
7145 while (!list_empty(&dead_roots)) {
7146 reloc_root = list_entry(dead_roots.prev,
7147 struct btrfs_root, dead_list);
7148 list_del_init(&reloc_root->dead_list);
7149
7150 BUG_ON(reloc_root->commit_root != NULL);
7151 while (1) {
7152 trans = btrfs_join_transaction(root, 1);
7153 BUG_ON(!trans);
7154
7155 mutex_lock(&root->fs_info->drop_mutex);
7156 ret = btrfs_drop_snapshot(trans, reloc_root);
7157 if (ret != -EAGAIN)
7158 break;
7159 mutex_unlock(&root->fs_info->drop_mutex);
7160
7161 nr = trans->blocks_used;
7162 ret = btrfs_end_transaction(trans, root);
7163 BUG_ON(ret);
7164 btrfs_btree_balance_dirty(root, nr);
7165 }
7166
7167 free_extent_buffer(reloc_root->node);
7168
7169 ret = btrfs_del_root(trans, root->fs_info->tree_root,
7170 &reloc_root->root_key);
7171 BUG_ON(ret);
7172 mutex_unlock(&root->fs_info->drop_mutex);
7173
7174 nr = trans->blocks_used;
7175 ret = btrfs_end_transaction(trans, root);
7176 BUG_ON(ret);
7177 btrfs_btree_balance_dirty(root, nr);
7178
7179 kfree(prev_root);
7180 prev_root = reloc_root;
7181 }
7182 if (prev_root) {
7183 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
7184 kfree(prev_root);
7185 }
7186 return 0;
7187}
7188
7189int btrfs_add_dead_reloc_root(struct btrfs_root *root)
7190{
7191 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
7192 return 0;
7193}
7194
7195int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7196{
7197 struct btrfs_root *reloc_root;
7198 struct btrfs_trans_handle *trans;
7199 struct btrfs_key location;
7200 int found;
7201 int ret;
7202
7203 mutex_lock(&root->fs_info->tree_reloc_mutex);
7204 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
7205 BUG_ON(ret);
7206 found = !list_empty(&root->fs_info->dead_reloc_roots);
7207 mutex_unlock(&root->fs_info->tree_reloc_mutex);
7208
7209 if (found) {
7210 trans = btrfs_start_transaction(root, 1);
7211 BUG_ON(!trans);
7212 ret = btrfs_commit_transaction(trans, root);
7213 BUG_ON(ret);
7214 }
7215
7216 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
7217 location.offset = (u64)-1;
7218 location.type = BTRFS_ROOT_ITEM_KEY;
7219
7220 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
7221 BUG_ON(!reloc_root);
7222 btrfs_orphan_cleanup(reloc_root);
7223 return 0;
7224}
7225
7226static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7227 struct btrfs_root *root)
7228{
7229 struct btrfs_root *reloc_root;
7230 struct extent_buffer *eb;
7231 struct btrfs_root_item *root_item;
7232 struct btrfs_key root_key;
7233 int ret;
7234
7235 BUG_ON(!root->ref_cows);
7236 if (root->reloc_root)
7237 return 0;
7238
7239 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
7240 BUG_ON(!root_item);
7241
7242 ret = btrfs_copy_root(trans, root, root->commit_root,
7243 &eb, BTRFS_TREE_RELOC_OBJECTID);
7244 BUG_ON(ret);
7245
7246 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
7247 root_key.offset = root->root_key.objectid;
7248 root_key.type = BTRFS_ROOT_ITEM_KEY;
7249
7250 memcpy(root_item, &root->root_item, sizeof(root_item));
7251 btrfs_set_root_refs(root_item, 0);
7252 btrfs_set_root_bytenr(root_item, eb->start);
7253 btrfs_set_root_level(root_item, btrfs_header_level(eb));
7254 btrfs_set_root_generation(root_item, trans->transid);
7255
7256 btrfs_tree_unlock(eb);
7257 free_extent_buffer(eb);
7258
7259 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
7260 &root_key, root_item);
7261 BUG_ON(ret);
7262 kfree(root_item);
7263
7264 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
7265 &root_key);
7266 BUG_ON(!reloc_root);
7267 reloc_root->last_trans = trans->transid;
7268 reloc_root->commit_root = NULL;
7269 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
7270
7271 root->reloc_root = reloc_root;
7272 return 0;
7273}
7274
7275/*
7276 * Core function of space balance.
7277 *
7278 * The idea is to use reloc trees to relocate tree blocks in reference
7279 * counted roots. There is one reloc tree for each subvol, and all
7280 * reloc trees share the same root key objectid. Reloc trees are snapshots
7281 * of the latest committed roots of subvols (root->commit_root).
7282 *
7283 * To relocate a tree block referenced by a subvol, there are two steps.
7284 * COW the block through subvol's reloc tree, then update block pointer
7285 * in the subvol to point to the new block. Since all reloc trees share
7286 * the same root key objectid, doing special handling for tree blocks owned
7287 * by them is easy. Once a tree block has been COWed in one reloc tree,
7288 * we can use the resulting new block directly when the same block is
7289 * required to COW again through other reloc trees. This way, relocated
7290 * tree blocks are shared between reloc trees, so they are also shared
7291 * between subvols.
7292 */
7293static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
7294 struct btrfs_root *root,
7295 struct btrfs_path *path,
7296 struct btrfs_key *first_key,
7297 struct btrfs_ref_path *ref_path,
7298 struct btrfs_block_group_cache *group,
7299 struct inode *reloc_inode)
7300{
7301 struct btrfs_root *reloc_root;
7302 struct extent_buffer *eb = NULL;
7303 struct btrfs_key *keys;
7304 u64 *nodes;
7305 int level;
7306 int shared_level;
7307 int lowest_level = 0;
7308 int ret;
7309
7310 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
7311 lowest_level = ref_path->owner_objectid;
7312
7313 if (!root->ref_cows) {
7314 path->lowest_level = lowest_level;
7315 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
7316 BUG_ON(ret < 0);
7317 path->lowest_level = 0;
7318 btrfs_release_path(root, path);
7319 return 0;
7320 }
7321
7322 mutex_lock(&root->fs_info->tree_reloc_mutex);
7323 ret = init_reloc_tree(trans, root);
7324 BUG_ON(ret);
7325 reloc_root = root->reloc_root;
7326
7327 shared_level = ref_path->shared_level;
7328 ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
7329
7330 keys = ref_path->node_keys;
7331 nodes = ref_path->new_nodes;
7332 memset(&keys[shared_level + 1], 0,
7333 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
7334 memset(&nodes[shared_level + 1], 0,
7335 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
7336
7337 if (nodes[lowest_level] == 0) {
7338 path->lowest_level = lowest_level;
7339 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7340 0, 1);
7341 BUG_ON(ret);
7342 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
7343 eb = path->nodes[level];
7344 if (!eb || eb == reloc_root->node)
7345 break;
7346 nodes[level] = eb->start;
7347 if (level == 0)
7348 btrfs_item_key_to_cpu(eb, &keys[level], 0);
7349 else
7350 btrfs_node_key_to_cpu(eb, &keys[level], 0);
7351 }
7352 if (nodes[0] &&
7353 ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7354 eb = path->nodes[0];
7355 ret = replace_extents_in_leaf(trans, reloc_root, eb,
7356 group, reloc_inode);
7357 BUG_ON(ret);
7358 }
7359 btrfs_release_path(reloc_root, path);
7360 } else {
7361 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
7362 lowest_level);
7363 BUG_ON(ret);
7364 }
7365
7366 /*
7367 * replace tree blocks in the fs tree with tree blocks in
7368 * the reloc tree.
7369 */
7370 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
7371 BUG_ON(ret < 0);
7372
7373 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7374 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7375 0, 0);
7376 BUG_ON(ret);
7377 extent_buffer_get(path->nodes[0]);
7378 eb = path->nodes[0];
7379 btrfs_release_path(reloc_root, path);
7380 ret = invalidate_extent_cache(reloc_root, eb, group, root);
7381 BUG_ON(ret);
7382 free_extent_buffer(eb);
7383 }
7384
7385 mutex_unlock(&root->fs_info->tree_reloc_mutex);
7386 path->lowest_level = 0;
7387 return 0;
7388}
7389
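The block-sharing behaviour described in the comment above relocate_one_path() can be modelled in a few lines. Below is a minimal userspace sketch, not kernel code: the lookup table, the names and the toy allocator are all invented stand-ins for the bookkeeping the reloc trees do implicitly. Once a block has been COWed through one reloc tree, every later request for the same old block reuses the new copy.

/*
 * Toy model: map old block addresses to their relocated copies so a
 * second COW request for the same block is satisfied by sharing.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_BLOCKS 16

static uint64_t old_blk[MAX_BLOCKS];	/* old block address    */
static uint64_t new_blk[MAX_BLOCKS];	/* relocated copy       */
static int nr_relocated;
static uint64_t next_free = 1000;	/* toy allocator cursor */

static uint64_t cow_through_reloc_tree(uint64_t old)
{
	int i;

	/* already relocated via another reloc tree? share the copy */
	for (i = 0; i < nr_relocated; i++)
		if (old_blk[i] == old)
			return new_blk[i];

	/* first COW: pick a new location and remember the mapping */
	old_blk[nr_relocated] = old;
	new_blk[nr_relocated] = next_free++;
	return new_blk[nr_relocated++];
}

int main(void)
{
	/* two subvols referencing the same old block 42 get one copy */
	printf("subvol A sees %llu\n",
	       (unsigned long long)cow_through_reloc_tree(42));
	printf("subvol B sees %llu\n",
	       (unsigned long long)cow_through_reloc_tree(42));
	return 0;
}
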
7390static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
7391 struct btrfs_root *root,
7392 struct btrfs_path *path,
7393 struct btrfs_key *first_key,
7394 struct btrfs_ref_path *ref_path)
7395{
7396 int ret;
7397
7398 ret = relocate_one_path(trans, root, path, first_key,
7399 ref_path, NULL, NULL);
7400 BUG_ON(ret);
7401
7402 return 0;
7403}
7404
7405static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
7406 struct btrfs_root *extent_root,
7407 struct btrfs_path *path,
7408 struct btrfs_key *extent_key)
7409{
7410 int ret;
7411
7412 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
7413 if (ret)
7414 goto out;
7415 ret = btrfs_del_item(trans, extent_root, path);
7416out:
7417 btrfs_release_path(extent_root, path);
7418 return ret;
7419}
7420
7421static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
7422 struct btrfs_ref_path *ref_path)
7423{
7424 struct btrfs_key root_key;
7425
7426 root_key.objectid = ref_path->root_objectid;
7427 root_key.type = BTRFS_ROOT_ITEM_KEY;
7428 if (is_cowonly_root(ref_path->root_objectid))
7429 root_key.offset = 0;
7430 else
7431 root_key.offset = (u64)-1;
7432
7433 return btrfs_read_fs_root_no_name(fs_info, &root_key);
7434}
7435
7436static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7437 struct btrfs_path *path,
7438 struct btrfs_key *extent_key,
7439 struct btrfs_block_group_cache *group,
7440 struct inode *reloc_inode, int pass)
7441{
7442 struct btrfs_trans_handle *trans;
7443 struct btrfs_root *found_root;
7444 struct btrfs_ref_path *ref_path = NULL;
7445 struct disk_extent *new_extents = NULL;
7446 int nr_extents = 0;
7447 int loops;
7448 int ret;
7449 int level;
7450 struct btrfs_key first_key;
7451 u64 prev_block = 0;
7452
7453
7454 trans = btrfs_start_transaction(extent_root, 1);
7455 BUG_ON(!trans);
7456
7457 if (extent_key->objectid == 0) {
7458 ret = del_extent_zero(trans, extent_root, path, extent_key);
7459 goto out;
7460 }
7461
7462 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
7463 if (!ref_path) {
7464 ret = -ENOMEM;
7465 goto out;
7466 }
7467
7468 for (loops = 0; ; loops++) {
7469 if (loops == 0) {
7470 ret = btrfs_first_ref_path(trans, extent_root, ref_path,
7471 extent_key->objectid);
7472 } else {
7473 ret = btrfs_next_ref_path(trans, extent_root, ref_path);
7474 }
7475 if (ret < 0)
7476 goto out;
7477 if (ret > 0)
7478 break;
7479
7480 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
7481 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
7482 continue;
7483
7484 found_root = read_ref_root(extent_root->fs_info, ref_path);
7485 BUG_ON(!found_root);
7486 /*
7487 * for a reference counted tree, only process reference paths
7488 * rooted at the latest committed root.
7489 */
7490 if (found_root->ref_cows &&
7491 ref_path->root_generation != found_root->root_key.offset)
7492 continue;
7493
7494 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7495 if (pass == 0) {
7496 /*
7497 * copy data extents to new locations
7498 */
7499 u64 group_start = group->key.objectid;
7500 ret = relocate_data_extent(reloc_inode,
7501 extent_key,
7502 group_start);
7503 if (ret < 0)
7504 goto out;
7505 break;
7506 }
7507 level = 0;
7508 } else {
7509 level = ref_path->owner_objectid;
7510 }
7511
7512 if (prev_block != ref_path->nodes[level]) {
7513 struct extent_buffer *eb;
7514 u64 block_start = ref_path->nodes[level];
7515 u64 block_size = btrfs_level_size(found_root, level);
7516
7517 eb = read_tree_block(found_root, block_start,
7518 block_size, 0);
7519 btrfs_tree_lock(eb);
7520 BUG_ON(level != btrfs_header_level(eb));
7521
7522 if (level == 0)
7523 btrfs_item_key_to_cpu(eb, &first_key, 0);
7524 else
7525 btrfs_node_key_to_cpu(eb, &first_key, 0);
7526
7527 btrfs_tree_unlock(eb);
7528 free_extent_buffer(eb);
7529 prev_block = block_start;
7530 }
7531
7532 mutex_lock(&extent_root->fs_info->trans_mutex);
7533 btrfs_record_root_in_trans(found_root);
7534 mutex_unlock(&extent_root->fs_info->trans_mutex);
7535 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7536 /*
7537 * try to update data extent references while
7538 * keeping metadata shared between snapshots.
7539 */
7540 if (pass == 1) {
7541 ret = relocate_one_path(trans, found_root,
7542 path, &first_key, ref_path,
7543 group, reloc_inode);
7544 if (ret < 0)
7545 goto out;
7546 continue;
7547 }
7548 /*
7549 * use the fallback method to process the remaining
7550 * references.
7551 */
7552 if (!new_extents) {
7553 u64 group_start = group->key.objectid;
7554 new_extents = kmalloc(sizeof(*new_extents),
7555 GFP_NOFS);
7556 nr_extents = 1;
7557 ret = get_new_locations(reloc_inode,
7558 extent_key,
7559 group_start, 1,
7560 &new_extents,
7561 &nr_extents);
7562 if (ret)
7563 goto out;
7564 }
7565 ret = replace_one_extent(trans, found_root,
7566 path, extent_key,
7567 &first_key, ref_path,
7568 new_extents, nr_extents);
7569 } else {
7570 ret = relocate_tree_block(trans, found_root, path,
7571 &first_key, ref_path);
7572 }
7573 if (ret < 0)
7574 goto out;
7575 }
7576 ret = 0;
7577out:
7578 btrfs_end_transaction(trans, extent_root);
7579 kfree(new_extents);
7580 kfree(ref_path);
7581 return ret;
7582}
7583#endif
7584
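relocate_one_extent() above drives the work in passes: on pass 0 only file data is copied into the new block group (relocate_data_extent()), on pass 1 the remaining references are rewritten through relocate_one_path(), and whatever cannot be handled that way falls back to get_new_locations()/replace_one_extent(). A rough userspace model of that dispatch, with invented names and no error handling:

#include <stdio.h>

enum ref_kind { REF_DATA, REF_METADATA };

static void relocate_ref(enum ref_kind kind, int pass)
{
	if (kind == REF_DATA && pass == 0) {
		printf("pass 0: copy data extent to the new block group\n");
		return;
	}
	if (kind == REF_DATA) {
		printf("pass 1: rewrite the data extent reference\n");
		return;
	}
	printf("pass %d: relocate the tree block holding the ref\n", pass);
}

int main(void)
{
	int pass;

	for (pass = 0; pass < 2; pass++) {
		relocate_ref(REF_DATA, pass);
		relocate_ref(REF_METADATA, pass);
	}
	return 0;
}
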
7585static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 6482static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7586{ 6483{
7587 u64 num_devices; 6484 u64 num_devices;
7588 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 6485 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7589 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 6486 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7590 6487
7591 num_devices = root->fs_info->fs_devices->rw_devices; 6488 /*
6489 * we add in the count of missing devices because we want
6490 * to make sure that any RAID levels on a degraded FS
6491 * continue to be honored.
6492 */
6493 num_devices = root->fs_info->fs_devices->rw_devices +
6494 root->fs_info->fs_devices->missing_devices;
6495
7592 if (num_devices == 1) { 6496 if (num_devices == 1) {
7593 stripped |= BTRFS_BLOCK_GROUP_DUP; 6497 stripped |= BTRFS_BLOCK_GROUP_DUP;
7594 stripped = flags & ~stripped; 6498 stripped = flags & ~stripped;
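
The hunk above folds missing_devices into the device count. A tiny standalone example with made-up numbers shows why: on a two-device RAID1 array that lost one device, counting only rw_devices would yield 1, and the profile reduction below would silently treat the filesystem as single-device DUP.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t rw_devices = 1;	/* still present and writable */
	uint64_t missing_devices = 1;	/* dropped out of the array   */
	uint64_t num_devices = rw_devices + missing_devices;

	printf("effective devices: %llu (%s)\n",
	       (unsigned long long)num_devices,
	       num_devices == 1 ? "RAID1 would degrade to DUP"
				: "RAID1 honored");
	return 0;
}
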
@@ -7636,13 +6540,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7636 6540
7637 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6541 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7638 sinfo->bytes_may_use + sinfo->bytes_readonly + 6542 sinfo->bytes_may_use + sinfo->bytes_readonly +
7639 cache->reserved_pinned + num_bytes < sinfo->total_bytes) { 6543 cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
7640 sinfo->bytes_readonly += num_bytes; 6544 sinfo->bytes_readonly += num_bytes;
7641 sinfo->bytes_reserved += cache->reserved_pinned; 6545 sinfo->bytes_reserved += cache->reserved_pinned;
7642 cache->reserved_pinned = 0; 6546 cache->reserved_pinned = 0;
7643 cache->ro = 1; 6547 cache->ro = 1;
7644 ret = 0; 6548 ret = 0;
7645 } 6549 }
6550
7646 spin_unlock(&cache->lock); 6551 spin_unlock(&cache->lock);
7647 spin_unlock(&sinfo->lock); 6552 spin_unlock(&sinfo->lock);
7648 return ret; 6553 return ret;
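
The comparison fix above ('<' becomes '<=') matters exactly at the boundary where the read-only reservation consumes the last free byte of the space info. A self-contained sketch with invented sizes:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t total_bytes = 1024;	/* sinfo->total_bytes         */
	uint64_t committed   = 768;	/* used+reserved+pinned+...   */
	uint64_t num_bytes   = 256;	/* cost of flipping read-only */

	printf("strict '<'  : %s\n",
	       committed + num_bytes <  total_bytes ? "ro ok" : "refused");
	printf("relaxed '<=': %s\n",
	       committed + num_bytes <= total_bytes ? "ro ok" : "refused");
	return 0;
}
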
@@ -7658,18 +6563,20 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7658 6563
7659 BUG_ON(cache->ro); 6564 BUG_ON(cache->ro);
7660 6565
7661 trans = btrfs_join_transaction(root, 1); 6566 trans = btrfs_join_transaction(root);
7662 BUG_ON(IS_ERR(trans)); 6567 BUG_ON(IS_ERR(trans));
7663 6568
7664 alloc_flags = update_block_group_flags(root, cache->flags); 6569 alloc_flags = update_block_group_flags(root, cache->flags);
7665 if (alloc_flags != cache->flags) 6570 if (alloc_flags != cache->flags)
7666 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 6571 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6572 CHUNK_ALLOC_FORCE);
7667 6573
7668 ret = set_block_group_ro(cache); 6574 ret = set_block_group_ro(cache);
7669 if (!ret) 6575 if (!ret)
7670 goto out; 6576 goto out;
7671 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 6577 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7672 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 6578 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6579 CHUNK_ALLOC_FORCE);
7673 if (ret < 0) 6580 if (ret < 0)
7674 goto out; 6581 goto out;
7675 ret = set_block_group_ro(cache); 6582 ret = set_block_group_ro(cache);
@@ -7678,6 +6585,70 @@ out:
7678 return ret; 6585 return ret;
7679} 6586}
7680 6587
6588int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
6589 struct btrfs_root *root, u64 type)
6590{
6591 u64 alloc_flags = get_alloc_profile(root, type);
6592 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6593 CHUNK_ALLOC_FORCE);
6594}
6595
6596/*
6597 * helper to account the unused space of all the readonly block groups in
6598 * the list. Takes mirrors into account.
6599 */
6600static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
6601{
6602 struct btrfs_block_group_cache *block_group;
6603 u64 free_bytes = 0;
6604 int factor;
6605
6606 list_for_each_entry(block_group, groups_list, list) {
6607 spin_lock(&block_group->lock);
6608
6609 if (!block_group->ro) {
6610 spin_unlock(&block_group->lock);
6611 continue;
6612 }
6613
6614 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
6615 BTRFS_BLOCK_GROUP_RAID10 |
6616 BTRFS_BLOCK_GROUP_DUP))
6617 factor = 2;
6618 else
6619 factor = 1;
6620
6621 free_bytes += (block_group->key.offset -
6622 btrfs_block_group_used(&block_group->item)) *
6623 factor;
6624
6625 spin_unlock(&block_group->lock);
6626 }
6627
6628 return free_bytes;
6629}
6630
6631/*
6632 * helper to account the unused space of all the readonly block groups in
6633 * the space_info. Takes mirrors into account.
6634 */
6635u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
6636{
6637 int i;
6638 u64 free_bytes = 0;
6639
6640 spin_lock(&sinfo->lock);
6641
6642 for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
6643 if (!list_empty(&sinfo->block_groups[i]))
6644 free_bytes += __btrfs_get_ro_block_group_free_space(
6645 &sinfo->block_groups[i]);
6646
6647 spin_unlock(&sinfo->lock);
6648
6649 return free_bytes;
6650}
6651
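As a worked example of the factor applied above (sizes invented): a 1 GiB read-only RAID1 block group with 256 MiB used reports 1.5 GiB of raw free space, because every logical byte occupies two bytes on disk.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t group_size = 1024 * 1024 * 1024ULL;	/* key.offset  */
	uint64_t used = 256 * 1024 * 1024ULL;		/* bytes used  */
	int factor = 2;			/* RAID1/RAID10/DUP mirror cost */

	printf("raw free bytes: %llu\n",
	       (unsigned long long)((group_size - used) * factor));
	return 0;
}
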
7681int btrfs_set_block_group_rw(struct btrfs_root *root, 6652int btrfs_set_block_group_rw(struct btrfs_root *root,
7682 struct btrfs_block_group_cache *cache) 6653 struct btrfs_block_group_cache *cache)
7683{ 6654{
@@ -7758,7 +6729,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7758 mutex_lock(&root->fs_info->chunk_mutex); 6729 mutex_lock(&root->fs_info->chunk_mutex);
7759 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 6730 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7760 u64 min_free = btrfs_block_group_used(&block_group->item); 6731 u64 min_free = btrfs_block_group_used(&block_group->item);
7761 u64 dev_offset, max_avail; 6732 u64 dev_offset;
7762 6733
7763 /* 6734 /*
7764 * check to make sure we can actually find a chunk with enough 6735 * check to make sure we can actually find a chunk with enough
@@ -7766,7 +6737,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7766 */ 6737 */
7767 if (device->total_bytes > device->bytes_used + min_free) { 6738 if (device->total_bytes > device->bytes_used + min_free) {
7768 ret = find_free_dev_extent(NULL, device, min_free, 6739 ret = find_free_dev_extent(NULL, device, min_free,
7769 &dev_offset, &max_avail); 6740 &dev_offset, NULL);
7770 if (!ret) 6741 if (!ret)
7771 break; 6742 break;
7772 ret = -1; 6743 ret = -1;
@@ -7814,6 +6785,40 @@ out:
7814 return ret; 6785 return ret;
7815} 6786}
7816 6787
6788void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
6789{
6790 struct btrfs_block_group_cache *block_group;
6791 u64 last = 0;
6792
6793 while (1) {
6794 struct inode *inode;
6795
6796 block_group = btrfs_lookup_first_block_group(info, last);
6797 while (block_group) {
6798 spin_lock(&block_group->lock);
6799 if (block_group->iref)
6800 break;
6801 spin_unlock(&block_group->lock);
6802 block_group = next_block_group(info->tree_root,
6803 block_group);
6804 }
6805 if (!block_group) {
6806 if (last == 0)
6807 break;
6808 last = 0;
6809 continue;
6810 }
6811
6812 inode = block_group->inode;
6813 block_group->iref = 0;
6814 block_group->inode = NULL;
6815 spin_unlock(&block_group->lock);
6816 iput(inode);
6817 last = block_group->key.objectid + block_group->key.offset;
6818 btrfs_put_block_group(block_group);
6819 }
6820}
6821
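btrfs_put_block_group_cache() above scans with a resumable cursor and wraps around once it falls off the end. Termination relies on every visited group dropping its iref, so a full pass that starts from zero and finds nothing left can break out. A toy model of the same loop shape (the flags array and lookup helper are invented):

#include <stdio.h>

static int pending[4] = { 1, 1, 1, 1 };	/* toy "iref still held" flags */

static int lookup_first_from(int cursor)
{
	int i;

	for (i = cursor; i < 4; i++)
		if (pending[i])
			return i;
	return -1;
}

int main(void)
{
	int last = 2;	/* start mid-way to exercise the wrap-around */
	int idx;

	while (1) {
		idx = lookup_first_from(last);
		if (idx < 0) {
			if (last == 0)
				break;	/* full pass found nothing: done */
			last = 0;	/* wrap around and rescan        */
			continue;
		}
		printf("drop inode ref of block group %d\n", idx);
		pending[idx] = 0;
		last = idx + 1;
	}
	return 0;
}
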
7817int btrfs_free_block_groups(struct btrfs_fs_info *info) 6822int btrfs_free_block_groups(struct btrfs_fs_info *info)
7818{ 6823{
7819 struct btrfs_block_group_cache *block_group; 6824 struct btrfs_block_group_cache *block_group;
@@ -7845,6 +6850,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7845 if (block_group->cached == BTRFS_CACHE_STARTED) 6850 if (block_group->cached == BTRFS_CACHE_STARTED)
7846 wait_block_group_cache_done(block_group); 6851 wait_block_group_cache_done(block_group);
7847 6852
6853 /*
6854 * We haven't cached this block group, which means we could
6855 * possibly have excluded extents on this block group.
6856 */
6857 if (block_group->cached == BTRFS_CACHE_NO)
6858 free_excluded_extents(info->extent_root, block_group);
6859
7848 btrfs_remove_free_space_cache(block_group); 6860 btrfs_remove_free_space_cache(block_group);
7849 btrfs_put_block_group(block_group); 6861 btrfs_put_block_group(block_group);
7850 6862
@@ -7897,6 +6909,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7897 struct btrfs_key key; 6909 struct btrfs_key key;
7898 struct btrfs_key found_key; 6910 struct btrfs_key found_key;
7899 struct extent_buffer *leaf; 6911 struct extent_buffer *leaf;
6912 int need_clear = 0;
6913 u64 cache_gen;
7900 6914
7901 root = info->extent_root; 6915 root = info->extent_root;
7902 key.objectid = 0; 6916 key.objectid = 0;
@@ -7905,6 +6919,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7905 path = btrfs_alloc_path(); 6919 path = btrfs_alloc_path();
7906 if (!path) 6920 if (!path)
7907 return -ENOMEM; 6921 return -ENOMEM;
6922 path->reada = 1;
6923
6924 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
6925 if (cache_gen != 0 &&
6926 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
6927 need_clear = 1;
6928 if (btrfs_test_opt(root, CLEAR_CACHE))
6929 need_clear = 1;
6930 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
6931 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7908 6932
7909 while (1) { 6933 while (1) {
7910 ret = find_first_block_group(root, path, &key); 6934 ret = find_first_block_group(root, path, &key);
@@ -7912,7 +6936,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7912 break; 6936 break;
7913 if (ret != 0) 6937 if (ret != 0)
7914 goto error; 6938 goto error;
7915
7916 leaf = path->nodes[0]; 6939 leaf = path->nodes[0];
7917 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6940 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7918 cache = kzalloc(sizeof(*cache), GFP_NOFS); 6941 cache = kzalloc(sizeof(*cache), GFP_NOFS);
@@ -7920,21 +6943,22 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7920 ret = -ENOMEM; 6943 ret = -ENOMEM;
7921 goto error; 6944 goto error;
7922 } 6945 }
6946 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
6947 GFP_NOFS);
6948 if (!cache->free_space_ctl) {
6949 kfree(cache);
6950 ret = -ENOMEM;
6951 goto error;
6952 }
7923 6953
7924 atomic_set(&cache->count, 1); 6954 atomic_set(&cache->count, 1);
7925 spin_lock_init(&cache->lock); 6955 spin_lock_init(&cache->lock);
7926 spin_lock_init(&cache->tree_lock);
7927 cache->fs_info = info; 6956 cache->fs_info = info;
7928 INIT_LIST_HEAD(&cache->list); 6957 INIT_LIST_HEAD(&cache->list);
7929 INIT_LIST_HEAD(&cache->cluster_list); 6958 INIT_LIST_HEAD(&cache->cluster_list);
7930 6959
7931 /* 6960 if (need_clear)
7932 * we only want to have 32k of ram per block group for keeping 6961 cache->disk_cache_state = BTRFS_DC_CLEAR;
7933 * track of free space, and if we pass 1/2 of that we want to
7934 * start converting things over to using bitmaps
7935 */
7936 cache->extents_thresh = ((1024 * 32) / 2) /
7937 sizeof(struct btrfs_free_space);
7938 6962
7939 read_extent_buffer(leaf, &cache->item, 6963 read_extent_buffer(leaf, &cache->item,
7940 btrfs_item_ptr_offset(leaf, path->slots[0]), 6964 btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -7942,10 +6966,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7942 memcpy(&cache->key, &found_key, sizeof(found_key)); 6966 memcpy(&cache->key, &found_key, sizeof(found_key));
7943 6967
7944 key.objectid = found_key.objectid + found_key.offset; 6968 key.objectid = found_key.objectid + found_key.offset;
7945 btrfs_release_path(root, path); 6969 btrfs_release_path(path);
7946 cache->flags = btrfs_block_group_flags(&cache->item); 6970 cache->flags = btrfs_block_group_flags(&cache->item);
7947 cache->sectorsize = root->sectorsize; 6971 cache->sectorsize = root->sectorsize;
7948 6972
6973 btrfs_init_free_space_ctl(cache);
6974
6975 /*
6976 * We need to exclude the super stripes now so that the space
6977 * info has super bytes accounted for, otherwise we'll think
6978 * we have more space than we actually do.
6979 */
6980 exclude_super_stripes(root, cache);
6981
7949 /* 6982 /*
7950 * check for two cases, either we are full, and therefore 6983 * check for two cases, either we are full, and therefore
7951 * don't need to bother with the caching work since we won't 6984 * don't need to bother with the caching work since we won't
@@ -7954,12 +6987,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7954 * time, particularly in the full case. 6987 * time, particularly in the full case.
7955 */ 6988 */
7956 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 6989 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7957 exclude_super_stripes(root, cache);
7958 cache->last_byte_to_unpin = (u64)-1; 6990 cache->last_byte_to_unpin = (u64)-1;
7959 cache->cached = BTRFS_CACHE_FINISHED; 6991 cache->cached = BTRFS_CACHE_FINISHED;
7960 free_excluded_extents(root, cache); 6992 free_excluded_extents(root, cache);
7961 } else if (btrfs_block_group_used(&cache->item) == 0) { 6993 } else if (btrfs_block_group_used(&cache->item) == 0) {
7962 exclude_super_stripes(root, cache);
7963 cache->last_byte_to_unpin = (u64)-1; 6994 cache->last_byte_to_unpin = (u64)-1;
7964 cache->cached = BTRFS_CACHE_FINISHED; 6995 cache->cached = BTRFS_CACHE_FINISHED;
7965 add_new_free_space(cache, root->fs_info, 6996 add_new_free_space(cache, root->fs_info,
@@ -8027,25 +7058,26 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8027 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7058 cache = kzalloc(sizeof(*cache), GFP_NOFS);
8028 if (!cache) 7059 if (!cache)
8029 return -ENOMEM; 7060 return -ENOMEM;
7061 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7062 GFP_NOFS);
7063 if (!cache->free_space_ctl) {
7064 kfree(cache);
7065 return -ENOMEM;
7066 }
8030 7067
8031 cache->key.objectid = chunk_offset; 7068 cache->key.objectid = chunk_offset;
8032 cache->key.offset = size; 7069 cache->key.offset = size;
8033 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7070 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8034 cache->sectorsize = root->sectorsize; 7071 cache->sectorsize = root->sectorsize;
7072 cache->fs_info = root->fs_info;
8035 7073
8036 /*
8037 * we only want to have 32k of ram per block group for keeping track
8038 * of free space, and if we pass 1/2 of that we want to start
8039 * converting things over to using bitmaps
8040 */
8041 cache->extents_thresh = ((1024 * 32) / 2) /
8042 sizeof(struct btrfs_free_space);
8043 atomic_set(&cache->count, 1); 7074 atomic_set(&cache->count, 1);
8044 spin_lock_init(&cache->lock); 7075 spin_lock_init(&cache->lock);
8045 spin_lock_init(&cache->tree_lock);
8046 INIT_LIST_HEAD(&cache->list); 7076 INIT_LIST_HEAD(&cache->list);
8047 INIT_LIST_HEAD(&cache->cluster_list); 7077 INIT_LIST_HEAD(&cache->cluster_list);
8048 7078
7079 btrfs_init_free_space_ctl(cache);
7080
8049 btrfs_set_block_group_used(&cache->item, bytes_used); 7081 btrfs_set_block_group_used(&cache->item, bytes_used);
8050 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 7082 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8051 cache->flags = type; 7083 cache->flags = type;
@@ -8088,8 +7120,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8088 struct btrfs_path *path; 7120 struct btrfs_path *path;
8089 struct btrfs_block_group_cache *block_group; 7121 struct btrfs_block_group_cache *block_group;
8090 struct btrfs_free_cluster *cluster; 7122 struct btrfs_free_cluster *cluster;
7123 struct btrfs_root *tree_root = root->fs_info->tree_root;
8091 struct btrfs_key key; 7124 struct btrfs_key key;
7125 struct inode *inode;
8092 int ret; 7126 int ret;
7127 int factor;
8093 7128
8094 root = root->fs_info->extent_root; 7129 root = root->fs_info->extent_root;
8095 7130
@@ -8097,7 +7132,19 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8097 BUG_ON(!block_group); 7132 BUG_ON(!block_group);
8098 BUG_ON(!block_group->ro); 7133 BUG_ON(!block_group->ro);
8099 7134
7135 /*
7136 * Free the reserved super bytes from this block group before
7137 * removing it.
7138 */
7139 free_excluded_extents(root, block_group);
7140
8100 memcpy(&key, &block_group->key, sizeof(key)); 7141 memcpy(&key, &block_group->key, sizeof(key));
7142 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7143 BTRFS_BLOCK_GROUP_RAID1 |
7144 BTRFS_BLOCK_GROUP_RAID10))
7145 factor = 2;
7146 else
7147 factor = 1;
8101 7148
8102 /* make sure this block group isn't part of an allocation cluster */ 7149 /* make sure this block group isn't part of an allocation cluster */
8103 cluster = &root->fs_info->data_alloc_cluster; 7150 cluster = &root->fs_info->data_alloc_cluster;
@@ -8117,6 +7164,40 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8117 path = btrfs_alloc_path(); 7164 path = btrfs_alloc_path();
8118 BUG_ON(!path); 7165 BUG_ON(!path);
8119 7166
7167 inode = lookup_free_space_inode(root, block_group, path);
7168 if (!IS_ERR(inode)) {
7169 btrfs_orphan_add(trans, inode);
7170 clear_nlink(inode);
7171 /* One for the block groups ref */
7172 spin_lock(&block_group->lock);
7173 if (block_group->iref) {
7174 block_group->iref = 0;
7175 block_group->inode = NULL;
7176 spin_unlock(&block_group->lock);
7177 iput(inode);
7178 } else {
7179 spin_unlock(&block_group->lock);
7180 }
7181 /* One for our lookup ref */
7182 iput(inode);
7183 }
7184
7185 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
7186 key.offset = block_group->key.objectid;
7187 key.type = 0;
7188
7189 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
7190 if (ret < 0)
7191 goto out;
7192 if (ret > 0)
7193 btrfs_release_path(path);
7194 if (ret == 0) {
7195 ret = btrfs_del_item(trans, tree_root, path);
7196 if (ret)
7197 goto out;
7198 btrfs_release_path(path);
7199 }
7200
8120 spin_lock(&root->fs_info->block_group_cache_lock); 7201 spin_lock(&root->fs_info->block_group_cache_lock);
8121 rb_erase(&block_group->cache_node, 7202 rb_erase(&block_group->cache_node,
8122 &root->fs_info->block_group_cache_tree); 7203 &root->fs_info->block_group_cache_tree);
@@ -8138,8 +7219,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8138 spin_lock(&block_group->space_info->lock); 7219 spin_lock(&block_group->space_info->lock);
8139 block_group->space_info->total_bytes -= block_group->key.offset; 7220 block_group->space_info->total_bytes -= block_group->key.offset;
8140 block_group->space_info->bytes_readonly -= block_group->key.offset; 7221 block_group->space_info->bytes_readonly -= block_group->key.offset;
7222 block_group->space_info->disk_total -= block_group->key.offset * factor;
8141 spin_unlock(&block_group->space_info->lock); 7223 spin_unlock(&block_group->space_info->lock);
8142 7224
7225 memcpy(&key, &block_group->key, sizeof(key));
7226
8143 btrfs_clear_space_info_full(root->fs_info); 7227 btrfs_clear_space_info_full(root->fs_info);
8144 7228
8145 btrfs_put_block_group(block_group); 7229 btrfs_put_block_group(block_group);
@@ -8156,3 +7240,100 @@ out:
8156 btrfs_free_path(path); 7240 btrfs_free_path(path);
8157 return ret; 7241 return ret;
8158} 7242}
7243
7244int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7245{
7246 struct btrfs_space_info *space_info;
7247 struct btrfs_super_block *disk_super;
7248 u64 features;
7249 u64 flags;
7250 int mixed = 0;
7251 int ret;
7252
7253 disk_super = &fs_info->super_copy;
7254 if (!btrfs_super_root(disk_super))
7255 return 1;
7256
7257 features = btrfs_super_incompat_flags(disk_super);
7258 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
7259 mixed = 1;
7260
7261 flags = BTRFS_BLOCK_GROUP_SYSTEM;
7262 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7263 if (ret)
7264 goto out;
7265
7266 if (mixed) {
7267 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
7268 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7269 } else {
7270 flags = BTRFS_BLOCK_GROUP_METADATA;
7271 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7272 if (ret)
7273 goto out;
7274
7275 flags = BTRFS_BLOCK_GROUP_DATA;
7276 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7277 }
7278out:
7279 return ret;
7280}
7281
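The control flow of btrfs_init_space_info() above reduces to one decision: a filesystem with the mixed incompat bit gets a single combined metadata+data space info, everything else gets the two separately, and SYSTEM is created in both cases. A trivial sketch of that branching, with printf standing in for update_space_info():

#include <stdio.h>

static void init_space_info(int mixed)
{
	/* the SYSTEM space info is always created first */
	printf("space info: SYSTEM\n");
	if (mixed) {
		printf("space info: METADATA|DATA (mixed)\n");
	} else {
		printf("space info: METADATA\n");
		printf("space info: DATA\n");
	}
}

int main(void)
{
	init_space_info(0);	/* regular filesystem                      */
	init_space_info(1);	/* BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS set */
	return 0;
}
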
7282int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
7283{
7284 return unpin_extent_range(root, start, end);
7285}
7286
7287int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
7288 u64 num_bytes, u64 *actual_bytes)
7289{
7290 return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
7291}
7292
7293int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
7294{
7295 struct btrfs_fs_info *fs_info = root->fs_info;
7296 struct btrfs_block_group_cache *cache = NULL;
7297 u64 group_trimmed;
7298 u64 start;
7299 u64 end;
7300 u64 trimmed = 0;
7301 int ret = 0;
7302
7303 cache = btrfs_lookup_block_group(fs_info, range->start);
7304
7305 while (cache) {
7306 if (cache->key.objectid >= (range->start + range->len)) {
7307 btrfs_put_block_group(cache);
7308 break;
7309 }
7310
7311 start = max(range->start, cache->key.objectid);
7312 end = min(range->start + range->len,
7313 cache->key.objectid + cache->key.offset);
7314
7315 if (end - start >= range->minlen) {
7316 if (!block_group_cache_done(cache)) {
7317 ret = cache_block_group(cache, NULL, root, 0);
7318 if (!ret)
7319 wait_block_group_cache_done(cache);
7320 }
7321 ret = btrfs_trim_block_group(cache,
7322 &group_trimmed,
7323 start,
7324 end,
7325 range->minlen);
7326
7327 trimmed += group_trimmed;
7328 if (ret) {
7329 btrfs_put_block_group(cache);
7330 break;
7331 }
7332 }
7333
7334 cache = next_block_group(fs_info->tree_root, cache);
7335 }
7336
7337 range->len = trimmed;
7338 return ret;
7339}
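
Each iteration of btrfs_trim_fs() above clamps the user-supplied window to the current block group before trimming, so only the overlap of [range->start, range->start + range->len) with the group is passed down. A standalone sketch of just that clamping arithmetic, with invented group boundaries:

#include <stdio.h>
#include <stdint.h>

static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }
static uint64_t min64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
	uint64_t range_start = 1500, range_len = 2000;	/* user window */
	/* toy block groups: { key.objectid, key.offset } */
	uint64_t groups[3][2] = { {0, 1024}, {1024, 2048}, {3072, 1024} };
	int i;

	for (i = 0; i < 3; i++) {
		uint64_t start = max64(range_start, groups[i][0]);
		uint64_t end = min64(range_start + range_len,
				     groups[i][0] + groups[i][1]);

		if (end > start)
			printf("group %d: trim [%llu, %llu)\n", i,
			       (unsigned long long)start,
			       (unsigned long long)end);
	}
	return 0;
}
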
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d74e6af9b53a..7055d11c1efd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -10,6 +10,8 @@
10#include <linux/swap.h> 10#include <linux/swap.h>
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/pagevec.h> 12#include <linux/pagevec.h>
13#include <linux/prefetch.h>
14#include <linux/cleancache.h>
13#include "extent_io.h" 15#include "extent_io.h"
14#include "extent_map.h" 16#include "extent_map.h"
15#include "compat.h" 17#include "compat.h"
@@ -101,10 +103,10 @@ void extent_io_exit(void)
101} 103}
102 104
103void extent_io_tree_init(struct extent_io_tree *tree, 105void extent_io_tree_init(struct extent_io_tree *tree,
104 struct address_space *mapping, gfp_t mask) 106 struct address_space *mapping)
105{ 107{
106 tree->state = RB_ROOT; 108 tree->state = RB_ROOT;
107 tree->buffer = RB_ROOT; 109 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
108 tree->ops = NULL; 110 tree->ops = NULL;
109 tree->dirty_bytes = 0; 111 tree->dirty_bytes = 0;
110 spin_lock_init(&tree->lock); 112 spin_lock_init(&tree->lock);
@@ -235,50 +237,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
235 return ret; 237 return ret;
236} 238}
237 239
238static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
239 u64 offset, struct rb_node *node)
240{
241 struct rb_root *root = &tree->buffer;
242 struct rb_node **p = &root->rb_node;
243 struct rb_node *parent = NULL;
244 struct extent_buffer *eb;
245
246 while (*p) {
247 parent = *p;
248 eb = rb_entry(parent, struct extent_buffer, rb_node);
249
250 if (offset < eb->start)
251 p = &(*p)->rb_left;
252 else if (offset > eb->start)
253 p = &(*p)->rb_right;
254 else
255 return eb;
256 }
257
258 rb_link_node(node, parent, p);
259 rb_insert_color(node, root);
260 return NULL;
261}
262
263static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
264 u64 offset)
265{
266 struct rb_root *root = &tree->buffer;
267 struct rb_node *n = root->rb_node;
268 struct extent_buffer *eb;
269
270 while (n) {
271 eb = rb_entry(n, struct extent_buffer, rb_node);
272 if (offset < eb->start)
273 n = n->rb_left;
274 else if (offset > eb->start)
275 n = n->rb_right;
276 else
277 return eb;
278 }
279 return NULL;
280}
281
282static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 240static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
283 struct extent_state *other) 241 struct extent_state *other)
284{ 242{
@@ -483,6 +441,15 @@ static int clear_state_bit(struct extent_io_tree *tree,
483 return ret; 441 return ret;
484} 442}
485 443
444static struct extent_state *
445alloc_extent_state_atomic(struct extent_state *prealloc)
446{
447 if (!prealloc)
448 prealloc = alloc_extent_state(GFP_ATOMIC);
449
450 return prealloc;
451}
452
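alloc_extent_state_atomic() above implements a preallocate-then-reuse pattern: callers carry a spare extent_state into the locked region and only fall back to a GFP_ATOMIC allocation when the spare has already been consumed. A userspace sketch of the calling pattern, with malloc standing in for the atomic allocation:

#include <stdio.h>
#include <stdlib.h>

struct state { int dummy; };

static struct state *alloc_state_atomic(struct state *prealloc)
{
	/* reuse the spare if we still have one, else allocate */
	if (!prealloc)
		prealloc = malloc(sizeof(*prealloc));
	return prealloc;
}

int main(void)
{
	struct state *prealloc = NULL;
	int i;

	for (i = 0; i < 3; i++) {
		prealloc = alloc_state_atomic(prealloc);
		if (!prealloc)
			return 1;	/* allocation failed */
		if (i == 1) {
			/* pretend this iteration consumed the spare */
			free(prealloc);
			prealloc = NULL;
		}
	}
	free(prealloc);
	return 0;
}
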
486/* 453/*
487 * clear some bits on a range in the tree. This may require splitting 454 * clear some bits on a range in the tree. This may require splitting
488 * or inserting elements in the tree, so the gfp mask is used to 455 * or inserting elements in the tree, so the gfp mask is used to
@@ -573,8 +540,8 @@ hit_next:
573 */ 540 */
574 541
575 if (state->start < start) { 542 if (state->start < start) {
576 if (!prealloc) 543 prealloc = alloc_extent_state_atomic(prealloc);
577 prealloc = alloc_extent_state(GFP_ATOMIC); 544 BUG_ON(!prealloc);
578 err = split_state(tree, state, prealloc, start); 545 err = split_state(tree, state, prealloc, start);
579 BUG_ON(err == -EEXIST); 546 BUG_ON(err == -EEXIST);
580 prealloc = NULL; 547 prealloc = NULL;
@@ -595,8 +562,8 @@ hit_next:
595 * on the first half 562 * on the first half
596 */ 563 */
597 if (state->start <= end && state->end > end) { 564 if (state->start <= end && state->end > end) {
598 if (!prealloc) 565 prealloc = alloc_extent_state_atomic(prealloc);
599 prealloc = alloc_extent_state(GFP_ATOMIC); 566 BUG_ON(!prealloc);
600 err = split_state(tree, state, prealloc, end + 1); 567 err = split_state(tree, state, prealloc, end + 1);
601 BUG_ON(err == -EEXIST); 568 BUG_ON(err == -EEXIST);
602 if (wake) 569 if (wake)
@@ -734,6 +701,15 @@ static void cache_state(struct extent_state *state,
734 } 701 }
735} 702}
736 703
704static void uncache_state(struct extent_state **cached_ptr)
705{
706 if (cached_ptr && (*cached_ptr)) {
707 struct extent_state *state = *cached_ptr;
708 *cached_ptr = NULL;
709 free_extent_state(state);
710 }
711}
712
737/* 713/*
738 * set some bits on a range in the tree. This may require allocations or 714 * set some bits on a range in the tree. This may require allocations or
739 * sleeping, so the gfp mask is used to indicate what is allowed. 715 * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -760,8 +736,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
760again: 736again:
761 if (!prealloc && (mask & __GFP_WAIT)) { 737 if (!prealloc && (mask & __GFP_WAIT)) {
762 prealloc = alloc_extent_state(mask); 738 prealloc = alloc_extent_state(mask);
763 if (!prealloc) 739 BUG_ON(!prealloc);
764 return -ENOMEM;
765 } 740 }
766 741
767 spin_lock(&tree->lock); 742 spin_lock(&tree->lock);
@@ -778,6 +753,8 @@ again:
778 */ 753 */
779 node = tree_search(tree, start); 754 node = tree_search(tree, start);
780 if (!node) { 755 if (!node) {
756 prealloc = alloc_extent_state_atomic(prealloc);
757 BUG_ON(!prealloc);
781 err = insert_state(tree, prealloc, start, end, &bits); 758 err = insert_state(tree, prealloc, start, end, &bits);
782 prealloc = NULL; 759 prealloc = NULL;
783 BUG_ON(err == -EEXIST); 760 BUG_ON(err == -EEXIST);
@@ -806,20 +783,18 @@ hit_next:
806 if (err) 783 if (err)
807 goto out; 784 goto out;
808 785
786 next_node = rb_next(node);
809 cache_state(state, cached_state); 787 cache_state(state, cached_state);
810 merge_state(tree, state); 788 merge_state(tree, state);
811 if (last_end == (u64)-1) 789 if (last_end == (u64)-1)
812 goto out; 790 goto out;
813 791
814 start = last_end + 1; 792 start = last_end + 1;
815 if (start < end && prealloc && !need_resched()) { 793 if (next_node && start < end && prealloc && !need_resched()) {
816 next_node = rb_next(node); 794 state = rb_entry(next_node, struct extent_state,
817 if (next_node) { 795 rb_node);
818 state = rb_entry(next_node, struct extent_state, 796 if (state->start == start)
819 rb_node); 797 goto hit_next;
820 if (state->start == start)
821 goto hit_next;
822 }
823 } 798 }
824 goto search_again; 799 goto search_again;
825 } 800 }
@@ -846,6 +821,9 @@ hit_next:
846 err = -EEXIST; 821 err = -EEXIST;
847 goto out; 822 goto out;
848 } 823 }
824
825 prealloc = alloc_extent_state_atomic(prealloc);
826 BUG_ON(!prealloc);
849 err = split_state(tree, state, prealloc, start); 827 err = split_state(tree, state, prealloc, start);
850 BUG_ON(err == -EEXIST); 828 BUG_ON(err == -EEXIST);
851 prealloc = NULL; 829 prealloc = NULL;
@@ -876,14 +854,25 @@ hit_next:
876 this_end = end; 854 this_end = end;
877 else 855 else
878 this_end = last_start - 1; 856 this_end = last_start - 1;
857
858 prealloc = alloc_extent_state_atomic(prealloc);
859 BUG_ON(!prealloc);
860
861 /*
862 * Avoid freeing 'prealloc' if it can be merged with
863 * the later extent.
864 */
865 atomic_inc(&prealloc->refs);
879 err = insert_state(tree, prealloc, start, this_end, 866 err = insert_state(tree, prealloc, start, this_end,
880 &bits); 867 &bits);
881 BUG_ON(err == -EEXIST); 868 BUG_ON(err == -EEXIST);
882 if (err) { 869 if (err) {
870 free_extent_state(prealloc);
883 prealloc = NULL; 871 prealloc = NULL;
884 goto out; 872 goto out;
885 } 873 }
886 cache_state(prealloc, cached_state); 874 cache_state(prealloc, cached_state);
875 free_extent_state(prealloc);
887 prealloc = NULL; 876 prealloc = NULL;
888 start = this_end + 1; 877 start = this_end + 1;
889 goto search_again; 878 goto search_again;
@@ -900,6 +889,9 @@ hit_next:
900 err = -EEXIST; 889 err = -EEXIST;
901 goto out; 890 goto out;
902 } 891 }
892
893 prealloc = alloc_extent_state_atomic(prealloc);
894 BUG_ON(!prealloc);
903 err = split_state(tree, state, prealloc, end + 1); 895 err = split_state(tree, state, prealloc, end + 1);
904 BUG_ON(err == -EEXIST); 896 BUG_ON(err == -EEXIST);
905 897
@@ -976,18 +968,11 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
976 NULL, mask); 968 NULL, mask);
977} 969}
978 970
979static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
980 gfp_t mask)
981{
982 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
983 NULL, mask);
984}
985
986int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 971int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
987 gfp_t mask) 972 struct extent_state **cached_state, gfp_t mask)
988{ 973{
989 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 974 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
990 NULL, mask); 975 NULL, cached_state, mask);
991} 976}
992 977
993static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 978static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -998,11 +983,6 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
998 cached_state, mask); 983 cached_state, mask);
999} 984}
1000 985
1001int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1002{
1003 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
1004}
1005
1006/* 986/*
1007 * either insert or lock the state struct between start and end; use mask to tell 987 * either insert or lock the state struct between start and end; use mask to tell
1008 * us if waiting is desired. 988 * us if waiting is desired.
@@ -1056,33 +1036,13 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1056 mask); 1036 mask);
1057} 1037}
1058 1038
1059int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1039int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1060 gfp_t mask)
1061{ 1040{
1062 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1041 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1063 mask); 1042 mask);
1064} 1043}
1065 1044
1066/* 1045/*
1067 * helper function to set pages and extents in the tree dirty
1068 */
1069int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
1070{
1071 unsigned long index = start >> PAGE_CACHE_SHIFT;
1072 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1073 struct page *page;
1074
1075 while (index <= end_index) {
1076 page = find_get_page(tree->mapping, index);
1077 BUG_ON(!page);
1078 __set_page_dirty_nobuffers(page);
1079 page_cache_release(page);
1080 index++;
1081 }
1082 return 0;
1083}
1084
1085/*
1086 * helper function to set both pages and extents in the tree writeback 1046 * helper function to set both pages and extents in the tree writeback
1087 */ 1047 */
1088static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1048static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1477,12 +1437,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1477 */ 1437 */
1478u64 count_range_bits(struct extent_io_tree *tree, 1438u64 count_range_bits(struct extent_io_tree *tree,
1479 u64 *start, u64 search_end, u64 max_bytes, 1439 u64 *start, u64 search_end, u64 max_bytes,
1480 unsigned long bits) 1440 unsigned long bits, int contig)
1481{ 1441{
1482 struct rb_node *node; 1442 struct rb_node *node;
1483 struct extent_state *state; 1443 struct extent_state *state;
1484 u64 cur_start = *start; 1444 u64 cur_start = *start;
1485 u64 total_bytes = 0; 1445 u64 total_bytes = 0;
1446 u64 last = 0;
1486 int found = 0; 1447 int found = 0;
1487 1448
1488 if (search_end <= cur_start) { 1449 if (search_end <= cur_start) {
@@ -1507,15 +1468,20 @@ u64 count_range_bits(struct extent_io_tree *tree,
1507 state = rb_entry(node, struct extent_state, rb_node); 1468 state = rb_entry(node, struct extent_state, rb_node);
1508 if (state->start > search_end) 1469 if (state->start > search_end)
1509 break; 1470 break;
1510 if (state->end >= cur_start && (state->state & bits)) { 1471 if (contig && found && state->start > last + 1)
1472 break;
1473 if (state->end >= cur_start && (state->state & bits) == bits) {
1511 total_bytes += min(search_end, state->end) + 1 - 1474 total_bytes += min(search_end, state->end) + 1 -
1512 max(cur_start, state->start); 1475 max(cur_start, state->start);
1513 if (total_bytes >= max_bytes) 1476 if (total_bytes >= max_bytes)
1514 break; 1477 break;
1515 if (!found) { 1478 if (!found) {
1516 *start = state->start; 1479 *start = max(cur_start, state->start);
1517 found = 1; 1480 found = 1;
1518 } 1481 }
1482 last = state->end;
1483 } else if (contig && found) {
1484 break;
1519 } 1485 }
1520 node = rb_next(node); 1486 node = rb_next(node);
1521 if (!node) 1487 if (!node)
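
The contig flag threaded through count_range_bits() above changes the accounting from "sum every matching range inside the window" to "stop at the first gap between matching ranges". A toy model over invented, sorted ranges:

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; };	/* inclusive, sorted */

static uint64_t count_bytes(const struct range *r, int n, int contig)
{
	uint64_t total = 0, last = 0;
	int i, found = 0;

	for (i = 0; i < n; i++) {
		if (contig && found && r[i].start > last + 1)
			break;		/* gap: stop in contig mode */
		total += r[i].end + 1 - r[i].start;
		last = r[i].end;
		found = 1;
	}
	return total;
}

int main(void)
{
	struct range r[] = { {0, 99}, {100, 199}, {300, 399} };

	printf("all matches: %llu\n",
	       (unsigned long long)count_bytes(r, 3, 0));	/* 300 */
	printf("contiguous : %llu\n",
	       (unsigned long long)count_bytes(r, 3, 1));	/* 200 */
	return 0;
}
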
@@ -1773,6 +1739,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1773 1739
1774 do { 1740 do {
1775 struct page *page = bvec->bv_page; 1741 struct page *page = bvec->bv_page;
1742 struct extent_state *cached = NULL;
1743 struct extent_state *state;
1744
1776 tree = &BTRFS_I(page->mapping->host)->io_tree; 1745 tree = &BTRFS_I(page->mapping->host)->io_tree;
1777 1746
1778 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1747 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1787,9 +1756,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1787 if (++bvec <= bvec_end) 1756 if (++bvec <= bvec_end)
1788 prefetchw(&bvec->bv_page->flags); 1757 prefetchw(&bvec->bv_page->flags);
1789 1758
1759 spin_lock(&tree->lock);
1760 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
1761 if (state && state->start == start) {
1762 /*
1763 * take a reference on the state; unlock will drop
1764 * the ref
1765 */
1766 cache_state(state, &cached);
1767 }
1768 spin_unlock(&tree->lock);
1769
1790 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1770 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1791 ret = tree->ops->readpage_end_io_hook(page, start, end, 1771 ret = tree->ops->readpage_end_io_hook(page, start, end,
1792 NULL); 1772 state);
1793 if (ret) 1773 if (ret)
1794 uptodate = 0; 1774 uptodate = 0;
1795 } 1775 }
@@ -1802,15 +1782,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1802 test_bit(BIO_UPTODATE, &bio->bi_flags); 1782 test_bit(BIO_UPTODATE, &bio->bi_flags);
1803 if (err) 1783 if (err)
1804 uptodate = 0; 1784 uptodate = 0;
1785 uncache_state(&cached);
1805 continue; 1786 continue;
1806 } 1787 }
1807 } 1788 }
1808 1789
1809 if (uptodate) { 1790 if (uptodate) {
1810 set_extent_uptodate(tree, start, end, 1791 set_extent_uptodate(tree, start, end, &cached,
1811 GFP_ATOMIC); 1792 GFP_ATOMIC);
1812 } 1793 }
1813 unlock_extent(tree, start, end, GFP_ATOMIC); 1794 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1814 1795
1815 if (whole_page) { 1796 if (whole_page) {
1816 if (uptodate) { 1797 if (uptodate) {
@@ -1834,47 +1815,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1834 bio_put(bio); 1815 bio_put(bio);
1835} 1816}
1836 1817
1837/* 1818struct bio *
1838 * IO done from prepare_write is pretty simple, we just unlock 1819btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1839 * the structs in the extent tree when done, and set the uptodate bits 1820 gfp_t gfp_flags)
1840 * as appropriate.
1841 */
1842static void end_bio_extent_preparewrite(struct bio *bio, int err)
1843{
1844 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1845 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1846 struct extent_io_tree *tree;
1847 u64 start;
1848 u64 end;
1849
1850 do {
1851 struct page *page = bvec->bv_page;
1852 tree = &BTRFS_I(page->mapping->host)->io_tree;
1853
1854 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1855 bvec->bv_offset;
1856 end = start + bvec->bv_len - 1;
1857
1858 if (--bvec >= bio->bi_io_vec)
1859 prefetchw(&bvec->bv_page->flags);
1860
1861 if (uptodate) {
1862 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1863 } else {
1864 ClearPageUptodate(page);
1865 SetPageError(page);
1866 }
1867
1868 unlock_extent(tree, start, end, GFP_ATOMIC);
1869
1870 } while (bvec >= bio->bi_io_vec);
1871
1872 bio_put(bio);
1873}
1874
1875static struct bio *
1876extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1877 gfp_t gfp_flags)
1878{ 1821{
1879 struct bio *bio; 1822 struct bio *bio;
1880 1823
@@ -1901,17 +1844,15 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1901 struct page *page = bvec->bv_page; 1844 struct page *page = bvec->bv_page;
1902 struct extent_io_tree *tree = bio->bi_private; 1845 struct extent_io_tree *tree = bio->bi_private;
1903 u64 start; 1846 u64 start;
1904 u64 end;
1905 1847
1906 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1848 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1907 end = start + bvec->bv_len - 1;
1908 1849
1909 bio->bi_private = NULL; 1850 bio->bi_private = NULL;
1910 1851
1911 bio_get(bio); 1852 bio_get(bio);
1912 1853
1913 if (tree->ops && tree->ops->submit_bio_hook) 1854 if (tree->ops && tree->ops->submit_bio_hook)
1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1855 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1915 mirror_num, bio_flags, start); 1856 mirror_num, bio_flags, start);
1916 else 1857 else
1917 submit_bio(rw, bio); 1858 submit_bio(rw, bio);
@@ -1965,7 +1906,9 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1965 else 1906 else
1966 nr = bio_get_nr_vecs(bdev); 1907 nr = bio_get_nr_vecs(bdev);
1967 1908
1968 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1909 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1910 if (!bio)
1911 return -ENOMEM;
1969 1912
1970 bio_add_page(bio, page, page_size, offset); 1913 bio_add_page(bio, page, page_size, offset);
1971 bio->bi_end_io = end_io_func; 1914 bio->bi_end_io = end_io_func;
@@ -1990,6 +1933,7 @@ void set_page_extent_mapped(struct page *page)
1990 1933
1991static void set_page_extent_head(struct page *page, unsigned long len) 1934static void set_page_extent_head(struct page *page, unsigned long len)
1992{ 1935{
1936 WARN_ON(!PagePrivate(page));
1993 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1937 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1994} 1938}
1995 1939
@@ -2019,7 +1963,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2019 struct btrfs_ordered_extent *ordered; 1963 struct btrfs_ordered_extent *ordered;
2020 int ret; 1964 int ret;
2021 int nr = 0; 1965 int nr = 0;
2022 size_t page_offset = 0; 1966 size_t pg_offset = 0;
2023 size_t iosize; 1967 size_t iosize;
2024 size_t disk_io_size; 1968 size_t disk_io_size;
2025 size_t blocksize = inode->i_sb->s_blocksize; 1969 size_t blocksize = inode->i_sb->s_blocksize;
@@ -2027,6 +1971,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2027 1971
2028 set_page_extent_mapped(page); 1972 set_page_extent_mapped(page);
2029 1973
1974 if (!PageUptodate(page)) {
1975 if (cleancache_get_page(page) == 0) {
1976 BUG_ON(blocksize != PAGE_SIZE);
1977 goto out;
1978 }
1979 }
1980
2030 end = page_end; 1981 end = page_end;
2031 while (1) { 1982 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS); 1983 lock_extent(tree, start, end, GFP_NOFS);
@@ -2053,19 +2004,22 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2053 while (cur <= end) { 2004 while (cur <= end) {
2054 if (cur >= last_byte) { 2005 if (cur >= last_byte) {
2055 char *userpage; 2006 char *userpage;
2056 iosize = PAGE_CACHE_SIZE - page_offset; 2007 struct extent_state *cached = NULL;
2008
2009 iosize = PAGE_CACHE_SIZE - pg_offset;
2057 userpage = kmap_atomic(page, KM_USER0); 2010 userpage = kmap_atomic(page, KM_USER0);
2058 memset(userpage + page_offset, 0, iosize); 2011 memset(userpage + pg_offset, 0, iosize);
2059 flush_dcache_page(page); 2012 flush_dcache_page(page);
2060 kunmap_atomic(userpage, KM_USER0); 2013 kunmap_atomic(userpage, KM_USER0);
2061 set_extent_uptodate(tree, cur, cur + iosize - 1, 2014 set_extent_uptodate(tree, cur, cur + iosize - 1,
2062 GFP_NOFS); 2015 &cached, GFP_NOFS);
2063 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2016 unlock_extent_cached(tree, cur, cur + iosize - 1,
2017 &cached, GFP_NOFS);
2064 break; 2018 break;
2065 } 2019 }
2066 em = get_extent(inode, page, page_offset, cur, 2020 em = get_extent(inode, page, pg_offset, cur,
2067 end - cur + 1, 0); 2021 end - cur + 1, 0);
2068 if (IS_ERR(em) || !em) { 2022 if (IS_ERR_OR_NULL(em)) {
2069 SetPageError(page); 2023 SetPageError(page);
2070 unlock_extent(tree, cur, end, GFP_NOFS); 2024 unlock_extent(tree, cur, end, GFP_NOFS);
2071 break; 2025 break;
@@ -2074,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2074 BUG_ON(extent_map_end(em) <= cur); 2028 BUG_ON(extent_map_end(em) <= cur);
2075 BUG_ON(end < cur); 2029 BUG_ON(end < cur);
2076 2030
2077 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2078 this_bio_flag = EXTENT_BIO_COMPRESSED; 2032 this_bio_flag = EXTENT_BIO_COMPRESSED;
2033 extent_set_compress_type(&this_bio_flag,
2034 em->compress_type);
2035 }
2079 2036
2080 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2037 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2081 cur_end = min(extent_map_end(em) - 1, end); 2038 cur_end = min(extent_map_end(em) - 1, end);
@@ -2097,16 +2054,19 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2097 /* we've found a hole, just zero and go on */ 2054 /* we've found a hole, just zero and go on */
2098 if (block_start == EXTENT_MAP_HOLE) { 2055 if (block_start == EXTENT_MAP_HOLE) {
2099 char *userpage; 2056 char *userpage;
2057 struct extent_state *cached = NULL;
2058
2100 userpage = kmap_atomic(page, KM_USER0); 2059 userpage = kmap_atomic(page, KM_USER0);
2101 memset(userpage + page_offset, 0, iosize); 2060 memset(userpage + pg_offset, 0, iosize);
2102 flush_dcache_page(page); 2061 flush_dcache_page(page);
2103 kunmap_atomic(userpage, KM_USER0); 2062 kunmap_atomic(userpage, KM_USER0);
2104 2063
2105 set_extent_uptodate(tree, cur, cur + iosize - 1, 2064 set_extent_uptodate(tree, cur, cur + iosize - 1,
2106 GFP_NOFS); 2065 &cached, GFP_NOFS);
2107 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2066 unlock_extent_cached(tree, cur, cur + iosize - 1,
2067 &cached, GFP_NOFS);
2108 cur = cur + iosize; 2068 cur = cur + iosize;
2109 page_offset += iosize; 2069 pg_offset += iosize;
2110 continue; 2070 continue;
2111 } 2071 }
2112 /* the get_extent function already copied into the page */ 2072 /* the get_extent function already copied into the page */
@@ -2115,7 +2075,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2115 check_page_uptodate(tree, page); 2075 check_page_uptodate(tree, page);
2116 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2076 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2117 cur = cur + iosize; 2077 cur = cur + iosize;
2118 page_offset += iosize; 2078 pg_offset += iosize;
2119 continue; 2079 continue;
2120 } 2080 }
2121 /* we have an inline extent but it didn't get marked up 2081 /* we have an inline extent but it didn't get marked up
@@ -2125,7 +2085,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2125 SetPageError(page); 2085 SetPageError(page);
2126 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2086 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2127 cur = cur + iosize; 2087 cur = cur + iosize;
2128 page_offset += iosize; 2088 pg_offset += iosize;
2129 continue; 2089 continue;
2130 } 2090 }
2131 2091
@@ -2138,7 +2098,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2138 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2098 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2139 pnr -= page->index; 2099 pnr -= page->index;
2140 ret = submit_extent_page(READ, tree, page, 2100 ret = submit_extent_page(READ, tree, page,
2141 sector, disk_io_size, page_offset, 2101 sector, disk_io_size, pg_offset,
2142 bdev, bio, pnr, 2102 bdev, bio, pnr,
2143 end_bio_extent_readpage, mirror_num, 2103 end_bio_extent_readpage, mirror_num,
2144 *bio_flags, 2104 *bio_flags,
@@ -2149,8 +2109,9 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2149 if (ret) 2109 if (ret)
2150 SetPageError(page); 2110 SetPageError(page);
2151 cur = cur + iosize; 2111 cur = cur + iosize;
2152 page_offset += iosize; 2112 pg_offset += iosize;
2153 } 2113 }
2114out:
2154 if (!nr) { 2115 if (!nr) {
2155 if (!PageError(page)) 2116 if (!PageError(page))
2156 SetPageUptodate(page); 2117 SetPageUptodate(page);
@@ -2169,7 +2130,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2169 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2130 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2170 &bio_flags); 2131 &bio_flags);
2171 if (bio) 2132 if (bio)
2172 submit_one_bio(READ, bio, 0, bio_flags); 2133 ret = submit_one_bio(READ, bio, 0, bio_flags);
2173 return ret; 2134 return ret;
2174} 2135}
2175 2136
@@ -2204,7 +2165,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2204 u64 last_byte = i_size_read(inode); 2165 u64 last_byte = i_size_read(inode);
2205 u64 block_start; 2166 u64 block_start;
2206 u64 iosize; 2167 u64 iosize;
2207 u64 unlock_start;
2208 sector_t sector; 2168 sector_t sector;
2209 struct extent_state *cached_state = NULL; 2169 struct extent_state *cached_state = NULL;
2210 struct extent_map *em; 2170 struct extent_map *em;
@@ -2223,10 +2183,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2223 unsigned long nr_written = 0; 2183 unsigned long nr_written = 0;
2224 2184
2225 if (wbc->sync_mode == WB_SYNC_ALL) 2185 if (wbc->sync_mode == WB_SYNC_ALL)
2226 write_flags = WRITE_SYNC_PLUG; 2186 write_flags = WRITE_SYNC;
2227 else 2187 else
2228 write_flags = WRITE; 2188 write_flags = WRITE;
2229 2189
2190 trace___extent_writepage(page, inode, wbc);
2191
2230 WARN_ON(!PageLocked(page)); 2192 WARN_ON(!PageLocked(page));
2231 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2193 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2232 if (page->index > end_index || 2194 if (page->index > end_index ||
@@ -2329,7 +2291,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2329 if (tree->ops && tree->ops->writepage_end_io_hook) 2291 if (tree->ops && tree->ops->writepage_end_io_hook)
2330 tree->ops->writepage_end_io_hook(page, start, 2292 tree->ops->writepage_end_io_hook(page, start,
2331 page_end, NULL, 1); 2293 page_end, NULL, 1);
2332 unlock_start = page_end + 1;
2333 goto done; 2294 goto done;
2334 } 2295 }
2335 2296
@@ -2340,12 +2301,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2340 if (tree->ops && tree->ops->writepage_end_io_hook) 2301 if (tree->ops && tree->ops->writepage_end_io_hook)
2341 tree->ops->writepage_end_io_hook(page, cur, 2302 tree->ops->writepage_end_io_hook(page, cur,
2342 page_end, NULL, 1); 2303 page_end, NULL, 1);
2343 unlock_start = page_end + 1;
2344 break; 2304 break;
2345 } 2305 }
2346 em = epd->get_extent(inode, page, pg_offset, cur, 2306 em = epd->get_extent(inode, page, pg_offset, cur,
2347 end - cur + 1, 1); 2307 end - cur + 1, 1);
2348 if (IS_ERR(em) || !em) { 2308 if (IS_ERR_OR_NULL(em)) {
2349 SetPageError(page); 2309 SetPageError(page);
2350 break; 2310 break;
2351 } 2311 }
@@ -2387,7 +2347,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2387 2347
2388 cur += iosize; 2348 cur += iosize;
2389 pg_offset += iosize; 2349 pg_offset += iosize;
2390 unlock_start = cur;
2391 continue; 2350 continue;
2392 } 2351 }
2393 /* leave this out until we have a page_mkwrite call */ 2352 /* leave this out until we have a page_mkwrite call */
@@ -2473,7 +2432,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2473 pgoff_t index; 2432 pgoff_t index;
2474 pgoff_t end; /* Inclusive */ 2433 pgoff_t end; /* Inclusive */
2475 int scanned = 0; 2434 int scanned = 0;
2476 int range_whole = 0;
2477 2435
2478 pagevec_init(&pvec, 0); 2436 pagevec_init(&pvec, 0);
2479 if (wbc->range_cyclic) { 2437 if (wbc->range_cyclic) {
@@ -2482,8 +2440,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2482 } else { 2440 } else {
2483 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2441 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2484 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2442 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2485 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2486 range_whole = 1;
2487 scanned = 1; 2443 scanned = 1;
2488 } 2444 }
2489retry: 2445retry:
@@ -2689,7 +2645,7 @@ int extent_readpages(struct extent_io_tree *tree,
2689 prefetchw(&page->flags); 2645 prefetchw(&page->flags);
2690 list_del(&page->lru); 2646 list_del(&page->lru);
2691 if (!add_to_page_cache_lru(page, mapping, 2647 if (!add_to_page_cache_lru(page, mapping,
2692 page->index, GFP_KERNEL)) { 2648 page->index, GFP_NOFS)) {
2693 __extent_read_full_page(tree, page, get_extent, 2649 __extent_read_full_page(tree, page, get_extent,
2694 &bio, 0, &bio_flags); 2650 &bio, 0, &bio_flags);
2695 } 2651 }
@@ -2728,123 +2684,6 @@ int extent_invalidatepage(struct extent_io_tree *tree,
2728} 2684}
2729 2685
2730/* 2686/*
2731 * simple commit_write call, set_range_dirty is used to mark both
2732 * the pages and the extent records as dirty
2733 */
2734int extent_commit_write(struct extent_io_tree *tree,
2735 struct inode *inode, struct page *page,
2736 unsigned from, unsigned to)
2737{
2738 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2739
2740 set_page_extent_mapped(page);
2741 set_page_dirty(page);
2742
2743 if (pos > inode->i_size) {
2744 i_size_write(inode, pos);
2745 mark_inode_dirty(inode);
2746 }
2747 return 0;
2748}
2749
2750int extent_prepare_write(struct extent_io_tree *tree,
2751 struct inode *inode, struct page *page,
2752 unsigned from, unsigned to, get_extent_t *get_extent)
2753{
2754 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2755 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2756 u64 block_start;
2757 u64 orig_block_start;
2758 u64 block_end;
2759 u64 cur_end;
2760 struct extent_map *em;
2761 unsigned blocksize = 1 << inode->i_blkbits;
2762 size_t page_offset = 0;
2763 size_t block_off_start;
2764 size_t block_off_end;
2765 int err = 0;
2766 int iocount = 0;
2767 int ret = 0;
2768 int isnew;
2769
2770 set_page_extent_mapped(page);
2771
2772 block_start = (page_start + from) & ~((u64)blocksize - 1);
2773 block_end = (page_start + to - 1) | (blocksize - 1);
2774 orig_block_start = block_start;
2775
2776 lock_extent(tree, page_start, page_end, GFP_NOFS);
2777 while (block_start <= block_end) {
2778 em = get_extent(inode, page, page_offset, block_start,
2779 block_end - block_start + 1, 1);
2780 if (IS_ERR(em) || !em)
2781 goto err;
2782
2783 cur_end = min(block_end, extent_map_end(em) - 1);
2784 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2785 block_off_end = block_off_start + blocksize;
2786 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2787
2788 if (!PageUptodate(page) && isnew &&
2789 (block_off_end > to || block_off_start < from)) {
2790 void *kaddr;
2791
2792 kaddr = kmap_atomic(page, KM_USER0);
2793 if (block_off_end > to)
2794 memset(kaddr + to, 0, block_off_end - to);
2795 if (block_off_start < from)
2796 memset(kaddr + block_off_start, 0,
2797 from - block_off_start);
2798 flush_dcache_page(page);
2799 kunmap_atomic(kaddr, KM_USER0);
2800 }
2801 if ((em->block_start != EXTENT_MAP_HOLE &&
2802 em->block_start != EXTENT_MAP_INLINE) &&
2803 !isnew && !PageUptodate(page) &&
2804 (block_off_end > to || block_off_start < from) &&
2805 !test_range_bit(tree, block_start, cur_end,
2806 EXTENT_UPTODATE, 1, NULL)) {
2807 u64 sector;
2808 u64 extent_offset = block_start - em->start;
2809 size_t iosize;
2810 sector = (em->block_start + extent_offset) >> 9;
2811 iosize = (cur_end - block_start + blocksize) &
2812 ~((u64)blocksize - 1);
2813 /*
2814 * we've already got the extent locked, but we
2815 * need to split the state such that our end_bio
2816 * handler can clear the lock.
2817 */
2818 set_extent_bit(tree, block_start,
2819 block_start + iosize - 1,
2820 EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
2821 ret = submit_extent_page(READ, tree, page,
2822 sector, iosize, page_offset, em->bdev,
2823 NULL, 1,
2824 end_bio_extent_preparewrite, 0,
2825 0, 0);
2826 iocount++;
2827 block_start = block_start + iosize;
2828 } else {
2829 set_extent_uptodate(tree, block_start, cur_end,
2830 GFP_NOFS);
2831 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2832 block_start = cur_end + 1;
2833 }
2834 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2835 free_extent_map(em);
2836 }
2837 if (iocount) {
2838 wait_extent_bit(tree, orig_block_start,
2839 block_end, EXTENT_LOCKED);
2840 }
2841 check_page_uptodate(tree, page);
2842err:
2843 /* FIXME, zero out newly allocated blocks on error */
2844 return err;
2845}
2846
2847/*
2848 * a helper for releasepage, this tests for areas of the page that 2687 * a helper for releasepage, this tests for areas of the page that
2849 * are locked or under IO and drops the related state bits if it is safe 2688 * are locked or under IO and drops the related state bits if it is safe
2850 * to drop the page. 2689 * to drop the page.
@@ -2867,9 +2706,17 @@ int try_release_extent_state(struct extent_map_tree *map,
2867 * at this point we can safely clear everything except the 2706 * at this point we can safely clear everything except the
2868 * locked bit and the nodatasum bit 2707 * locked bit and the nodatasum bit
2869 */ 2708 */
2870 clear_extent_bit(tree, start, end, 2709 ret = clear_extent_bit(tree, start, end,
2871 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2710 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2872 0, 0, NULL, mask); 2711 0, 0, NULL, mask);
2712
2713 /* if clear_extent_bit failed with -ENOMEM,
2714 * we can't allow the release to continue.
2715 */
2716 if (ret < 0)
2717 ret = 0;
2718 else
2719 ret = 1;
2873 } 2720 }
2874 return ret; 2721 return ret;
2875} 2722}
@@ -2894,7 +2741,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
2894 len = end - start + 1; 2741 len = end - start + 1;
2895 write_lock(&map->lock); 2742 write_lock(&map->lock);
2896 em = lookup_extent_mapping(map, start, len); 2743 em = lookup_extent_mapping(map, start, len);
2897 if (!em || IS_ERR(em)) { 2744 if (IS_ERR_OR_NULL(em)) {
2898 write_unlock(&map->lock); 2745 write_unlock(&map->lock);
2899 break; 2746 break;
2900 } 2747 }
@@ -2922,76 +2769,169 @@ int try_release_extent_mapping(struct extent_map_tree *map,
2922 return try_release_extent_state(map, tree, page, mask); 2769 return try_release_extent_state(map, tree, page, mask);
2923} 2770}
2924 2771
2925sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 2772/*
2926 get_extent_t *get_extent) 2773 * helper function for fiemap, which doesn't want to see any holes.
2774 * This maps until we find something past 'last'
2775 */
2776static struct extent_map *get_extent_skip_holes(struct inode *inode,
2777 u64 offset,
2778 u64 last,
2779 get_extent_t *get_extent)
2927{ 2780{
2928 struct inode *inode = mapping->host; 2781 u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2929 struct extent_state *cached_state = NULL;
2930 u64 start = iblock << inode->i_blkbits;
2931 sector_t sector = 0;
2932 size_t blksize = (1 << inode->i_blkbits);
2933 struct extent_map *em; 2782 struct extent_map *em;
2783 u64 len;
2934 2784
2935 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2785 if (offset >= last)
2936 0, &cached_state, GFP_NOFS); 2786 return NULL;
2937 em = get_extent(inode, NULL, 0, start, blksize, 0);
2938 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
2939 start + blksize - 1, &cached_state, GFP_NOFS);
2940 if (!em || IS_ERR(em))
2941 return 0;
2942 2787
2943 if (em->block_start > EXTENT_MAP_LAST_BYTE) 2788 while (1) {
2944 goto out; 2789 len = last - offset;
2790 if (len == 0)
2791 break;
2792 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2793 em = get_extent(inode, NULL, 0, offset, len, 0);
2794 if (IS_ERR_OR_NULL(em))
2795 return em;
2945 2796
2946 sector = (em->block_start + start - em->start) >> inode->i_blkbits; 2797 /* if this isn't a hole return it */
2947out: 2798 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2948 free_extent_map(em); 2799 em->block_start != EXTENT_MAP_HOLE) {
2949 return sector; 2800 return em;
2801 }
2802
2803 /* this is a hole, advance to the next extent */
2804 offset = extent_map_end(em);
2805 free_extent_map(em);
2806 if (offset >= last)
2807 break;
2808 }
2809 return NULL;
2950} 2810}
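
The rounding in get_extent_skip_holes() relies on sectorsize being a power
of two. A minimal standalone sketch of that idiom, with hypothetical names:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* round len up to the next multiple of sectorsize;
 * only valid when sectorsize is a power of two */
static uint64_t round_up_pow2(uint64_t len, uint64_t sectorsize)
{
        assert((sectorsize & (sectorsize - 1)) == 0);
        return (len + sectorsize - 1) & ~(sectorsize - 1);
}

int main(void)
{
        /* 4KiB sectors: 1 rounds up to 4096, 4096 stays put */
        printf("%llu\n", (unsigned long long)round_up_pow2(1, 4096));
        printf("%llu\n", (unsigned long long)round_up_pow2(4096, 4096));
        return 0;
}
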
2951 2811
2952int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2812int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2953 __u64 start, __u64 len, get_extent_t *get_extent) 2813 __u64 start, __u64 len, get_extent_t *get_extent)
2954{ 2814{
2955 int ret; 2815 int ret = 0;
2956 u64 off = start; 2816 u64 off = start;
2957 u64 max = start + len; 2817 u64 max = start + len;
2958 u32 flags = 0; 2818 u32 flags = 0;
2819 u32 found_type;
2820 u64 last;
2821 u64 last_for_get_extent = 0;
2959 u64 disko = 0; 2822 u64 disko = 0;
2823 u64 isize = i_size_read(inode);
2824 struct btrfs_key found_key;
2960 struct extent_map *em = NULL; 2825 struct extent_map *em = NULL;
2961 struct extent_state *cached_state = NULL; 2826 struct extent_state *cached_state = NULL;
2827 struct btrfs_path *path;
2828 struct btrfs_file_extent_item *item;
2962 int end = 0; 2829 int end = 0;
2963 u64 em_start = 0, em_len = 0; 2830 u64 em_start = 0;
2831 u64 em_len = 0;
2832 u64 em_end = 0;
2964 unsigned long emflags; 2833 unsigned long emflags;
2965 ret = 0;
2966 2834
2967 if (len == 0) 2835 if (len == 0)
2968 return -EINVAL; 2836 return -EINVAL;
2969 2837
2838 path = btrfs_alloc_path();
2839 if (!path)
2840 return -ENOMEM;
2841 path->leave_spinning = 1;
2842
2843 /*
2844 * lookup the last file extent. We're not using i_size here
2845 * because there might be preallocation past i_size
2846 */
2847 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2848 path, btrfs_ino(inode), -1, 0);
2849 if (ret < 0) {
2850 btrfs_free_path(path);
2851 return ret;
2852 }
2853 WARN_ON(!ret);
2854 path->slots[0]--;
2855 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2856 struct btrfs_file_extent_item);
2857 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2858 found_type = btrfs_key_type(&found_key);
2859
2860 /* No extents, but there might be delalloc bits */
2861 if (found_key.objectid != btrfs_ino(inode) ||
2862 found_type != BTRFS_EXTENT_DATA_KEY) {
2863 /* have to trust i_size as the end */
2864 last = (u64)-1;
2865 last_for_get_extent = isize;
2866 } else {
2867 /*
2868 * remember the start of the last extent. There are a
2869 * bunch of different factors that go into the length of the
2870 * extent, so it's much less complex to remember where it started
2871 */
2872 last = found_key.offset;
2873 last_for_get_extent = last + 1;
2874 }
2875 btrfs_free_path(path);
2876
2877 /*
2878 * we might have some extents allocated but more delalloc past those
2879 * extents. so, we trust isize unless the start of the last extent is
2880 * beyond isize
2881 */
2882 if (last < isize) {
2883 last = (u64)-1;
2884 last_for_get_extent = isize;
2885 }
2886
2970 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 2887 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2971 &cached_state, GFP_NOFS); 2888 &cached_state, GFP_NOFS);
2972 em = get_extent(inode, NULL, 0, off, max - off, 0); 2889
2890 em = get_extent_skip_holes(inode, off, last_for_get_extent,
2891 get_extent);
2973 if (!em) 2892 if (!em)
2974 goto out; 2893 goto out;
2975 if (IS_ERR(em)) { 2894 if (IS_ERR(em)) {
2976 ret = PTR_ERR(em); 2895 ret = PTR_ERR(em);
2977 goto out; 2896 goto out;
2978 } 2897 }
2898
2979 while (!end) { 2899 while (!end) {
2980 off = em->start + em->len; 2900 u64 offset_in_extent;
2981 if (off >= max) 2901
2982 end = 1; 2902 /* break if the extent we found is outside the range */
2903 if (em->start >= max || extent_map_end(em) < off)
2904 break;
2983 2905
2984 em_start = em->start; 2906 /*
2985 em_len = em->len; 2907 * get_extent may return an extent that starts before our
2908 * requested range. We have to make sure the ranges
2909 * we return to fiemap always move forward and don't
2910 * overlap, so adjust the offsets here
2911 */
2912 em_start = max(em->start, off);
2986 2913
2914 /*
2915 * record the offset from the start of the extent
2916 * for adjusting the disk offset below
2917 */
2918 offset_in_extent = em_start - em->start;
2919 em_end = extent_map_end(em);
2920 em_len = em_end - em_start;
2921 emflags = em->flags;
2987 disko = 0; 2922 disko = 0;
2988 flags = 0; 2923 flags = 0;
2989 2924
2925 /*
2926 * bump off for our next call to get_extent
2927 */
2928 off = extent_map_end(em);
2929 if (off >= max)
2930 end = 1;
2931
2990 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2932 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2991 end = 1; 2933 end = 1;
2992 flags |= FIEMAP_EXTENT_LAST; 2934 flags |= FIEMAP_EXTENT_LAST;
2993 } else if (em->block_start == EXTENT_MAP_HOLE) {
2994 flags |= FIEMAP_EXTENT_UNWRITTEN;
2995 } else if (em->block_start == EXTENT_MAP_INLINE) { 2935 } else if (em->block_start == EXTENT_MAP_INLINE) {
2996 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2936 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2997 FIEMAP_EXTENT_NOT_ALIGNED); 2937 FIEMAP_EXTENT_NOT_ALIGNED);
@@ -2999,32 +2939,32 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2999 flags |= (FIEMAP_EXTENT_DELALLOC | 2939 flags |= (FIEMAP_EXTENT_DELALLOC |
3000 FIEMAP_EXTENT_UNKNOWN); 2940 FIEMAP_EXTENT_UNKNOWN);
3001 } else { 2941 } else {
3002 disko = em->block_start; 2942 disko = em->block_start + offset_in_extent;
3003 } 2943 }
3004 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2944 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3005 flags |= FIEMAP_EXTENT_ENCODED; 2945 flags |= FIEMAP_EXTENT_ENCODED;
3006 2946
3007 emflags = em->flags;
3008 free_extent_map(em); 2947 free_extent_map(em);
3009 em = NULL; 2948 em = NULL;
2949 if ((em_start >= last) || em_len == (u64)-1 ||
2950 (last == (u64)-1 && isize <= em_end)) {
2951 flags |= FIEMAP_EXTENT_LAST;
2952 end = 1;
2953 }
3010 2954
3011 if (!end) { 2955 /* now scan forward to see if this is really the last extent. */
3012 em = get_extent(inode, NULL, 0, off, max - off, 0); 2956 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3013 if (!em) 2957 get_extent);
3014 goto out; 2958 if (IS_ERR(em)) {
3015 if (IS_ERR(em)) { 2959 ret = PTR_ERR(em);
3016 ret = PTR_ERR(em); 2960 goto out;
3017 goto out;
3018 }
3019 emflags = em->flags;
3020 } 2961 }
3021 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { 2962 if (!em) {
3022 flags |= FIEMAP_EXTENT_LAST; 2963 flags |= FIEMAP_EXTENT_LAST;
3023 end = 1; 2964 end = 1;
3024 } 2965 }
3025
3026 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 2966 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3027 em_len, flags); 2967 em_len, flags);
3028 if (ret) 2968 if (ret)
3029 goto out_free; 2969 goto out_free;
3030 } 2970 }
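
The adjustment above keeps the ranges handed to fiemap moving strictly
forward even when get_extent() returns an extent that starts before the
requested offset. An illustrative standalone restatement (the struct and
field names are simplified stand-ins, not the kernel's):

#include <stdint.h>
#include <stdio.h>

struct map { uint64_t start, end, block_start; };  /* end is exclusive */

/* clamp a mapping that may begin before the requested offset */
static void clamp_for_fiemap(const struct map *em, uint64_t off,
                             uint64_t *em_start, uint64_t *em_len,
                             uint64_t *disko)
{
        uint64_t offset_in_extent;

        *em_start = em->start > off ? em->start : off;
        offset_in_extent = *em_start - em->start;
        *em_len = em->end - *em_start;
        /* shift the disk offset by the same amount we clamped */
        *disko = em->block_start + offset_in_extent;
}

int main(void)
{
        struct map em = { .start = 0, .end = 8192, .block_start = 1 << 20 };
        uint64_t s, l, d;

        clamp_for_fiemap(&em, 4096, &s, &l, &d);
        printf("start=%llu len=%llu disk=%llu\n", (unsigned long long)s,
               (unsigned long long)l, (unsigned long long)d);
        return 0;
}
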
@@ -3078,6 +3018,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3078#endif 3018#endif
3079 3019
3080 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 3020 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3021 if (eb == NULL)
3022 return NULL;
3081 eb->start = start; 3023 eb->start = start;
3082 eb->len = len; 3024 eb->len = len;
3083 spin_lock_init(&eb->lock); 3025 spin_lock_init(&eb->lock);
@@ -3104,10 +3046,42 @@ static void __free_extent_buffer(struct extent_buffer *eb)
3104 kmem_cache_free(extent_buffer_cache, eb); 3046 kmem_cache_free(extent_buffer_cache, eb);
3105} 3047}
3106 3048
3049/*
3050 * Helper for releasing extent buffer pages.
3051 */
3052static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3053 unsigned long start_idx)
3054{
3055 unsigned long index;
3056 struct page *page;
3057
3058 if (!eb->first_page)
3059 return;
3060
3061 index = num_extent_pages(eb->start, eb->len);
3062 if (start_idx >= index)
3063 return;
3064
3065 do {
3066 index--;
3067 page = extent_buffer_page(eb, index);
3068 if (page)
3069 page_cache_release(page);
3070 } while (index != start_idx);
3071}
3072
3073/*
3074 * Helper for releasing the extent buffer.
3075 */
3076static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3077{
3078 btrfs_release_extent_buffer_page(eb, 0);
3079 __free_extent_buffer(eb);
3080}
3081
3107struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3082struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3108 u64 start, unsigned long len, 3083 u64 start, unsigned long len,
3109 struct page *page0, 3084 struct page *page0)
3110 gfp_t mask)
3111{ 3085{
3112 unsigned long num_pages = num_extent_pages(start, len); 3086 unsigned long num_pages = num_extent_pages(start, len);
3113 unsigned long i; 3087 unsigned long i;
@@ -3117,18 +3091,18 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3117 struct page *p; 3091 struct page *p;
3118 struct address_space *mapping = tree->mapping; 3092 struct address_space *mapping = tree->mapping;
3119 int uptodate = 1; 3093 int uptodate = 1;
3094 int ret;
3120 3095
3121 spin_lock(&tree->buffer_lock); 3096 rcu_read_lock();
3122 eb = buffer_search(tree, start); 3097 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3123 if (eb) { 3098 if (eb && atomic_inc_not_zero(&eb->refs)) {
3124 atomic_inc(&eb->refs); 3099 rcu_read_unlock();
3125 spin_unlock(&tree->buffer_lock);
3126 mark_page_accessed(eb->first_page); 3100 mark_page_accessed(eb->first_page);
3127 return eb; 3101 return eb;
3128 } 3102 }
3129 spin_unlock(&tree->buffer_lock); 3103 rcu_read_unlock();
3130 3104
3131 eb = __alloc_extent_buffer(tree, start, len, mask); 3105 eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
3132 if (!eb) 3106 if (!eb)
3133 return NULL; 3107 return NULL;
3134 3108
@@ -3145,7 +3119,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3145 i = 0; 3119 i = 0;
3146 } 3120 }
3147 for (; i < num_pages; i++, index++) { 3121 for (; i < num_pages; i++, index++) {
3148 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); 3122 p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM);
3149 if (!p) { 3123 if (!p) {
3150 WARN_ON(1); 3124 WARN_ON(1);
3151 goto free_eb; 3125 goto free_eb;
@@ -3160,50 +3134,77 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3160 } 3134 }
3161 if (!PageUptodate(p)) 3135 if (!PageUptodate(p))
3162 uptodate = 0; 3136 uptodate = 0;
3163 unlock_page(p); 3137
3138 /*
3139 * see below about how we avoid a nasty race with release page
3140 * and why we unlock later
3141 */
3142 if (i != 0)
3143 unlock_page(p);
3164 } 3144 }
3165 if (uptodate) 3145 if (uptodate)
3166 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3146 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3167 3147
3148 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3149 if (ret)
3150 goto free_eb;
3151
3168 spin_lock(&tree->buffer_lock); 3152 spin_lock(&tree->buffer_lock);
3169 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3153 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3170 if (exists) { 3154 if (ret == -EEXIST) {
3155 exists = radix_tree_lookup(&tree->buffer,
3156 start >> PAGE_CACHE_SHIFT);
3171 /* add one reference for the caller */ 3157 /* add one reference for the caller */
3172 atomic_inc(&exists->refs); 3158 atomic_inc(&exists->refs);
3173 spin_unlock(&tree->buffer_lock); 3159 spin_unlock(&tree->buffer_lock);
3160 radix_tree_preload_end();
3174 goto free_eb; 3161 goto free_eb;
3175 } 3162 }
3176 /* add one reference for the tree */ 3163 /* add one reference for the tree */
3177 atomic_inc(&eb->refs); 3164 atomic_inc(&eb->refs);
3178 spin_unlock(&tree->buffer_lock); 3165 spin_unlock(&tree->buffer_lock);
3166 radix_tree_preload_end();
3167
3168 /*
3169 * there is a race where release page may have
3170 * tried to find this extent buffer in the radix
3171 * but failed. It will tell the VM it is safe to
3172 * reclaim the page, and it will clear the page private bit.
3173 * We must make sure to set the page private bit properly
3174 * after the extent buffer is in the radix tree so
3175 * it doesn't get lost
3176 */
3177 set_page_extent_mapped(eb->first_page);
3178 set_page_extent_head(eb->first_page, eb->len);
3179 if (!page0)
3180 unlock_page(eb->first_page);
3179 return eb; 3181 return eb;
3180 3182
3181free_eb: 3183free_eb:
3184 if (eb->first_page && !page0)
3185 unlock_page(eb->first_page);
3186
3182 if (!atomic_dec_and_test(&eb->refs)) 3187 if (!atomic_dec_and_test(&eb->refs))
3183 return exists; 3188 return exists;
3184 for (index = 1; index < i; index++) 3189 btrfs_release_extent_buffer(eb);
3185 page_cache_release(extent_buffer_page(eb, index));
3186 page_cache_release(extent_buffer_page(eb, 0));
3187 __free_extent_buffer(eb);
3188 return exists; 3190 return exists;
3189} 3191}
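
The insertion path above follows the usual radix-tree preload protocol:
preload outside the spinlock so the insert under it cannot fail on
allocation, and fall back to the already-inserted buffer on -EEXIST. A
condensed kernel-style fragment of just that protocol (error paths
elided, kernel context assumed, not a drop-in):

        /* preload may sleep, so do it before taking the spinlock */
        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
        if (ret)
                goto free_eb;

        spin_lock(&tree->buffer_lock);
        ret = radix_tree_insert(&tree->buffer, index, eb);
        if (ret == -EEXIST) {
                /* lost the race: reference the winner instead */
                exists = radix_tree_lookup(&tree->buffer, index);
                atomic_inc(&exists->refs);
        }
        spin_unlock(&tree->buffer_lock);
        radix_tree_preload_end();
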
3190 3192
3191struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 3193struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3192 u64 start, unsigned long len, 3194 u64 start, unsigned long len)
3193 gfp_t mask)
3194{ 3195{
3195 struct extent_buffer *eb; 3196 struct extent_buffer *eb;
3196 3197
3197 spin_lock(&tree->buffer_lock); 3198 rcu_read_lock();
3198 eb = buffer_search(tree, start); 3199 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3199 if (eb) 3200 if (eb && atomic_inc_not_zero(&eb->refs)) {
3200 atomic_inc(&eb->refs); 3201 rcu_read_unlock();
3201 spin_unlock(&tree->buffer_lock);
3202
3203 if (eb)
3204 mark_page_accessed(eb->first_page); 3202 mark_page_accessed(eb->first_page);
3203 return eb;
3204 }
3205 rcu_read_unlock();
3205 3206
3206 return eb; 3207 return NULL;
3207} 3208}
3208 3209
3209void free_extent_buffer(struct extent_buffer *eb) 3210void free_extent_buffer(struct extent_buffer *eb)
@@ -3232,10 +3233,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3232 continue; 3233 continue;
3233 3234
3234 lock_page(page); 3235 lock_page(page);
3236 WARN_ON(!PagePrivate(page));
3237
3238 set_page_extent_mapped(page);
3235 if (i == 0) 3239 if (i == 0)
3236 set_page_extent_head(page, eb->len); 3240 set_page_extent_head(page, eb->len);
3237 else
3238 set_page_private(page, EXTENT_PAGE_PRIVATE);
3239 3241
3240 clear_page_dirty_for_io(page); 3242 clear_page_dirty_for_io(page);
3241 spin_lock_irq(&page->mapping->tree_lock); 3243 spin_lock_irq(&page->mapping->tree_lock);
@@ -3250,13 +3252,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3250 return 0; 3252 return 0;
3251} 3253}
3252 3254
3253int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3254 struct extent_buffer *eb)
3255{
3256 return wait_on_extent_writeback(tree, eb->start,
3257 eb->start + eb->len - 1);
3258}
3259
3260int set_extent_buffer_dirty(struct extent_io_tree *tree, 3255int set_extent_buffer_dirty(struct extent_io_tree *tree,
3261 struct extent_buffer *eb) 3256 struct extent_buffer *eb)
3262{ 3257{
@@ -3302,7 +3297,7 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3302 num_pages = num_extent_pages(eb->start, eb->len); 3297 num_pages = num_extent_pages(eb->start, eb->len);
3303 3298
3304 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3299 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3305 GFP_NOFS); 3300 NULL, GFP_NOFS);
3306 for (i = 0; i < num_pages; i++) { 3301 for (i = 0; i < num_pages; i++) {
3307 page = extent_buffer_page(eb, i); 3302 page = extent_buffer_page(eb, i);
3308 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3303 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3425,6 +3420,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3425 3420
3426 for (i = start_i; i < num_pages; i++) { 3421 for (i = start_i; i < num_pages; i++) {
3427 page = extent_buffer_page(eb, i); 3422 page = extent_buffer_page(eb, i);
3423
3424 WARN_ON(!PagePrivate(page));
3425
3426 set_page_extent_mapped(page);
3427 if (i == 0)
3428 set_page_extent_head(page, eb->len);
3429
3428 if (inc_all_pages) 3430 if (inc_all_pages)
3429 page_cache_get(page); 3431 page_cache_get(page);
3430 if (!PageUptodate(page)) { 3432 if (!PageUptodate(page)) {
@@ -3530,6 +3532,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3530 "wanted %lu %lu\n", (unsigned long long)eb->start, 3532 "wanted %lu %lu\n", (unsigned long long)eb->start,
3531 eb->len, start, min_len); 3533 eb->len, start, min_len);
3532 WARN_ON(1); 3534 WARN_ON(1);
3535 return -EINVAL;
3533 } 3536 }
3534 3537
3535 p = extent_buffer_page(eb, i); 3538 p = extent_buffer_page(eb, i);
@@ -3722,6 +3725,12 @@ static void move_pages(struct page *dst_page, struct page *src_page,
3722 kunmap_atomic(dst_kaddr, KM_USER0); 3725 kunmap_atomic(dst_kaddr, KM_USER0);
3723} 3726}
3724 3727
3728static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
3729{
3730 unsigned long distance = (src > dst) ? src - dst : dst - src;
3731 return distance < len;
3732}
3733
3725static void copy_pages(struct page *dst_page, struct page *src_page, 3734static void copy_pages(struct page *dst_page, struct page *src_page,
3726 unsigned long dst_off, unsigned long src_off, 3735 unsigned long dst_off, unsigned long src_off,
3727 unsigned long len) 3736 unsigned long len)
@@ -3729,10 +3738,12 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
3729 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3738 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3730 char *src_kaddr; 3739 char *src_kaddr;
3731 3740
3732 if (dst_page != src_page) 3741 if (dst_page != src_page) {
3733 src_kaddr = kmap_atomic(src_page, KM_USER1); 3742 src_kaddr = kmap_atomic(src_page, KM_USER1);
3734 else 3743 } else {
3735 src_kaddr = dst_kaddr; 3744 src_kaddr = dst_kaddr;
3745 BUG_ON(areas_overlap(src_off, dst_off, len));
3746 }
3736 3747
3737 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3748 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3738 kunmap_atomic(dst_kaddr, KM_USER0); 3749 kunmap_atomic(dst_kaddr, KM_USER0);
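
areas_overlap() reduces the overlap test for two equal-length ranges to
|src - dst| < len. A standalone sketch of the check and the
memmove-vs-memcpy choice it guards:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* ranges [src, src+len) and [dst, dst+len) overlap exactly
 * when the distance between their starts is less than len */
static bool areas_overlap(unsigned long src, unsigned long dst,
                          unsigned long len)
{
        unsigned long distance = (src > dst) ? src - dst : dst - src;
        return distance < len;
}

int main(void)
{
        char buf[16] = "abcdefgh";

        /* an overlapping copy within one buffer needs memmove */
        if (areas_overlap(0, 2, 8))
                memmove(buf + 2, buf, 8);
        else
                memcpy(buf + 2, buf, 8);
        printf("%s\n", buf);    /* prints "ababcdefgh" */
        return 0;
}
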
@@ -3807,7 +3818,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3807 "len %lu len %lu\n", dst_offset, len, dst->len); 3818 "len %lu len %lu\n", dst_offset, len, dst->len);
3808 BUG_ON(1); 3819 BUG_ON(1);
3809 } 3820 }
3810 if (dst_offset < src_offset) { 3821 if (!areas_overlap(src_offset, dst_offset, len)) {
3811 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 3822 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3812 return; 3823 return;
3813 } 3824 }
@@ -3833,34 +3844,47 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3833 } 3844 }
3834} 3845}
3835 3846
3847static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
3848{
3849 struct extent_buffer *eb =
3850 container_of(head, struct extent_buffer, rcu_head);
3851
3852 btrfs_release_extent_buffer(eb);
3853}
3854
3836int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) 3855int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3837{ 3856{
3838 u64 start = page_offset(page); 3857 u64 start = page_offset(page);
3839 struct extent_buffer *eb; 3858 struct extent_buffer *eb;
3840 int ret = 1; 3859 int ret = 1;
3841 unsigned long i;
3842 unsigned long num_pages;
3843 3860
3844 spin_lock(&tree->buffer_lock); 3861 spin_lock(&tree->buffer_lock);
3845 eb = buffer_search(tree, start); 3862 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3846 if (!eb) 3863 if (!eb) {
3847 goto out; 3864 spin_unlock(&tree->buffer_lock);
3865 return ret;
3866 }
3848 3867
3849 if (atomic_read(&eb->refs) > 1) { 3868 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3850 ret = 0; 3869 ret = 0;
3851 goto out; 3870 goto out;
3852 } 3871 }
3853 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3872
3873 /*
3874 * set @eb->refs to 0 if it is already 1, and then release the @eb.
3875 * Or go back.
3876 */
3877 if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
3854 ret = 0; 3878 ret = 0;
3855 goto out; 3879 goto out;
3856 } 3880 }
3857 /* at this point we can safely release the extent buffer */ 3881
3858 num_pages = num_extent_pages(eb->start, eb->len); 3882 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3859 for (i = 0; i < num_pages; i++)
3860 page_cache_release(extent_buffer_page(eb, i));
3861 rb_erase(&eb->rb_node, &tree->buffer);
3862 __free_extent_buffer(eb);
3863out: 3883out:
3864 spin_unlock(&tree->buffer_lock); 3884 spin_unlock(&tree->buffer_lock);
3885
3886 /* at this point we can safely release the extent buffer */
3887 if (atomic_read(&eb->refs) == 0)
3888 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
3865 return ret; 3889 return ret;
3866} 3890}
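
try_release_extent_buffer() is the release half of the RCU scheme: only
the 1 -> 0 refcount transition, made exclusive by atomic_cmpxchg(), may
remove the buffer from the radix tree, and the free itself is deferred
through call_rcu() so concurrent lockless lookups never touch freed
memory. Condensed (kernel context assumed, not a drop-in):

        spin_lock(&tree->buffer_lock);
        /* only the thread that moves refs from 1 to 0 may free */
        if (atomic_cmpxchg(&eb->refs, 1, 0) == 1)
                radix_tree_delete(&tree->buffer,
                                  start >> PAGE_CACHE_SHIFT);
        spin_unlock(&tree->buffer_lock);

        if (atomic_read(&eb->refs) == 0)
                /* readers inside rcu_read_lock() finish first */
                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
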
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5691c7b590da..a11a92ee2d30 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,13 +20,18 @@
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 22
23/* flags for bio submission */ 23/*
24 * flags for bio submission. The high bits indicate the compression
25 * type for this bio
26 */
24#define EXTENT_BIO_COMPRESSED 1 27#define EXTENT_BIO_COMPRESSED 1
28#define EXTENT_BIO_FLAG_SHIFT 16
25 29
26/* these are bit numbers for test/set bit */ 30/* these are bit numbers for test/set bit */
27#define EXTENT_BUFFER_UPTODATE 0 31#define EXTENT_BUFFER_UPTODATE 0
28#define EXTENT_BUFFER_BLOCKING 1 32#define EXTENT_BUFFER_BLOCKING 1
29#define EXTENT_BUFFER_DIRTY 2 33#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3
30 35
31/* these are flags for extent_clear_unlock_delalloc */ 36/* these are flags for extent_clear_unlock_delalloc */
32#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -85,7 +90,7 @@ struct extent_io_ops {
85 90
86struct extent_io_tree { 91struct extent_io_tree {
87 struct rb_root state; 92 struct rb_root state;
88 struct rb_root buffer; 93 struct radix_tree_root buffer;
89 struct address_space *mapping; 94 struct address_space *mapping;
90 u64 dirty_bytes; 95 u64 dirty_bytes;
91 spinlock_t lock; 96 spinlock_t lock;
@@ -121,9 +126,9 @@ struct extent_buffer {
121 unsigned long map_len; 126 unsigned long map_len;
122 struct page *first_page; 127 struct page *first_page;
123 unsigned long bflags; 128 unsigned long bflags;
124 atomic_t refs;
125 struct list_head leak_list; 129 struct list_head leak_list;
126 struct rb_node rb_node; 130 struct rcu_head rcu_head;
131 atomic_t refs;
127 132
128 /* the spinlock is used to protect most operations */ 133 /* the spinlock is used to protect most operations */
129 spinlock_t lock; 134 spinlock_t lock;
@@ -135,25 +140,27 @@ struct extent_buffer {
135 wait_queue_head_t lock_wq; 140 wait_queue_head_t lock_wq;
136}; 141};
137 142
138struct extent_map_tree; 143static inline void extent_set_compress_type(unsigned long *bio_flags,
144 int compress_type)
145{
146 *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
147}
139 148
140static inline struct extent_state *extent_state_next(struct extent_state *state) 149static inline int extent_compress_type(unsigned long bio_flags)
141{ 150{
142 struct rb_node *node; 151 return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
143 node = rb_next(&state->rb_node);
144 if (!node)
145 return NULL;
146 return rb_entry(node, struct extent_state, rb_node);
147} 152}
148 153
154struct extent_map_tree;
155
149typedef struct extent_map *(get_extent_t)(struct inode *inode, 156typedef struct extent_map *(get_extent_t)(struct inode *inode,
150 struct page *page, 157 struct page *page,
151 size_t page_offset, 158 size_t pg_offset,
152 u64 start, u64 len, 159 u64 start, u64 len,
153 int create); 160 int create);
154 161
155void extent_io_tree_init(struct extent_io_tree *tree, 162void extent_io_tree_init(struct extent_io_tree *tree,
156 struct address_space *mapping, gfp_t mask); 163 struct address_space *mapping);
157int try_release_extent_mapping(struct extent_map_tree *map, 164int try_release_extent_mapping(struct extent_map_tree *map,
158 struct extent_io_tree *tree, struct page *page, 165 struct extent_io_tree *tree, struct page *page,
159 gfp_t mask); 166 gfp_t mask);
@@ -176,7 +183,7 @@ void extent_io_exit(void);
176 183
177u64 count_range_bits(struct extent_io_tree *tree, 184u64 count_range_bits(struct extent_io_tree *tree,
178 u64 *start, u64 search_end, 185 u64 *start, u64 search_end,
179 u64 max_bytes, unsigned long bits); 186 u64 max_bytes, unsigned long bits, int contig);
180 187
181void free_extent_state(struct extent_state *state); 188void free_extent_state(struct extent_state *state);
182int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 189int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -192,21 +199,15 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
192 int bits, int exclusive_bits, u64 *failed_start, 199 int bits, int exclusive_bits, u64 *failed_start,
193 struct extent_state **cached_state, gfp_t mask); 200 struct extent_state **cached_state, gfp_t mask);
194int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 201int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
195 gfp_t mask); 202 struct extent_state **cached_state, gfp_t mask);
196int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 203int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
197 gfp_t mask); 204 gfp_t mask);
198int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 205int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
199 gfp_t mask); 206 gfp_t mask);
200int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 207int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
201 gfp_t mask); 208 gfp_t mask);
202int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
203 gfp_t mask);
204int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
205 u64 end, gfp_t mask);
206int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 209int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
207 struct extent_state **cached_state, gfp_t mask); 210 struct extent_state **cached_state, gfp_t mask);
208int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
209 gfp_t mask);
210int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 211int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
211 u64 *start_ret, u64 *end_ret, int bits); 212 u64 *start_ret, u64 *end_ret, int bits);
212struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 213struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
@@ -227,28 +228,17 @@ int extent_readpages(struct extent_io_tree *tree,
227 struct address_space *mapping, 228 struct address_space *mapping,
228 struct list_head *pages, unsigned nr_pages, 229 struct list_head *pages, unsigned nr_pages,
229 get_extent_t get_extent); 230 get_extent_t get_extent);
230int extent_prepare_write(struct extent_io_tree *tree,
231 struct inode *inode, struct page *page,
232 unsigned from, unsigned to, get_extent_t *get_extent);
233int extent_commit_write(struct extent_io_tree *tree,
234 struct inode *inode, struct page *page,
235 unsigned from, unsigned to);
236sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
237 get_extent_t *get_extent);
238int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 231int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
239 __u64 start, __u64 len, get_extent_t *get_extent); 232 __u64 start, __u64 len, get_extent_t *get_extent);
240int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
241int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); 233int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
242int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); 234int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
243void set_page_extent_mapped(struct page *page); 235void set_page_extent_mapped(struct page *page);
244 236
245struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 237struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
246 u64 start, unsigned long len, 238 u64 start, unsigned long len,
247 struct page *page0, 239 struct page *page0);
248 gfp_t mask);
249struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 240struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
250 u64 start, unsigned long len, 241 u64 start, unsigned long len);
251 gfp_t mask);
252void free_extent_buffer(struct extent_buffer *eb); 242void free_extent_buffer(struct extent_buffer *eb);
253int read_extent_buffer_pages(struct extent_io_tree *tree, 243int read_extent_buffer_pages(struct extent_io_tree *tree,
254 struct extent_buffer *eb, u64 start, int wait, 244 struct extent_buffer *eb, u64 start, int wait,
@@ -276,16 +266,11 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
276 unsigned long src_offset, unsigned long len); 266 unsigned long src_offset, unsigned long len);
277void memset_extent_buffer(struct extent_buffer *eb, char c, 267void memset_extent_buffer(struct extent_buffer *eb, char c,
278 unsigned long start, unsigned long len); 268 unsigned long start, unsigned long len);
279int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
280 struct extent_buffer *eb);
281int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
282int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); 269int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
283int clear_extent_buffer_dirty(struct extent_io_tree *tree, 270int clear_extent_buffer_dirty(struct extent_io_tree *tree,
284 struct extent_buffer *eb); 271 struct extent_buffer *eb);
285int set_extent_buffer_dirty(struct extent_io_tree *tree, 272int set_extent_buffer_dirty(struct extent_io_tree *tree,
286 struct extent_buffer *eb); 273 struct extent_buffer *eb);
287int test_extent_buffer_dirty(struct extent_io_tree *tree,
288 struct extent_buffer *eb);
289int set_extent_buffer_uptodate(struct extent_io_tree *tree, 274int set_extent_buffer_uptodate(struct extent_io_tree *tree,
290 struct extent_buffer *eb); 275 struct extent_buffer *eb);
291int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 276int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -303,11 +288,13 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
303 unsigned long *map_start, 288 unsigned long *map_start,
304 unsigned long *map_len, int km); 289 unsigned long *map_len, int km);
305void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); 290void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
306int release_extent_buffer_tail_pages(struct extent_buffer *eb);
307int extent_range_uptodate(struct extent_io_tree *tree, 291int extent_range_uptodate(struct extent_io_tree *tree,
308 u64 start, u64 end); 292 u64 start, u64 end);
309int extent_clear_unlock_delalloc(struct inode *inode, 293int extent_clear_unlock_delalloc(struct inode *inode,
310 struct extent_io_tree *tree, 294 struct extent_io_tree *tree,
311 u64 start, u64 end, struct page *locked_page, 295 u64 start, u64 end, struct page *locked_page,
312 unsigned long op); 296 unsigned long op);
297struct bio *
298btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
299 gfp_t gfp_flags);
313#endif 300#endif
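
The new inline helpers pack the compression type into the high bits of
the bio flags word, leaving the low bits for flags like
EXTENT_BIO_COMPRESSED. A standalone sketch mirroring the header above
(the type id passed in is illustrative, not a btrfs constant):

#include <stdio.h>

#define EXTENT_BIO_COMPRESSED   1
#define EXTENT_BIO_FLAG_SHIFT   16

/* store the compression type above the flag bits */
static void set_compress_type(unsigned long *bio_flags, int type)
{
        *bio_flags |= (unsigned long)type << EXTENT_BIO_FLAG_SHIFT;
}

static int get_compress_type(unsigned long bio_flags)
{
        return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
}

int main(void)
{
        unsigned long flags = EXTENT_BIO_COMPRESSED;

        set_compress_type(&flags, 2);
        printf("compressed=%lu type=%d\n",
               flags & EXTENT_BIO_COMPRESSED, get_compress_type(flags));
        return 0;
}
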
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 454ca52d6451..2d0410344ea3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/hardirq.h> 5#include <linux/hardirq.h>
6#include "ctree.h"
6#include "extent_map.h" 7#include "extent_map.h"
7 8
8 9
@@ -27,12 +28,11 @@ void extent_map_exit(void)
27/** 28/**
28 * extent_map_tree_init - initialize extent map tree 29 * extent_map_tree_init - initialize extent map tree
29 * @tree: tree to initialize 30 * @tree: tree to initialize
30 * @mask: flags for memory allocations during tree operations
31 * 31 *
32 * Initialize the extent tree @tree. Should be called for each new inode 32 * Initialize the extent tree @tree. Should be called for each new inode
33 * or other user of the extent_map interface. 33 * or other user of the extent_map interface.
34 */ 34 */
35void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) 35void extent_map_tree_init(struct extent_map_tree *tree)
36{ 36{
37 tree->map = RB_ROOT; 37 tree->map = RB_ROOT;
38 rwlock_init(&tree->lock); 38 rwlock_init(&tree->lock);
@@ -40,20 +40,20 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
40 40
41/** 41/**
42 * alloc_extent_map - allocate new extent map structure 42 * alloc_extent_map - allocate new extent map structure
43 * @mask: memory allocation flags
44 * 43 *
45 * Allocate a new extent_map structure. The new structure is 44 * Allocate a new extent_map structure. The new structure is
46 * returned with a reference count of one and needs to be 45 * returned with a reference count of one and needs to be
47 * freed using free_extent_map() 46 * freed using free_extent_map()
48 */ 47 */
49struct extent_map *alloc_extent_map(gfp_t mask) 48struct extent_map *alloc_extent_map(void)
50{ 49{
51 struct extent_map *em; 50 struct extent_map *em;
52 em = kmem_cache_alloc(extent_map_cache, mask); 51 em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
53 if (!em || IS_ERR(em)) 52 if (!em)
54 return em; 53 return NULL;
55 em->in_tree = 0; 54 em->in_tree = 0;
56 em->flags = 0; 55 em->flags = 0;
56 em->compress_type = BTRFS_COMPRESS_NONE;
57 atomic_set(&em->refs, 1); 57 atomic_set(&em->refs, 1);
58 return em; 58 return em;
59} 59}
@@ -241,7 +241,7 @@ out:
241 * Insert @em into @tree or perform a simple forward/backward merge with 241 * Insert @em into @tree or perform a simple forward/backward merge with
242 * existing mappings. The extent_map struct passed in will be inserted 242 * existing mappings. The extent_map struct passed in will be inserted
243 * into the tree directly, with an additional reference taken, or a 243 * into the tree directly, with an additional reference taken, or a
244 * reference dropped if the merge attempt was successfull. 244 * reference dropped if the merge attempt was successful.
245 */ 245 */
246int add_extent_mapping(struct extent_map_tree *tree, 246int add_extent_mapping(struct extent_map_tree *tree,
247 struct extent_map *em) 247 struct extent_map *em)
@@ -335,7 +335,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
335 goto out; 335 goto out;
336 } 336 }
337 if (IS_ERR(rb_node)) { 337 if (IS_ERR(rb_node)) {
338 em = ERR_PTR(PTR_ERR(rb_node)); 338 em = ERR_CAST(rb_node);
339 goto out; 339 goto out;
340 } 340 }
341 em = rb_entry(rb_node, struct extent_map, rb_node); 341 em = rb_entry(rb_node, struct extent_map, rb_node);
@@ -384,7 +384,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
384 goto out; 384 goto out;
385 } 385 }
386 if (IS_ERR(rb_node)) { 386 if (IS_ERR(rb_node)) {
387 em = ERR_PTR(PTR_ERR(rb_node)); 387 em = ERR_CAST(rb_node);
388 goto out; 388 goto out;
389 } 389 }
390 em = rb_entry(rb_node, struct extent_map, rb_node); 390 em = rb_entry(rb_node, struct extent_map, rb_node);
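
Several hunks in this patch convert open-coded "!em || IS_ERR(em)" tests
to IS_ERR_OR_NULL() and replace ERR_PTR(PTR_ERR(...)) with ERR_CAST(). A
simplified userspace re-creation of the idiom, for illustration only:

#include <stdio.h>

/* simplified take on the kernel's err.h: small negative errno
 * values encoded at the very top of the address space */
#define MAX_ERRNO       4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
static inline int IS_ERR_OR_NULL(const void *ptr)
{
        return !ptr || IS_ERR(ptr);
}
/* ERR_CAST: carry an error pointer across pointer types */
static inline void *ERR_CAST(const void *ptr) { return (void *)ptr; }

int main(void)
{
        void *p = ERR_PTR(-12);         /* -ENOMEM */

        printf("%d %d %ld\n", IS_ERR(p), IS_ERR_OR_NULL(NULL), PTR_ERR(p));
        return 0;
}
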
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e647..33a7890b1f40 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
26 unsigned long flags; 26 unsigned long flags;
27 struct block_device *bdev; 27 struct block_device *bdev;
28 atomic_t refs; 28 atomic_t refs;
29 int in_tree; 29 unsigned int in_tree:1;
30 unsigned int compress_type:4;
30}; 31};
31 32
32struct extent_map_tree { 33struct extent_map_tree {
@@ -48,14 +49,14 @@ static inline u64 extent_map_block_end(struct extent_map *em)
48 return em->block_start + em->block_len; 49 return em->block_start + em->block_len;
49} 50}
50 51
51void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); 52void extent_map_tree_init(struct extent_map_tree *tree);
52struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, 53struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
53 u64 start, u64 len); 54 u64 start, u64 len);
54int add_extent_mapping(struct extent_map_tree *tree, 55int add_extent_mapping(struct extent_map_tree *tree,
55 struct extent_map *em); 56 struct extent_map *em);
56int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); 57int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
57 58
58struct extent_map *alloc_extent_map(gfp_t mask); 59struct extent_map *alloc_extent_map(void);
59void free_extent_map(struct extent_map *em); 60void free_extent_map(struct extent_map *em);
60int __init extent_map_init(void); 61int __init extent_map_init(void);
61void extent_map_exit(void); 62void extent_map_exit(void);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a562a250ae77..90d4ee52cd45 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -48,7 +48,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
48 struct extent_buffer *leaf; 48 struct extent_buffer *leaf;
49 49
50 path = btrfs_alloc_path(); 50 path = btrfs_alloc_path();
51 BUG_ON(!path); 51 if (!path)
52 return -ENOMEM;
52 file_key.objectid = objectid; 53 file_key.objectid = objectid;
53 file_key.offset = pos; 54 file_key.offset = pos;
54 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 55 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
@@ -169,6 +170,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
169 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 170 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
170 171
171 path = btrfs_alloc_path(); 172 path = btrfs_alloc_path();
173 if (!path)
174 return -ENOMEM;
172 if (bio->bi_size > PAGE_CACHE_SIZE * 8) 175 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
173 path->reada = 2; 176 path->reada = 2;
174 177
@@ -190,7 +193,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
190 u32 item_size; 193 u32 item_size;
191 194
192 if (item) 195 if (item)
193 btrfs_release_path(root, path); 196 btrfs_release_path(path);
194 item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, 197 item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
195 path, disk_bytenr, 0); 198 path, disk_bytenr, 0);
196 if (IS_ERR(item)) { 199 if (IS_ERR(item)) {
@@ -205,12 +208,13 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
205 EXTENT_NODATASUM, GFP_NOFS); 208 EXTENT_NODATASUM, GFP_NOFS);
206 } else { 209 } else {
207 printk(KERN_INFO "btrfs no csum found " 210 printk(KERN_INFO "btrfs no csum found "
208 "for inode %lu start %llu\n", 211 "for inode %llu start %llu\n",
209 inode->i_ino, 212 (unsigned long long)
213 btrfs_ino(inode),
210 (unsigned long long)offset); 214 (unsigned long long)offset);
211 } 215 }
212 item = NULL; 216 item = NULL;
213 btrfs_release_path(root, path); 217 btrfs_release_path(path);
214 goto found; 218 goto found;
215 } 219 }
216 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 220 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
@@ -263,7 +267,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
263} 267}
264 268
265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 269int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
266 struct list_head *list) 270 struct list_head *list, int search_commit)
267{ 271{
268 struct btrfs_key key; 272 struct btrfs_key key;
269 struct btrfs_path *path; 273 struct btrfs_path *path;
@@ -280,6 +284,12 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
280 path = btrfs_alloc_path(); 284 path = btrfs_alloc_path();
281 BUG_ON(!path); 285 BUG_ON(!path);
282 286
287 if (search_commit) {
288 path->skip_locking = 1;
289 path->reada = 2;
290 path->search_commit_root = 1;
291 }
292
283 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 293 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
284 key.offset = start; 294 key.offset = start;
285 key.type = BTRFS_EXTENT_CSUM_KEY; 295 key.type = BTRFS_EXTENT_CSUM_KEY;
@@ -492,7 +502,6 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u32 new_size = (bytenr - key->offset) >> blocksize_bits; 502 u32 new_size = (bytenr - key->offset) >> blocksize_bits;
493 new_size *= csum_size; 503 new_size *= csum_size;
494 ret = btrfs_truncate_item(trans, root, path, new_size, 1); 504 ret = btrfs_truncate_item(trans, root, path, new_size, 1);
495 BUG_ON(ret);
496 } else if (key->offset >= bytenr && csum_end > end_byte && 505 } else if (key->offset >= bytenr && csum_end > end_byte &&
497 end_byte > key->offset) { 506 end_byte > key->offset) {
498 /* 507 /*
@@ -505,7 +514,6 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
505 new_size *= csum_size; 514 new_size *= csum_size;
506 515
507 ret = btrfs_truncate_item(trans, root, path, new_size, 0); 516 ret = btrfs_truncate_item(trans, root, path, new_size, 0);
508 BUG_ON(ret);
509 517
510 key->offset = end_byte; 518 key->offset = end_byte;
511 ret = btrfs_set_item_key_safe(trans, root, path, key); 519 ret = btrfs_set_item_key_safe(trans, root, path, key);
@@ -536,6 +544,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
536 root = root->fs_info->csum_root; 544 root = root->fs_info->csum_root;
537 545
538 path = btrfs_alloc_path(); 546 path = btrfs_alloc_path();
547 if (!path)
548 return -ENOMEM;
539 549
540 while (1) { 550 while (1) {
541 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 551 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -546,9 +556,12 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
546 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 556 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
547 if (ret > 0) { 557 if (ret > 0) {
548 if (path->slots[0] == 0) 558 if (path->slots[0] == 0)
549 goto out; 559 break;
550 path->slots[0]--; 560 path->slots[0]--;
561 } else if (ret < 0) {
562 break;
551 } 563 }
564
552 leaf = path->nodes[0]; 565 leaf = path->nodes[0];
553 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 566 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
554 567
@@ -571,7 +584,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
571 /* delete the entire item, it is inside our range */ 584 /* delete the entire item, it is inside our range */
572 if (key.offset >= bytenr && csum_end <= end_byte) { 585 if (key.offset >= bytenr && csum_end <= end_byte) {
573 ret = btrfs_del_item(trans, root, path); 586 ret = btrfs_del_item(trans, root, path);
574 BUG_ON(ret); 587 if (ret)
588 goto out;
575 if (key.offset == bytenr) 589 if (key.offset == bytenr)
576 break; 590 break;
577 } else if (key.offset < bytenr && csum_end > end_byte) { 591 } else if (key.offset < bytenr && csum_end > end_byte) {
@@ -623,11 +637,12 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
623 if (key.offset < bytenr) 637 if (key.offset < bytenr)
624 break; 638 break;
625 } 639 }
626 btrfs_release_path(root, path); 640 btrfs_release_path(path);
627 } 641 }
642 ret = 0;
628out: 643out:
629 btrfs_free_path(path); 644 btrfs_free_path(path);
630 return 0; 645 return ret;
631} 646}
632 647
633int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 648int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
@@ -714,7 +729,7 @@ again:
714 * at this point, we know the tree has an item, but it isn't big 729 * at this point, we know the tree has an item, but it isn't big
715 * enough yet to put our csum in. Grow it 730 * enough yet to put our csum in. Grow it
716 */ 731 */
717 btrfs_release_path(root, path); 732 btrfs_release_path(path);
718 ret = btrfs_search_slot(trans, root, &file_key, path, 733 ret = btrfs_search_slot(trans, root, &file_key, path,
719 csum_size, 1); 734 csum_size, 1);
720 if (ret < 0) 735 if (ret < 0)
@@ -753,12 +768,11 @@ again:
753 goto insert; 768 goto insert;
754 769
755 ret = btrfs_extend_item(trans, root, path, diff); 770 ret = btrfs_extend_item(trans, root, path, diff);
756 BUG_ON(ret);
757 goto csum; 771 goto csum;
758 } 772 }
759 773
760insert: 774insert:
761 btrfs_release_path(root, path); 775 btrfs_release_path(path);
762 csum_offset = 0; 776 csum_offset = 0;
763 if (found_next) { 777 if (found_next) {
764 u64 tmp = total_bytes + root->sectorsize; 778 u64 tmp = total_bytes + root->sectorsize;
@@ -842,7 +856,7 @@ next_sector:
842 } 856 }
843 btrfs_mark_buffer_dirty(path->nodes[0]); 857 btrfs_mark_buffer_dirty(path->nodes[0]);
844 if (total_bytes < sums->len) { 858 if (total_bytes < sums->len) {
845 btrfs_release_path(root, path); 859 btrfs_release_path(path);
846 cond_resched(); 860 cond_resched();
847 goto again; 861 goto again;
848 } 862 }
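
The file-item.c hunks above replace BUG_ON() on allocation failure with a
plain -ENOMEM return and route item-deletion errors to a single exit
label instead of crashing. A condensed kernel-style sketch of the shape
these functions converge on (do_work() is a hypothetical placeholder):

static int example_op(void)
{
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;         /* was BUG_ON(!path) */

        ret = do_work(path);            /* hypothetical helper */
        if (ret)
                goto out;               /* propagate, don't BUG_ON */

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}
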
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e354c33df082..fa4ef18b66b1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/falloc.h>
27#include <linux/swap.h> 28#include <linux/swap.h>
28#include <linux/writeback.h> 29#include <linux/writeback.h>
29#include <linux/statfs.h> 30#include <linux/statfs.h>
@@ -39,16 +40,274 @@
39#include "locking.h" 40#include "locking.h"
40#include "compat.h" 41#include "compat.h"
41 42
43/*
44 * when auto defrag is enabled we
45 * queue up these defrag structs to remember which
46 * inodes need defragging passes
47 */
48struct inode_defrag {
49 struct rb_node rb_node;
50 /* objectid */
51 u64 ino;
52 /*
53 * transid where the defrag was added, we search for
54 * extents newer than this
55 */
56 u64 transid;
57
58 /* root objectid */
59 u64 root;
60
61 /* last offset we were able to defrag */
62 u64 last_offset;
63
64 /* if we've wrapped around back to zero once already */
65 int cycled;
66};
67
 68/* insert a record for an inode into the defrag tree. The lock
69 * must be held already
70 *
71 * If you're inserting a record for an older transid than an
72 * existing record, the transid already in the tree is lowered
73 *
 74 * If an existing record is found, the defrag item you
75 * pass in is freed
76 */
77static int __btrfs_add_inode_defrag(struct inode *inode,
78 struct inode_defrag *defrag)
79{
80 struct btrfs_root *root = BTRFS_I(inode)->root;
81 struct inode_defrag *entry;
82 struct rb_node **p;
83 struct rb_node *parent = NULL;
84
85 p = &root->fs_info->defrag_inodes.rb_node;
86 while (*p) {
87 parent = *p;
88 entry = rb_entry(parent, struct inode_defrag, rb_node);
89
90 if (defrag->ino < entry->ino)
91 p = &parent->rb_left;
92 else if (defrag->ino > entry->ino)
93 p = &parent->rb_right;
94 else {
95 /* if we're reinserting an entry for
96 * an old defrag run, make sure to
97 * lower the transid of our existing record
98 */
99 if (defrag->transid < entry->transid)
100 entry->transid = defrag->transid;
101 if (defrag->last_offset > entry->last_offset)
102 entry->last_offset = defrag->last_offset;
103 goto exists;
104 }
105 }
106 BTRFS_I(inode)->in_defrag = 1;
107 rb_link_node(&defrag->rb_node, parent, p);
108 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
109 return 0;
110
111exists:
112 kfree(defrag);
113 return 0;
114
115}
116
117/*
118 * insert a defrag record for this inode if auto defrag is
119 * enabled
120 */
121int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
122 struct inode *inode)
123{
124 struct btrfs_root *root = BTRFS_I(inode)->root;
125 struct inode_defrag *defrag;
126 int ret = 0;
127 u64 transid;
128
129 if (!btrfs_test_opt(root, AUTO_DEFRAG))
130 return 0;
131
132 if (btrfs_fs_closing(root->fs_info))
133 return 0;
134
135 if (BTRFS_I(inode)->in_defrag)
136 return 0;
137
138 if (trans)
139 transid = trans->transid;
140 else
141 transid = BTRFS_I(inode)->root->last_trans;
142
143 defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
144 if (!defrag)
145 return -ENOMEM;
146
147 defrag->ino = btrfs_ino(inode);
148 defrag->transid = transid;
149 defrag->root = root->root_key.objectid;
150
151 spin_lock(&root->fs_info->defrag_inodes_lock);
152 if (!BTRFS_I(inode)->in_defrag)
153 ret = __btrfs_add_inode_defrag(inode, defrag);
154 spin_unlock(&root->fs_info->defrag_inodes_lock);
155 return ret;
156}
157
158/*
159 * must be called with the defrag_inodes lock held
160 */
161struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
162 struct rb_node **next)
163{
164 struct inode_defrag *entry = NULL;
165 struct rb_node *p;
166 struct rb_node *parent = NULL;
167
168 p = info->defrag_inodes.rb_node;
169 while (p) {
170 parent = p;
171 entry = rb_entry(parent, struct inode_defrag, rb_node);
172
173 if (ino < entry->ino)
174 p = parent->rb_left;
175 else if (ino > entry->ino)
176 p = parent->rb_right;
177 else
178 return entry;
179 }
180
181 if (next) {
182 while (parent && ino > entry->ino) {
183 parent = rb_next(parent);
184 entry = rb_entry(parent, struct inode_defrag, rb_node);
185 }
186 *next = parent;
187 }
188 return NULL;
189}
190
191/*
192 * run through the list of inodes in the FS that need
193 * defragging
194 */
195int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
196{
197 struct inode_defrag *defrag;
198 struct btrfs_root *inode_root;
199 struct inode *inode;
200 struct rb_node *n;
201 struct btrfs_key key;
202 struct btrfs_ioctl_defrag_range_args range;
203 u64 first_ino = 0;
204 int num_defrag;
205 int defrag_batch = 1024;
206
207 memset(&range, 0, sizeof(range));
208 range.len = (u64)-1;
209
210 atomic_inc(&fs_info->defrag_running);
211 spin_lock(&fs_info->defrag_inodes_lock);
212 while (1) {
213 n = NULL;
214
215 /* find an inode to defrag */
216 defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
217 if (!defrag) {
218 if (n)
219 defrag = rb_entry(n, struct inode_defrag, rb_node);
220 else if (first_ino) {
221 first_ino = 0;
222 continue;
223 } else {
224 break;
225 }
226 }
227
228 /* remove it from the rbtree */
229 first_ino = defrag->ino + 1;
230 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
231
232 if (btrfs_fs_closing(fs_info))
233 goto next_free;
234
235 spin_unlock(&fs_info->defrag_inodes_lock);
236
237 /* get the inode */
238 key.objectid = defrag->root;
239 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
240 key.offset = (u64)-1;
241 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
242 if (IS_ERR(inode_root))
243 goto next;
244
245 key.objectid = defrag->ino;
246 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
247 key.offset = 0;
248
249 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
250 if (IS_ERR(inode))
251 goto next;
252
253 /* do a chunk of defrag */
254 BTRFS_I(inode)->in_defrag = 0;
255 range.start = defrag->last_offset;
256 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
257 defrag_batch);
258 /*
259 * if we filled the whole defrag batch, there
260 * must be more work to do. Queue this defrag
261 * again
262 */
263 if (num_defrag == defrag_batch) {
264 defrag->last_offset = range.start;
265 __btrfs_add_inode_defrag(inode, defrag);
266 /*
267 * we don't want to kfree defrag, we added it back to
268 * the rbtree
269 */
270 defrag = NULL;
271 } else if (defrag->last_offset && !defrag->cycled) {
272 /*
273 * we didn't fill our defrag batch, but
274 * we didn't start at zero. Make sure we loop
275 * around to the start of the file.
276 */
277 defrag->last_offset = 0;
278 defrag->cycled = 1;
279 __btrfs_add_inode_defrag(inode, defrag);
280 defrag = NULL;
281 }
282
283 iput(inode);
284next:
285 spin_lock(&fs_info->defrag_inodes_lock);
286next_free:
287 kfree(defrag);
288 }
289 spin_unlock(&fs_info->defrag_inodes_lock);
290
291 atomic_dec(&fs_info->defrag_running);
292
293 /*
294 * during unmount, we use the transaction_wait queue to
295 * wait for the defragger to stop
296 */
297 wake_up(&fs_info->transaction_wait);
298 return 0;
299}
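Each pass above does the same work the defrag ioctl does for a single file, driven by the same btrfs_ioctl_defrag_range_args. A minimal userspace counterpart, assuming those definitions are available from <linux/btrfs.h> (older userspace copied them from the kernel's fs/btrfs/ioctl.h):

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

int defrag_whole_file(const char *path)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd, ret;

	fd = open(path, O_RDWR);
	if (fd < 0)
		return -1;

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;	/* whole file, as in the kernel loop above */

	ret = ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
	close(fd);
	return ret;
}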
42 300
43/* simple helper to fault in pages and copy. This should go away 301/* simple helper to fault in pages and copy. This should go away
44 * and be replaced with calls into generic code. 302 * and be replaced with calls into generic code.
45 */ 303 */
46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 304static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
47 int write_bytes, 305 size_t write_bytes,
48 struct page **prepared_pages, 306 struct page **prepared_pages,
49 struct iov_iter *i) 307 struct iov_iter *i)
50{ 308{
51 size_t copied; 309 size_t copied = 0;
310 size_t total_copied = 0;
52 int pg = 0; 311 int pg = 0;
53 int offset = pos & (PAGE_CACHE_SIZE - 1); 312 int offset = pos & (PAGE_CACHE_SIZE - 1);
54 313
@@ -56,23 +315,38 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
56 size_t count = min_t(size_t, 315 size_t count = min_t(size_t,
57 PAGE_CACHE_SIZE - offset, write_bytes); 316 PAGE_CACHE_SIZE - offset, write_bytes);
58 struct page *page = prepared_pages[pg]; 317 struct page *page = prepared_pages[pg];
59again: 318 /*
60 if (unlikely(iov_iter_fault_in_readable(i, count))) 319 * Copy data from userspace to the current page
61 return -EFAULT; 320 *
62 321 * Disable pagefault to avoid recursive lock since
63 /* Copy data from userspace to the current page */ 322 * the pages are already locked
64 copied = iov_iter_copy_from_user(page, i, offset, count); 323 */
324 pagefault_disable();
325 copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
326 pagefault_enable();
65 327
66 /* Flush processor's dcache for this page */ 328 /* Flush processor's dcache for this page */
67 flush_dcache_page(page); 329 flush_dcache_page(page);
330
331 /*
332 * if we get a partial write, we can end up with
333 * partially up to date pages. These add
334 * a lot of complexity, so make sure they don't
335 * happen by forcing this copy to be retried.
336 *
337 * The rest of the btrfs_file_write code will fall
338 * back to page at a time copies after we return 0.
339 */
340 if (!PageUptodate(page) && copied < count)
341 copied = 0;
342
68 iov_iter_advance(i, copied); 343 iov_iter_advance(i, copied);
69 write_bytes -= copied; 344 write_bytes -= copied;
345 total_copied += copied;
70 346
71 if (unlikely(copied == 0)) { 347 /* Return to btrfs_file_aio_write to fault page */
72 count = min_t(size_t, PAGE_CACHE_SIZE - offset, 348 if (unlikely(copied == 0))
73 iov_iter_single_seg_count(i)); 349 break;
74 goto again;
75 }
76 350
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 351 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
78 offset += copied; 352 offset += copied;
@@ -81,18 +355,16 @@ again:
81 offset = 0; 355 offset = 0;
82 } 356 }
83 } 357 }
84 return 0; 358 return total_copied;
85} 359}
86 360
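The retry loop moved out of this helper: it now copies what it can with page faults disabled and returns the running total, letting the caller fault pages in and call again. A self-contained sketch of that convention, with copy_chunk() standing in for iov_iter_copy_from_user_atomic():

#include <stddef.h>
#include <string.h>

/* stands in for iov_iter_copy_from_user_atomic(); copies at most 8
 * bytes per call to mimic a partial copy */
static size_t copy_chunk(void *dst, const void *src, size_t len)
{
	size_t n = len < 8 ? len : 8;

	memcpy(dst, src, n);
	return n;
}

static size_t copy_all(void *dst, const void *src, size_t len)
{
	size_t total_copied = 0;

	while (len > 0) {
		size_t copied = copy_chunk((char *)dst + total_copied,
					   (const char *)src + total_copied,
					   len);

		total_copied += copied;
		len -= copied;
		/* zero bytes copied means the source page was not
		 * faulted in: return the running total so the caller
		 * can fault it in and retry, as btrfs_copy_from_user
		 * now does */
		if (copied == 0)
			break;
	}
	return total_copied;
}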
87/* 361/*
88 * unlocks pages after btrfs_file_write is done with them 362 * unlocks pages after btrfs_file_write is done with them
89 */ 363 */
90static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) 364void btrfs_drop_pages(struct page **pages, size_t num_pages)
91{ 365{
92 size_t i; 366 size_t i;
93 for (i = 0; i < num_pages; i++) { 367 for (i = 0; i < num_pages; i++) {
94 if (!pages[i])
95 break;
96 /* page checked is some magic around finding pages that 368 /* page checked is some magic around finding pages that
97 * have been modified without going through btrfs_set_page_dirty 369 * have been modified without going through btrfs_set_page_dirty
98 * clear it here 370 * clear it here
@@ -112,17 +384,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
112 * this also makes the decision about creating an inline extent vs 384 * this also makes the decision about creating an inline extent vs
113 * doing real data extents, marking pages dirty and delalloc as required. 385 * doing real data extents, marking pages dirty and delalloc as required.
114 */ 386 */
115static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, 387int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
116 struct btrfs_root *root, 388 struct page **pages, size_t num_pages,
117 struct file *file, 389 loff_t pos, size_t write_bytes,
118 struct page **pages, 390 struct extent_state **cached)
119 size_t num_pages,
120 loff_t pos,
121 size_t write_bytes)
122{ 391{
123 int err = 0; 392 int err = 0;
124 int i; 393 int i;
125 struct inode *inode = fdentry(file)->d_inode;
126 u64 num_bytes; 394 u64 num_bytes;
127 u64 start_pos; 395 u64 start_pos;
128 u64 end_of_last_block; 396 u64 end_of_last_block;
@@ -135,8 +403,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
135 403
136 end_of_last_block = start_pos + num_bytes - 1; 404 end_of_last_block = start_pos + num_bytes - 1;
137 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 405 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
138 NULL); 406 cached);
139 BUG_ON(err); 407 if (err)
408 return err;
140 409
141 for (i = 0; i < num_pages; i++) { 410 for (i = 0; i < num_pages; i++) {
142 struct page *p = pages[i]; 411 struct page *p = pages[i];
@@ -144,13 +413,14 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
144 ClearPageChecked(p); 413 ClearPageChecked(p);
145 set_page_dirty(p); 414 set_page_dirty(p);
146 } 415 }
147 if (end_pos > isize) { 416
417 /*
418 * we've only changed i_size in ram, and we haven't updated
419 * the disk i_size. There is no need to log the inode
420 * at this time.
421 */
422 if (end_pos > isize)
148 i_size_write(inode, end_pos); 423 i_size_write(inode, end_pos);
149 /* we've only changed i_size in ram, and we haven't updated
150 * the disk i_size. There is no need to log the inode
151 * at this time.
152 */
153 }
154 return 0; 424 return 0;
155} 425}
156 426
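btrfs_dirty_pages derives start_pos and num_bytes by rounding pos down and the write length up to sector boundaries (the computation sits just above this hunk). A worked example of that arithmetic with a 4K block size:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t blocksize = 4096;	/* root->sectorsize in the kernel */
	uint64_t pos = 5000, write_bytes = 3000;

	uint64_t start_pos = pos & ~(blocksize - 1);		 /* 4096 */
	uint64_t num_bytes = (write_bytes + pos - start_pos +
			      blocksize - 1) & ~(blocksize - 1); /* 4096 */

	/* the write touches bytes 5000..7999, so one whole block past
	 * start_pos covers it: end_of_last_block = 8191 */
	printf("start_pos=%llu num_bytes=%llu end_of_last_block=%llu\n",
	       (unsigned long long)start_pos,
	       (unsigned long long)num_bytes,
	       (unsigned long long)(start_pos + num_bytes - 1));
	return 0;
}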
@@ -178,9 +448,10 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
178 } 448 }
179 while (1) { 449 while (1) {
180 if (!split) 450 if (!split)
181 split = alloc_extent_map(GFP_NOFS); 451 split = alloc_extent_map();
182 if (!split2) 452 if (!split2)
183 split2 = alloc_extent_map(GFP_NOFS); 453 split2 = alloc_extent_map();
454 BUG_ON(!split || !split2);
184 455
185 write_lock(&em_tree->lock); 456 write_lock(&em_tree->lock);
186 em = lookup_extent_mapping(em_tree, start, len); 457 em = lookup_extent_mapping(em_tree, start, len);
@@ -220,6 +491,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
220 491
221 split->bdev = em->bdev; 492 split->bdev = em->bdev;
222 split->flags = flags; 493 split->flags = flags;
494 split->compress_type = em->compress_type;
223 ret = add_extent_mapping(em_tree, split); 495 ret = add_extent_mapping(em_tree, split);
224 BUG_ON(ret); 496 BUG_ON(ret);
225 free_extent_map(split); 497 free_extent_map(split);
@@ -234,6 +506,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
234 split->len = em->start + em->len - (start + len); 506 split->len = em->start + em->len - (start + len);
235 split->bdev = em->bdev; 507 split->bdev = em->bdev;
236 split->flags = flags; 508 split->flags = flags;
509 split->compress_type = em->compress_type;
237 510
238 if (compressed) { 511 if (compressed) {
239 split->block_len = em->block_len; 512 split->block_len = em->block_len;
@@ -282,6 +555,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
282 struct btrfs_path *path; 555 struct btrfs_path *path;
283 struct btrfs_key key; 556 struct btrfs_key key;
284 struct btrfs_key new_key; 557 struct btrfs_key new_key;
558 u64 ino = btrfs_ino(inode);
285 u64 search_start = start; 559 u64 search_start = start;
286 u64 disk_bytenr = 0; 560 u64 disk_bytenr = 0;
287 u64 num_bytes = 0; 561 u64 num_bytes = 0;
@@ -302,14 +576,14 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
302 576
303 while (1) { 577 while (1) {
304 recow = 0; 578 recow = 0;
305 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 579 ret = btrfs_lookup_file_extent(trans, root, path, ino,
306 search_start, -1); 580 search_start, -1);
307 if (ret < 0) 581 if (ret < 0)
308 break; 582 break;
309 if (ret > 0 && path->slots[0] > 0 && search_start == start) { 583 if (ret > 0 && path->slots[0] > 0 && search_start == start) {
310 leaf = path->nodes[0]; 584 leaf = path->nodes[0];
311 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); 585 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
312 if (key.objectid == inode->i_ino && 586 if (key.objectid == ino &&
313 key.type == BTRFS_EXTENT_DATA_KEY) 587 key.type == BTRFS_EXTENT_DATA_KEY)
314 path->slots[0]--; 588 path->slots[0]--;
315 } 589 }
@@ -330,7 +604,7 @@ next_slot:
330 } 604 }
331 605
332 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 606 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
333 if (key.objectid > inode->i_ino || 607 if (key.objectid > ino ||
334 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) 608 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
335 break; 609 break;
336 610
@@ -360,7 +634,7 @@ next_slot:
360 634
361 search_start = max(key.offset, start); 635 search_start = max(key.offset, start);
362 if (recow) { 636 if (recow) {
363 btrfs_release_path(root, path); 637 btrfs_release_path(path);
364 continue; 638 continue;
365 } 639 }
366 640
@@ -377,7 +651,7 @@ next_slot:
377 ret = btrfs_duplicate_item(trans, root, path, 651 ret = btrfs_duplicate_item(trans, root, path,
378 &new_key); 652 &new_key);
379 if (ret == -EAGAIN) { 653 if (ret == -EAGAIN) {
380 btrfs_release_path(root, path); 654 btrfs_release_path(path);
381 continue; 655 continue;
382 } 656 }
383 if (ret < 0) 657 if (ret < 0)
@@ -500,7 +774,7 @@ next_slot:
500 del_nr = 0; 774 del_nr = 0;
501 del_slot = 0; 775 del_slot = 0;
502 776
503 btrfs_release_path(root, path); 777 btrfs_release_path(path);
504 continue; 778 continue;
505 } 779 }
506 780
@@ -576,6 +850,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
576 int del_slot = 0; 850 int del_slot = 0;
577 int recow; 851 int recow;
578 int ret; 852 int ret;
853 u64 ino = btrfs_ino(inode);
579 854
580 btrfs_drop_extent_cache(inode, start, end - 1, 0); 855 btrfs_drop_extent_cache(inode, start, end - 1, 0);
581 856
@@ -584,18 +859,19 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
584again: 859again:
585 recow = 0; 860 recow = 0;
586 split = start; 861 split = start;
587 key.objectid = inode->i_ino; 862 key.objectid = ino;
588 key.type = BTRFS_EXTENT_DATA_KEY; 863 key.type = BTRFS_EXTENT_DATA_KEY;
589 key.offset = split; 864 key.offset = split;
590 865
591 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 866 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
867 if (ret < 0)
868 goto out;
592 if (ret > 0 && path->slots[0] > 0) 869 if (ret > 0 && path->slots[0] > 0)
593 path->slots[0]--; 870 path->slots[0]--;
594 871
595 leaf = path->nodes[0]; 872 leaf = path->nodes[0];
596 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 873 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
597 BUG_ON(key.objectid != inode->i_ino || 874 BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
598 key.type != BTRFS_EXTENT_DATA_KEY);
599 fi = btrfs_item_ptr(leaf, path->slots[0], 875 fi = btrfs_item_ptr(leaf, path->slots[0],
600 struct btrfs_file_extent_item); 876 struct btrfs_file_extent_item);
601 BUG_ON(btrfs_file_extent_type(leaf, fi) != 877 BUG_ON(btrfs_file_extent_type(leaf, fi) !=
@@ -612,7 +888,7 @@ again:
612 other_start = 0; 888 other_start = 0;
613 other_end = start; 889 other_end = start;
614 if (extent_mergeable(leaf, path->slots[0] - 1, 890 if (extent_mergeable(leaf, path->slots[0] - 1,
615 inode->i_ino, bytenr, orig_offset, 891 ino, bytenr, orig_offset,
616 &other_start, &other_end)) { 892 &other_start, &other_end)) {
617 new_key.offset = end; 893 new_key.offset = end;
618 btrfs_set_item_key_safe(trans, root, path, &new_key); 894 btrfs_set_item_key_safe(trans, root, path, &new_key);
@@ -635,7 +911,7 @@ again:
635 other_start = end; 911 other_start = end;
636 other_end = 0; 912 other_end = 0;
637 if (extent_mergeable(leaf, path->slots[0] + 1, 913 if (extent_mergeable(leaf, path->slots[0] + 1,
638 inode->i_ino, bytenr, orig_offset, 914 ino, bytenr, orig_offset,
639 &other_start, &other_end)) { 915 &other_start, &other_end)) {
640 fi = btrfs_item_ptr(leaf, path->slots[0], 916 fi = btrfs_item_ptr(leaf, path->slots[0],
641 struct btrfs_file_extent_item); 917 struct btrfs_file_extent_item);
@@ -663,7 +939,7 @@ again:
663 new_key.offset = split; 939 new_key.offset = split;
664 ret = btrfs_duplicate_item(trans, root, path, &new_key); 940 ret = btrfs_duplicate_item(trans, root, path, &new_key);
665 if (ret == -EAGAIN) { 941 if (ret == -EAGAIN) {
666 btrfs_release_path(root, path); 942 btrfs_release_path(path);
667 goto again; 943 goto again;
668 } 944 }
669 BUG_ON(ret < 0); 945 BUG_ON(ret < 0);
@@ -684,7 +960,7 @@ again:
684 960
685 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 961 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
686 root->root_key.objectid, 962 root->root_key.objectid,
687 inode->i_ino, orig_offset); 963 ino, orig_offset);
688 BUG_ON(ret); 964 BUG_ON(ret);
689 965
690 if (split == start) { 966 if (split == start) {
@@ -700,10 +976,10 @@ again:
700 other_start = end; 976 other_start = end;
701 other_end = 0; 977 other_end = 0;
702 if (extent_mergeable(leaf, path->slots[0] + 1, 978 if (extent_mergeable(leaf, path->slots[0] + 1,
703 inode->i_ino, bytenr, orig_offset, 979 ino, bytenr, orig_offset,
704 &other_start, &other_end)) { 980 &other_start, &other_end)) {
705 if (recow) { 981 if (recow) {
706 btrfs_release_path(root, path); 982 btrfs_release_path(path);
707 goto again; 983 goto again;
708 } 984 }
709 extent_end = other_end; 985 extent_end = other_end;
@@ -711,16 +987,16 @@ again:
711 del_nr++; 987 del_nr++;
712 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 988 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
713 0, root->root_key.objectid, 989 0, root->root_key.objectid,
714 inode->i_ino, orig_offset); 990 ino, orig_offset);
715 BUG_ON(ret); 991 BUG_ON(ret);
716 } 992 }
717 other_start = 0; 993 other_start = 0;
718 other_end = start; 994 other_end = start;
719 if (extent_mergeable(leaf, path->slots[0] - 1, 995 if (extent_mergeable(leaf, path->slots[0] - 1,
720 inode->i_ino, bytenr, orig_offset, 996 ino, bytenr, orig_offset,
721 &other_start, &other_end)) { 997 &other_start, &other_end)) {
722 if (recow) { 998 if (recow) {
723 btrfs_release_path(root, path); 999 btrfs_release_path(path);
724 goto again; 1000 goto again;
725 } 1001 }
726 key.offset = other_start; 1002 key.offset = other_start;
@@ -728,7 +1004,7 @@ again:
728 del_nr++; 1004 del_nr++;
729 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1005 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
730 0, root->root_key.objectid, 1006 0, root->root_key.objectid,
731 inode->i_ino, orig_offset); 1007 ino, orig_offset);
732 BUG_ON(ret); 1008 BUG_ON(ret);
733 } 1009 }
734 if (del_nr == 0) { 1010 if (del_nr == 0) {
@@ -755,6 +1031,27 @@ out:
755} 1031}
756 1032
757/* 1033/*
1034 * on error we return an unlocked page and the error value;
1035 * on success we return a locked page and 0
1036 */
1037static int prepare_uptodate_page(struct page *page, u64 pos)
1038{
1039 int ret = 0;
1040
1041 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
1042 ret = btrfs_readpage(NULL, page);
1043 if (ret)
1044 return ret;
1045 lock_page(page);
1046 if (!PageUptodate(page)) {
1047 unlock_page(page);
1048 return -EIO;
1049 }
1050 }
1051 return 0;
1052}
1053
1054/*
758 * this gets pages into the page cache and locks them down, it also properly 1055 * this gets pages into the page cache and locks them down, it also properly
759 * waits for data=ordered extents to finish before allowing the pages to be 1056 * waits for data=ordered extents to finish before allowing the pages to be
760 * modified. 1057 * modified.
@@ -769,6 +1066,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
769 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1066 unsigned long index = pos >> PAGE_CACHE_SHIFT;
770 struct inode *inode = fdentry(file)->d_inode; 1067 struct inode *inode = fdentry(file)->d_inode;
771 int err = 0; 1068 int err = 0;
1069 int faili = 0;
772 u64 start_pos; 1070 u64 start_pos;
773 u64 last_pos; 1071 u64 last_pos;
774 1072
@@ -776,21 +1074,33 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
776 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; 1074 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
777 1075
778 if (start_pos > inode->i_size) { 1076 if (start_pos > inode->i_size) {
779 err = btrfs_cont_expand(inode, start_pos); 1077 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
780 if (err) 1078 if (err)
781 return err; 1079 return err;
782 } 1080 }
783 1081
784 memset(pages, 0, num_pages * sizeof(struct page *));
785again: 1082again:
786 for (i = 0; i < num_pages; i++) { 1083 for (i = 0; i < num_pages; i++) {
787 pages[i] = grab_cache_page(inode->i_mapping, index + i); 1084 pages[i] = grab_cache_page(inode->i_mapping, index + i);
788 if (!pages[i]) { 1085 if (!pages[i]) {
1086 faili = i - 1;
789 err = -ENOMEM; 1087 err = -ENOMEM;
790 BUG_ON(1); 1088 goto fail;
1089 }
1090
1091 if (i == 0)
1092 err = prepare_uptodate_page(pages[i], pos);
1093 if (i == num_pages - 1)
1094 err = prepare_uptodate_page(pages[i],
1095 pos + write_bytes);
1096 if (err) {
1097 page_cache_release(pages[i]);
1098 faili = i - 1;
1099 goto fail;
791 } 1100 }
792 wait_on_page_writeback(pages[i]); 1101 wait_on_page_writeback(pages[i]);
793 } 1102 }
1103 err = 0;
794 if (start_pos < inode->i_size) { 1104 if (start_pos < inode->i_size) {
795 struct btrfs_ordered_extent *ordered; 1105 struct btrfs_ordered_extent *ordered;
796 lock_extent_bits(&BTRFS_I(inode)->io_tree, 1106 lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -830,199 +1140,264 @@ again:
830 WARN_ON(!PageLocked(pages[i])); 1140 WARN_ON(!PageLocked(pages[i]));
831 } 1141 }
832 return 0; 1142 return 0;
1143fail:
1144 while (faili >= 0) {
1145 unlock_page(pages[faili]);
1146 page_cache_release(pages[faili]);
1147 faili--;
1148 }
1149 return err;
1150
833} 1151}
834 1152
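The new fail: path releases only the pages that were actually acquired, walking faili backwards instead of hitting the old BUG_ON(). The same unwind shape in a self-contained sketch, with malloc/free standing in for grab_cache_page and unlock/release:

#include <stdlib.h>

static int grab_all(void **slots, int n)
{
	int i, faili = -1;

	for (i = 0; i < n; i++) {
		slots[i] = malloc(64);	/* stands in for grab_cache_page() */
		if (!slots[i]) {
			faili = i - 1;
			goto fail;
		}
	}
	return 0;
fail:
	/* release only what we actually acquired, newest first */
	while (faili >= 0) {
		free(slots[faili]);	/* unlock_page + page_cache_release */
		slots[faili] = NULL;
		faili--;
	}
	return -1;
}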
835static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1153static noinline ssize_t __btrfs_buffered_write(struct file *file,
836 const struct iovec *iov, 1154 struct iov_iter *i,
837 unsigned long nr_segs, loff_t pos) 1155 loff_t pos)
838{ 1156{
839 struct file *file = iocb->ki_filp;
840 struct inode *inode = fdentry(file)->d_inode; 1157 struct inode *inode = fdentry(file)->d_inode;
841 struct btrfs_root *root = BTRFS_I(inode)->root; 1158 struct btrfs_root *root = BTRFS_I(inode)->root;
842 struct page *pinned[2];
843 struct page **pages = NULL; 1159 struct page **pages = NULL;
844 struct iov_iter i;
845 loff_t *ppos = &iocb->ki_pos;
846 loff_t start_pos;
847 ssize_t num_written = 0;
848 ssize_t err = 0;
849 size_t count;
850 size_t ocount;
851 int ret = 0;
852 int nrptrs;
853 unsigned long first_index; 1160 unsigned long first_index;
854 unsigned long last_index; 1161 unsigned long last_index;
855 int will_write; 1162 size_t num_written = 0;
856 int buffered = 0; 1163 int nrptrs;
1164 int ret = 0;
857 1165
858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 1166 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
859 (file->f_flags & O_DIRECT)); 1167 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1168 (sizeof(struct page *)));
1169 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1170 if (!pages)
1171 return -ENOMEM;
860 1172
861 pinned[0] = NULL; 1173 first_index = pos >> PAGE_CACHE_SHIFT;
862 pinned[1] = NULL; 1174 last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
863 1175
864 start_pos = pos; 1176 while (iov_iter_count(i) > 0) {
1177 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1178 size_t write_bytes = min(iov_iter_count(i),
1179 nrptrs * (size_t)PAGE_CACHE_SIZE -
1180 offset);
1181 size_t num_pages = (write_bytes + offset +
1182 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1183 size_t dirty_pages;
1184 size_t copied;
865 1185
866 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 1186 WARN_ON(num_pages > nrptrs);
867 1187
868 mutex_lock(&inode->i_mutex); 1188 /*
1189 * Fault pages before locking them in prepare_pages
1190 * to avoid recursive lock
1191 */
1192 if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1193 ret = -EFAULT;
1194 break;
1195 }
869 1196
870 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 1197 ret = btrfs_delalloc_reserve_space(inode,
871 if (err) 1198 num_pages << PAGE_CACHE_SHIFT);
872 goto out; 1199 if (ret)
873 count = ocount; 1200 break;
874 1201
875 current->backing_dev_info = inode->i_mapping->backing_dev_info; 1202 /*
876 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1203 * This is going to set up the pages array with the number of
877 if (err) 1204 * pages we want, so we don't really need to worry about the
878 goto out; 1205 * contents of pages from loop to loop
1206 */
1207 ret = prepare_pages(root, file, pages, num_pages,
1208 pos, first_index, last_index,
1209 write_bytes);
1210 if (ret) {
1211 btrfs_delalloc_release_space(inode,
1212 num_pages << PAGE_CACHE_SHIFT);
1213 break;
1214 }
879 1215
880 if (count == 0) 1216 copied = btrfs_copy_from_user(pos, num_pages,
881 goto out; 1217 write_bytes, pages, i);
882 1218
883 err = file_remove_suid(file); 1219 /*
884 if (err) 1220 * if we have trouble faulting in the pages, fall
885 goto out; 1221 * back to one page at a time
1222 */
1223 if (copied < write_bytes)
1224 nrptrs = 1;
886 1225
887 file_update_time(file); 1226 if (copied == 0)
888 BTRFS_I(inode)->sequence++; 1227 dirty_pages = 0;
1228 else
1229 dirty_pages = (copied + offset +
1230 PAGE_CACHE_SIZE - 1) >>
1231 PAGE_CACHE_SHIFT;
889 1232
890 if (unlikely(file->f_flags & O_DIRECT)) {
891 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
892 pos, ppos, count,
893 ocount);
894 /* 1233 /*
895 * the generic O_DIRECT will update in-memory i_size after the 1234 * If we had a short copy we need to release the excess delalloc
896 * DIOs are done. But our endio handlers that update the on 1235 * bytes we reserved. We need to increment outstanding_extents
897 * disk i_size never update past the in memory i_size. So we 1236 * because btrfs_delalloc_release_space will decrement it, but
898 * need one more update here to catch any additions to the 1237 * we still have an outstanding extent for the chunk we actually
899 * file 1238 * managed to copy.
900 */ 1239 */
901 if (inode->i_size != BTRFS_I(inode)->disk_i_size) { 1240 if (num_pages > dirty_pages) {
902 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 1241 if (copied > 0)
903 mark_inode_dirty(inode); 1242 atomic_inc(
1243 &BTRFS_I(inode)->outstanding_extents);
1244 btrfs_delalloc_release_space(inode,
1245 (num_pages - dirty_pages) <<
1246 PAGE_CACHE_SHIFT);
904 } 1247 }
905 1248
906 if (num_written < 0) { 1249 if (copied > 0) {
907 ret = num_written; 1250 ret = btrfs_dirty_pages(root, inode, pages,
908 num_written = 0; 1251 dirty_pages, pos, copied,
909 goto out; 1252 NULL);
910 } else if (num_written == count) { 1253 if (ret) {
911 /* pick up pos changes done by the generic code */ 1254 btrfs_delalloc_release_space(inode,
912 pos = *ppos; 1255 dirty_pages << PAGE_CACHE_SHIFT);
913 goto out; 1256 btrfs_drop_pages(pages, num_pages);
1257 break;
1258 }
914 } 1259 }
915 /* 1260
916 * We are going to do buffered for the rest of the range, so we 1261 btrfs_drop_pages(pages, num_pages);
917 * need to make sure to invalidate the buffered pages when we're 1262
918 * done. 1263 cond_resched();
919 */ 1264
920 buffered = 1; 1265 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
921 pos += num_written; 1266 dirty_pages);
1267 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1268 btrfs_btree_balance_dirty(root, 1);
1269 btrfs_throttle(root);
1270
1271 pos += copied;
1272 num_written += copied;
922 } 1273 }
923 1274
924 iov_iter_init(&i, iov, nr_segs, count, num_written); 1275 kfree(pages);
925 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
926 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
927 (sizeof(struct page *)));
928 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
929 1276
930 /* generic_write_checks can change our pos */ 1277 return num_written ? num_written : ret;
931 start_pos = pos; 1278}
932 1279
933 first_index = pos >> PAGE_CACHE_SHIFT; 1280static ssize_t __btrfs_direct_write(struct kiocb *iocb,
934 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; 1281 const struct iovec *iov,
1282 unsigned long nr_segs, loff_t pos,
1283 loff_t *ppos, size_t count, size_t ocount)
1284{
1285 struct file *file = iocb->ki_filp;
1286 struct inode *inode = fdentry(file)->d_inode;
1287 struct iov_iter i;
1288 ssize_t written;
1289 ssize_t written_buffered;
1290 loff_t endbyte;
1291 int err;
1292
1293 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
1294 count, ocount);
935 1295
936 /* 1296 /*
937 * there are lots of better ways to do this, but this code 1297 * the generic O_DIRECT will update in-memory i_size after the
938 * makes sure the first and last page in the file range are 1298 * DIOs are done. But our endio handlers that update the on
939 * up to date and ready for cow 1299 * disk i_size never update past the in memory i_size. So we
1300 * need one more update here to catch any additions to the
1301 * file
940 */ 1302 */
941 if ((pos & (PAGE_CACHE_SIZE - 1))) { 1303 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
942 pinned[0] = grab_cache_page(inode->i_mapping, first_index); 1304 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
943 if (!PageUptodate(pinned[0])) { 1305 mark_inode_dirty(inode);
944 ret = btrfs_readpage(NULL, pinned[0]);
945 BUG_ON(ret);
946 wait_on_page_locked(pinned[0]);
947 } else {
948 unlock_page(pinned[0]);
949 }
950 } 1306 }
951 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { 1307
952 pinned[1] = grab_cache_page(inode->i_mapping, last_index); 1308 if (written < 0 || written == count)
953 if (!PageUptodate(pinned[1])) { 1309 return written;
954 ret = btrfs_readpage(NULL, pinned[1]); 1310
955 BUG_ON(ret); 1311 pos += written;
956 wait_on_page_locked(pinned[1]); 1312 count -= written;
957 } else { 1313 iov_iter_init(&i, iov, nr_segs, count, written);
958 unlock_page(pinned[1]); 1314 written_buffered = __btrfs_buffered_write(file, &i, pos);
959 } 1315 if (written_buffered < 0) {
1316 err = written_buffered;
1317 goto out;
960 } 1318 }
1319 endbyte = pos + written_buffered - 1;
1320 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
1321 if (err)
1322 goto out;
1323 written += written_buffered;
1324 *ppos = pos + written_buffered;
1325 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1326 endbyte >> PAGE_CACHE_SHIFT);
1327out:
1328 return written ? written : err;
1329}
961 1330
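__btrfs_direct_write mirrors what an application sees from the other side: O_DIRECT transfers need aligned buffers, and whatever cannot be written directly falls back to the buffered path and is then flushed and invalidated. A minimal sketch of an aligned direct write from userspace (sizes and names are illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int direct_write_block(const char *path, const char *msg)
{
	void *buf;
	int fd, ret = -1;

	/* O_DIRECT wants block-aligned memory and lengths */
	if (posix_memalign(&buf, 4096, 4096))
		return -1;
	memset(buf, 0, 4096);
	strncpy(buf, msg, 4095);

	fd = open(path, O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd >= 0) {
		if (write(fd, buf, 4096) == 4096)
			ret = 0;
		close(fd);
	}
	free(buf);
	return ret;
}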
962 while (iov_iter_count(&i) > 0) { 1331static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
963 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1332 const struct iovec *iov,
964 size_t write_bytes = min(iov_iter_count(&i), 1333 unsigned long nr_segs, loff_t pos)
965 nrptrs * (size_t)PAGE_CACHE_SIZE - 1334{
966 offset); 1335 struct file *file = iocb->ki_filp;
967 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 1336 struct inode *inode = fdentry(file)->d_inode;
968 PAGE_CACHE_SHIFT; 1337 struct btrfs_root *root = BTRFS_I(inode)->root;
1338 loff_t *ppos = &iocb->ki_pos;
1339 ssize_t num_written = 0;
1340 ssize_t err = 0;
1341 size_t count, ocount;
969 1342
970 WARN_ON(num_pages > nrptrs); 1343 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
971 memset(pages, 0, sizeof(struct page *) * nrptrs);
972 1344
973 ret = btrfs_delalloc_reserve_space(inode, write_bytes); 1345 mutex_lock(&inode->i_mutex);
974 if (ret)
975 goto out;
976 1346
977 ret = prepare_pages(root, file, pages, num_pages, 1347 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
978 pos, first_index, last_index, 1348 if (err) {
979 write_bytes); 1349 mutex_unlock(&inode->i_mutex);
980 if (ret) { 1350 goto out;
981 btrfs_delalloc_release_space(inode, write_bytes); 1351 }
982 goto out; 1352 count = ocount;
983 }
984 1353
985 ret = btrfs_copy_from_user(pos, num_pages, 1354 current->backing_dev_info = inode->i_mapping->backing_dev_info;
986 write_bytes, pages, &i); 1355 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
987 if (ret == 0) { 1356 if (err) {
988 dirty_and_release_pages(NULL, root, file, pages, 1357 mutex_unlock(&inode->i_mutex);
989 num_pages, pos, write_bytes); 1358 goto out;
990 } 1359 }
991 1360
992 btrfs_drop_pages(pages, num_pages); 1361 if (count == 0) {
993 if (ret) { 1362 mutex_unlock(&inode->i_mutex);
994 btrfs_delalloc_release_space(inode, write_bytes); 1363 goto out;
995 goto out; 1364 }
996 }
997 1365
998 if (will_write) { 1366 err = file_remove_suid(file);
999 filemap_fdatawrite_range(inode->i_mapping, pos, 1367 if (err) {
1000 pos + write_bytes - 1); 1368 mutex_unlock(&inode->i_mutex);
1001 } else { 1369 goto out;
1002 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1370 }
1003 num_pages);
1004 if (num_pages <
1005 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1006 btrfs_btree_balance_dirty(root, 1);
1007 btrfs_throttle(root);
1008 }
1009 1371
1010 pos += write_bytes; 1372 /*
1011 num_written += write_bytes; 1373 * If BTRFS flips readonly because of an unexpected internal
1374 * error (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1375 * we have to stop this write even though the file was opened
1376 * writable, to keep the FS consistent.
1377 */
1378 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
1379 mutex_unlock(&inode->i_mutex);
1380 err = -EROFS;
1381 goto out;
1382 }
1012 1383
1013 cond_resched(); 1384 file_update_time(file);
1385 BTRFS_I(inode)->sequence++;
1386
1387 if (unlikely(file->f_flags & O_DIRECT)) {
1388 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1389 pos, ppos, count, ocount);
1390 } else {
1391 struct iov_iter i;
1392
1393 iov_iter_init(&i, iov, nr_segs, count, num_written);
1394
1395 num_written = __btrfs_buffered_write(file, &i, pos);
1396 if (num_written > 0)
1397 *ppos = pos + num_written;
1014 } 1398 }
1015out:
1016 mutex_unlock(&inode->i_mutex);
1017 if (ret)
1018 err = ret;
1019 1399
1020 kfree(pages); 1400 mutex_unlock(&inode->i_mutex);
1021 if (pinned[0])
1022 page_cache_release(pinned[0]);
1023 if (pinned[1])
1024 page_cache_release(pinned[1]);
1025 *ppos = pos;
1026 1401
1027 /* 1402 /*
1028 * we want to make sure fsync finds this change 1403 * we want to make sure fsync finds this change
@@ -1037,36 +1412,12 @@ out:
1037 * one running right now. 1412 * one running right now.
1038 */ 1413 */
1039 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1414 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1040 1415 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1041 if (num_written > 0 && will_write) { 1416 err = generic_write_sync(file, pos, num_written);
1042 struct btrfs_trans_handle *trans; 1417 if (err < 0 && num_written > 0)
1043
1044 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1045 if (err)
1046 num_written = err; 1418 num_written = err;
1047
1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1049 trans = btrfs_start_transaction(root, 0);
1050 ret = btrfs_log_dentry_safe(trans, root,
1051 file->f_dentry);
1052 if (ret == 0) {
1053 ret = btrfs_sync_log(trans, root);
1054 if (ret == 0)
1055 btrfs_end_transaction(trans, root);
1056 else
1057 btrfs_commit_transaction(trans, root);
1058 } else if (ret != BTRFS_NO_LOG_SYNC) {
1059 btrfs_commit_transaction(trans, root);
1060 } else {
1061 btrfs_end_transaction(trans, root);
1062 }
1063 }
1064 if (file->f_flags & O_DIRECT && buffered) {
1065 invalidate_mapping_pages(inode->i_mapping,
1066 start_pos >> PAGE_CACHE_SHIFT,
1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1068 }
1069 } 1419 }
1420out:
1070 current->backing_dev_info = NULL; 1421 current->backing_dev_info = NULL;
1071 return num_written ? num_written : err; 1422 return num_written ? num_written : err;
1072} 1423}
@@ -1109,6 +1460,7 @@ int btrfs_sync_file(struct file *file, int datasync)
1109 int ret = 0; 1460 int ret = 0;
1110 struct btrfs_trans_handle *trans; 1461 struct btrfs_trans_handle *trans;
1111 1462
1463 trace_btrfs_sync_file(file, datasync);
1112 1464
1113 /* we wait first, since the writeback may change the inode */ 1465 /* we wait first, since the writeback may change the inode */
1114 root->log_batch++; 1466 root->log_batch++;
@@ -1128,14 +1480,12 @@ int btrfs_sync_file(struct file *file, int datasync)
1128 * the current transaction, we can bail out now without any 1480 * the current transaction, we can bail out now without any
1129 * syncing 1481 * syncing
1130 */ 1482 */
1131 mutex_lock(&root->fs_info->trans_mutex); 1483 smp_mb();
1132 if (BTRFS_I(inode)->last_trans <= 1484 if (BTRFS_I(inode)->last_trans <=
1133 root->fs_info->last_trans_committed) { 1485 root->fs_info->last_trans_committed) {
1134 BTRFS_I(inode)->last_trans = 0; 1486 BTRFS_I(inode)->last_trans = 0;
1135 mutex_unlock(&root->fs_info->trans_mutex);
1136 goto out; 1487 goto out;
1137 } 1488 }
1138 mutex_unlock(&root->fs_info->trans_mutex);
1139 1489
1140 /* 1490 /*
1141 * ok we haven't committed the transaction yet, let's do a commit 1491 * ok we haven't committed the transaction yet, let's do a commit
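Dropping the trans_mutex pair for smp_mb() works because, roughly, the fsync path only needs an ordered read of last_trans_committed against its own last_trans. A C11 sketch of the same ordering, with acquire/release standing in for the kernel barrier and illustrative names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic uint64_t last_trans_committed;

/* committer: everything done before this store is visible to
 * any reader that observes the new transid */
static void publish_commit(uint64_t transid)
{
	atomic_store_explicit(&last_trans_committed, transid,
			      memory_order_release);
}

/* fsync path: if our modification is already committed, skip syncing */
static bool already_committed(uint64_t inode_last_trans)
{
	return inode_last_trans <=
	       atomic_load_explicit(&last_trans_committed,
				    memory_order_acquire);
}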
@@ -1202,6 +1552,118 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1202 return 0; 1552 return 0;
1203} 1553}
1204 1554
1555static long btrfs_fallocate(struct file *file, int mode,
1556 loff_t offset, loff_t len)
1557{
1558 struct inode *inode = file->f_path.dentry->d_inode;
1559 struct extent_state *cached_state = NULL;
1560 u64 cur_offset;
1561 u64 last_byte;
1562 u64 alloc_start;
1563 u64 alloc_end;
1564 u64 alloc_hint = 0;
1565 u64 locked_end;
1566 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1567 struct extent_map *em;
1568 int ret;
1569
1570 alloc_start = offset & ~mask;
1571 alloc_end = (offset + len + mask) & ~mask;
1572
1573 /* We only support the FALLOC_FL_KEEP_SIZE mode */
1574 if (mode & ~FALLOC_FL_KEEP_SIZE)
1575 return -EOPNOTSUPP;
1576
1577 /*
1578 * wait for ordered IO before we have any locks. We'll loop again
1579 * below with the locks held.
1580 */
1581 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
1582
1583 mutex_lock(&inode->i_mutex);
1584 ret = inode_newsize_ok(inode, alloc_end);
1585 if (ret)
1586 goto out;
1587
1588 if (alloc_start > inode->i_size) {
1589 ret = btrfs_cont_expand(inode, i_size_read(inode),
1590 alloc_start);
1591 if (ret)
1592 goto out;
1593 }
1594
1595 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1596 if (ret)
1597 goto out;
1598
1599 locked_end = alloc_end - 1;
1600 while (1) {
1601 struct btrfs_ordered_extent *ordered;
1602
1603 /* the extent lock is ordered inside the running
1604 * transaction
1605 */
1606 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
1607 locked_end, 0, &cached_state, GFP_NOFS);
1608 ordered = btrfs_lookup_first_ordered_extent(inode,
1609 alloc_end - 1);
1610 if (ordered &&
1611 ordered->file_offset + ordered->len > alloc_start &&
1612 ordered->file_offset < alloc_end) {
1613 btrfs_put_ordered_extent(ordered);
1614 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1615 alloc_start, locked_end,
1616 &cached_state, GFP_NOFS);
1617 /*
1618 * we can't wait on the range with the transaction
1619 * running or with the extent lock held
1620 */
1621 btrfs_wait_ordered_range(inode, alloc_start,
1622 alloc_end - alloc_start);
1623 } else {
1624 if (ordered)
1625 btrfs_put_ordered_extent(ordered);
1626 break;
1627 }
1628 }
1629
1630 cur_offset = alloc_start;
1631 while (1) {
1632 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1633 alloc_end - cur_offset, 0);
1634 BUG_ON(IS_ERR_OR_NULL(em));
1635 last_byte = min(extent_map_end(em), alloc_end);
1636 last_byte = (last_byte + mask) & ~mask;
1637 if (em->block_start == EXTENT_MAP_HOLE ||
1638 (cur_offset >= inode->i_size &&
1639 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1640 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1641 last_byte - cur_offset,
1642 1 << inode->i_blkbits,
1643 offset + len,
1644 &alloc_hint);
1645 if (ret < 0) {
1646 free_extent_map(em);
1647 break;
1648 }
1649 }
1650 free_extent_map(em);
1651
1652 cur_offset = last_byte;
1653 if (cur_offset >= alloc_end) {
1654 ret = 0;
1655 break;
1656 }
1657 }
1658 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1659 &cached_state, GFP_NOFS);
1660
1661 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1662out:
1663 mutex_unlock(&inode->i_mutex);
1664 return ret;
1665}
1666
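From userspace this is reached through fallocate(2); since only FALLOC_FL_KEEP_SIZE is accepted here, any other mode bits get -EOPNOTSUPP. A minimal call that preallocates past EOF without growing i_size (offset and length are illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

int prealloc_keep_size(int fd, off_t offset)
{
	/* preallocate 16MiB; i_size is left alone because of KEEP_SIZE */
	return fallocate(fd, FALLOC_FL_KEEP_SIZE, offset, 16 * 1024 * 1024);
}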
1205const struct file_operations btrfs_file_operations = { 1667const struct file_operations btrfs_file_operations = {
1206 .llseek = generic_file_llseek, 1668 .llseek = generic_file_llseek,
1207 .read = do_sync_read, 1669 .read = do_sync_read,
@@ -1213,6 +1675,7 @@ const struct file_operations btrfs_file_operations = {
1213 .open = generic_file_open, 1675 .open = generic_file_open,
1214 .release = btrfs_release_file, 1676 .release = btrfs_release_file,
1215 .fsync = btrfs_sync_file, 1677 .fsync = btrfs_sync_file,
1678 .fallocate = btrfs_fallocate,
1216 .unlocked_ioctl = btrfs_ioctl, 1679 .unlocked_ioctl = btrfs_ioctl,
1217#ifdef CONFIG_COMPAT 1680#ifdef CONFIG_COMPAT
1218 .compat_ioctl = btrfs_ioctl, 1681 .compat_ioctl = btrfs_ioctl,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f488fac04d99..bf0d61567f3d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -23,34 +23,937 @@
23#include "ctree.h" 23#include "ctree.h"
24#include "free-space-cache.h" 24#include "free-space-cache.h"
25#include "transaction.h" 25#include "transaction.h"
26#include "disk-io.h"
27#include "extent_io.h"
28#include "inode-map.h"
26 29
27#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 30#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
28#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 31#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
29 32
30static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, 33static int link_free_space(struct btrfs_free_space_ctl *ctl,
34 struct btrfs_free_space *info);
35
36static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
37 struct btrfs_path *path,
38 u64 offset)
39{
40 struct btrfs_key key;
41 struct btrfs_key location;
42 struct btrfs_disk_key disk_key;
43 struct btrfs_free_space_header *header;
44 struct extent_buffer *leaf;
45 struct inode *inode = NULL;
46 int ret;
47
48 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
49 key.offset = offset;
50 key.type = 0;
51
52 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
53 if (ret < 0)
54 return ERR_PTR(ret);
55 if (ret > 0) {
56 btrfs_release_path(path);
57 return ERR_PTR(-ENOENT);
58 }
59
60 leaf = path->nodes[0];
61 header = btrfs_item_ptr(leaf, path->slots[0],
62 struct btrfs_free_space_header);
63 btrfs_free_space_key(leaf, header, &disk_key);
64 btrfs_disk_key_to_cpu(&location, &disk_key);
65 btrfs_release_path(path);
66
67 inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
68 if (!inode)
69 return ERR_PTR(-ENOENT);
70 if (IS_ERR(inode))
71 return inode;
72 if (is_bad_inode(inode)) {
73 iput(inode);
74 return ERR_PTR(-ENOENT);
75 }
76
77 inode->i_mapping->flags &= ~__GFP_FS;
78
79 return inode;
80}
81
82struct inode *lookup_free_space_inode(struct btrfs_root *root,
83 struct btrfs_block_group_cache
84 *block_group, struct btrfs_path *path)
85{
86 struct inode *inode = NULL;
87
88 spin_lock(&block_group->lock);
89 if (block_group->inode)
90 inode = igrab(block_group->inode);
91 spin_unlock(&block_group->lock);
92 if (inode)
93 return inode;
94
95 inode = __lookup_free_space_inode(root, path,
96 block_group->key.objectid);
97 if (IS_ERR(inode))
98 return inode;
99
100 spin_lock(&block_group->lock);
101 if (!btrfs_fs_closing(root->fs_info)) {
102 block_group->inode = igrab(inode);
103 block_group->iref = 1;
104 }
105 spin_unlock(&block_group->lock);
106
107 return inode;
108}
109
110int __create_free_space_inode(struct btrfs_root *root,
111 struct btrfs_trans_handle *trans,
112 struct btrfs_path *path, u64 ino, u64 offset)
113{
114 struct btrfs_key key;
115 struct btrfs_disk_key disk_key;
116 struct btrfs_free_space_header *header;
117 struct btrfs_inode_item *inode_item;
118 struct extent_buffer *leaf;
119 int ret;
120
121 ret = btrfs_insert_empty_inode(trans, root, path, ino);
122 if (ret)
123 return ret;
124
125 leaf = path->nodes[0];
126 inode_item = btrfs_item_ptr(leaf, path->slots[0],
127 struct btrfs_inode_item);
128 btrfs_item_key(leaf, &disk_key, path->slots[0]);
129 memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
130 sizeof(*inode_item));
131 btrfs_set_inode_generation(leaf, inode_item, trans->transid);
132 btrfs_set_inode_size(leaf, inode_item, 0);
133 btrfs_set_inode_nbytes(leaf, inode_item, 0);
134 btrfs_set_inode_uid(leaf, inode_item, 0);
135 btrfs_set_inode_gid(leaf, inode_item, 0);
136 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
137 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
138 BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
139 btrfs_set_inode_nlink(leaf, inode_item, 1);
140 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
141 btrfs_set_inode_block_group(leaf, inode_item, offset);
142 btrfs_mark_buffer_dirty(leaf);
143 btrfs_release_path(path);
144
145 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
146 key.offset = offset;
147 key.type = 0;
148
149 ret = btrfs_insert_empty_item(trans, root, path, &key,
150 sizeof(struct btrfs_free_space_header));
151 if (ret < 0) {
152 btrfs_release_path(path);
153 return ret;
154 }
155 leaf = path->nodes[0];
156 header = btrfs_item_ptr(leaf, path->slots[0],
157 struct btrfs_free_space_header);
158 memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
159 btrfs_set_free_space_key(leaf, header, &disk_key);
160 btrfs_mark_buffer_dirty(leaf);
161 btrfs_release_path(path);
162
163 return 0;
164}
165
166int create_free_space_inode(struct btrfs_root *root,
167 struct btrfs_trans_handle *trans,
168 struct btrfs_block_group_cache *block_group,
169 struct btrfs_path *path)
170{
171 int ret;
172 u64 ino;
173
174 ret = btrfs_find_free_objectid(root, &ino);
175 if (ret < 0)
176 return ret;
177
178 return __create_free_space_inode(root, trans, path, ino,
179 block_group->key.objectid);
180}
181
182int btrfs_truncate_free_space_cache(struct btrfs_root *root,
183 struct btrfs_trans_handle *trans,
184 struct btrfs_path *path,
185 struct inode *inode)
186{
187 loff_t oldsize;
188 int ret = 0;
189
190 trans->block_rsv = root->orphan_block_rsv;
191 ret = btrfs_block_rsv_check(trans, root,
192 root->orphan_block_rsv,
193 0, 5);
194 if (ret)
195 return ret;
196
197 oldsize = i_size_read(inode);
198 btrfs_i_size_write(inode, 0);
199 truncate_pagecache(inode, oldsize, 0);
200
201 /*
202 * We don't need an orphan item because truncating the free space cache
203 * will never be split across transactions.
204 */
205 ret = btrfs_truncate_inode_items(trans, root, inode,
206 0, BTRFS_EXTENT_DATA_KEY);
207 if (ret) {
208 WARN_ON(1);
209 return ret;
210 }
211
212 ret = btrfs_update_inode(trans, root, inode);
213 return ret;
214}
215
216static int readahead_cache(struct inode *inode)
217{
218 struct file_ra_state *ra;
219 unsigned long last_index;
220
221 ra = kzalloc(sizeof(*ra), GFP_NOFS);
222 if (!ra)
223 return -ENOMEM;
224
225 file_ra_state_init(ra, inode->i_mapping);
226 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
227
228 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
229
230 kfree(ra);
231
232 return 0;
233}
234
235int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
236 struct btrfs_free_space_ctl *ctl,
237 struct btrfs_path *path, u64 offset)
238{
239 struct btrfs_free_space_header *header;
240 struct extent_buffer *leaf;
241 struct page *page;
242 u32 *checksums = NULL, *crc;
243 char *disk_crcs = NULL;
244 struct btrfs_key key;
245 struct list_head bitmaps;
246 u64 num_entries;
247 u64 num_bitmaps;
248 u64 generation;
249 u32 cur_crc = ~(u32)0;
250 pgoff_t index = 0;
251 unsigned long first_page_offset;
252 int num_checksums;
253 int ret = 0;
254
255 INIT_LIST_HEAD(&bitmaps);
256
257 /* Nothing in the space cache, goodbye */
258 if (!i_size_read(inode))
259 goto out;
260
261 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
262 key.offset = offset;
263 key.type = 0;
264
265 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
266 if (ret < 0)
267 goto out;
268 else if (ret > 0) {
269 btrfs_release_path(path);
270 ret = 0;
271 goto out;
272 }
273
274 ret = -1;
275
276 leaf = path->nodes[0];
277 header = btrfs_item_ptr(leaf, path->slots[0],
278 struct btrfs_free_space_header);
279 num_entries = btrfs_free_space_entries(leaf, header);
280 num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
281 generation = btrfs_free_space_generation(leaf, header);
282 btrfs_release_path(path);
283
284 if (BTRFS_I(inode)->generation != generation) {
285 printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
286 " not match free space cache generation (%llu)\n",
287 (unsigned long long)BTRFS_I(inode)->generation,
288 (unsigned long long)generation);
289 goto out;
290 }
291
292 if (!num_entries)
293 goto out;
294
295 /* Setup everything for doing checksumming */
296 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
297 checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
298 if (!checksums)
299 goto out;
300 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
301 disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
302 if (!disk_crcs)
303 goto out;
304
305 ret = readahead_cache(inode);
306 if (ret)
307 goto out;
308
309 while (1) {
310 struct btrfs_free_space_entry *entry;
311 struct btrfs_free_space *e;
312 void *addr;
313 unsigned long offset = 0;
314 unsigned long start_offset = 0;
315 int need_loop = 0;
316
317 if (!num_entries && !num_bitmaps)
318 break;
319
320 if (index == 0) {
321 start_offset = first_page_offset;
322 offset = start_offset;
323 }
324
325 page = grab_cache_page(inode->i_mapping, index);
326 if (!page)
327 goto free_cache;
328
329 if (!PageUptodate(page)) {
330 btrfs_readpage(NULL, page);
331 lock_page(page);
332 if (!PageUptodate(page)) {
333 unlock_page(page);
334 page_cache_release(page);
335 printk(KERN_ERR "btrfs: error reading free "
336 "space cache\n");
337 goto free_cache;
338 }
339 }
340 addr = kmap(page);
341
342 if (index == 0) {
343 u64 *gen;
344
345 memcpy(disk_crcs, addr, first_page_offset);
346 gen = addr + (sizeof(u32) * num_checksums);
347 if (*gen != BTRFS_I(inode)->generation) {
348 printk(KERN_ERR "btrfs: space cache generation"
349 " (%llu) does not match inode (%llu)\n",
350 (unsigned long long)*gen,
351 (unsigned long long)
352 BTRFS_I(inode)->generation);
353 kunmap(page);
354 unlock_page(page);
355 page_cache_release(page);
356 goto free_cache;
357 }
358 crc = (u32 *)disk_crcs;
359 }
360 entry = addr + start_offset;
361
362 /* First lets check our crc before we do anything fun */
363 cur_crc = ~(u32)0;
364 cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
365 PAGE_CACHE_SIZE - start_offset);
366 btrfs_csum_final(cur_crc, (char *)&cur_crc);
367 if (cur_crc != *crc) {
368 printk(KERN_ERR "btrfs: crc mismatch for page %lu\n",
369 index);
370 kunmap(page);
371 unlock_page(page);
372 page_cache_release(page);
373 goto free_cache;
374 }
375 crc++;
376
377 while (1) {
378 if (!num_entries)
379 break;
380
381 need_loop = 1;
382 e = kmem_cache_zalloc(btrfs_free_space_cachep,
383 GFP_NOFS);
384 if (!e) {
385 kunmap(page);
386 unlock_page(page);
387 page_cache_release(page);
388 goto free_cache;
389 }
390
391 e->offset = le64_to_cpu(entry->offset);
392 e->bytes = le64_to_cpu(entry->bytes);
393 if (!e->bytes) {
394 kunmap(page);
395 kmem_cache_free(btrfs_free_space_cachep, e);
396 unlock_page(page);
397 page_cache_release(page);
398 goto free_cache;
399 }
400
401 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
402 spin_lock(&ctl->tree_lock);
403 ret = link_free_space(ctl, e);
404 spin_unlock(&ctl->tree_lock);
405 if (ret) {
406 printk(KERN_ERR "Duplicate entries in "
407 "free space cache, dumping\n");
408 kunmap(page);
409 unlock_page(page);
410 page_cache_release(page);
411 goto free_cache;
412 }
413 } else {
414 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
415 if (!e->bitmap) {
416 kunmap(page);
417 kmem_cache_free(
418 btrfs_free_space_cachep, e);
419 unlock_page(page);
420 page_cache_release(page);
421 goto free_cache;
422 }
423 spin_lock(&ctl->tree_lock);
424 ret = link_free_space(ctl, e);
425 ctl->total_bitmaps++;
426 ctl->op->recalc_thresholds(ctl);
427 spin_unlock(&ctl->tree_lock);
428 if (ret) {
429 printk(KERN_ERR "Duplicate entries in "
430 "free space cache, dumping\n");
431 kunmap(page);
432 unlock_page(page);
433 page_cache_release(page);
434 goto free_cache;
435 }
436 list_add_tail(&e->list, &bitmaps);
437 }
438
439 num_entries--;
440 offset += sizeof(struct btrfs_free_space_entry);
441 if (offset + sizeof(struct btrfs_free_space_entry) >=
442 PAGE_CACHE_SIZE)
443 break;
444 entry++;
445 }
446
447 /*
448 * We read an entry out of this page, we need to move on to the
449 * next page.
450 */
451 if (need_loop) {
452 kunmap(page);
453 goto next;
454 }
455
456 /*
457 * The bitmap contents are stored after all the entries, in the
458 * same order that their bitmap entries were added to the cache.
459 */
460 e = list_entry(bitmaps.next, struct btrfs_free_space, list);
461 list_del_init(&e->list);
462 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
463 kunmap(page);
464 num_bitmaps--;
465next:
466 unlock_page(page);
467 page_cache_release(page);
468 index++;
469 }
470
471 ret = 1;
472out:
473 kfree(checksums);
474 kfree(disk_crcs);
475 return ret;
476free_cache:
477 __btrfs_remove_free_space_cache(ctl);
478 goto out;
479}
480
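btrfs_csum_data() here is crc32c and btrfs_csum_final() just inverts the result, so the per-page checks above can be reproduced with a plain crc32c. A minimal bit-at-a-time sketch (a table-driven or hardware crc32c would be used in practice):

#include <stddef.h>
#include <stdint.h>

/* raw reflected CRC update, Castagnoli polynomial 0x1EDC6F41
 * (0x82F63B78 reversed); no pre/post inversion, like the kernel core */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/* mirrors the sequence above: seed with ~0, checksum the page past
 * the header, then invert (btrfs_csum_final) before comparing with
 * the stored u32 */
static uint32_t page_csum(const void *page_data, size_t start, size_t size)
{
	uint32_t crc = crc32c(~(uint32_t)0,
			      (const uint8_t *)page_data + start,
			      size - start);
	return ~crc;
}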
481int load_free_space_cache(struct btrfs_fs_info *fs_info,
482 struct btrfs_block_group_cache *block_group)
483{
484 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
485 struct btrfs_root *root = fs_info->tree_root;
486 struct inode *inode;
487 struct btrfs_path *path;
488 int ret;
489 bool matched;
490 u64 used = btrfs_block_group_used(&block_group->item);
491
492 /*
493 * If we're unmounting then just return, since this does a search on the
494 * normal root and not the commit root and we could deadlock.
495 */
496 if (btrfs_fs_closing(fs_info))
497 return 0;
498
499 /*
500 * If this block group has been marked to be cleared for one reason or
501 * another then we can't trust the on disk cache, so just return.
502 */
503 spin_lock(&block_group->lock);
504 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
505 spin_unlock(&block_group->lock);
506 return 0;
507 }
508 spin_unlock(&block_group->lock);
509
510 path = btrfs_alloc_path();
511 if (!path)
512 return 0;
513
514 inode = lookup_free_space_inode(root, block_group, path);
515 if (IS_ERR(inode)) {
516 btrfs_free_path(path);
517 return 0;
518 }
519
520 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
521 path, block_group->key.objectid);
522 btrfs_free_path(path);
523 if (ret <= 0)
524 goto out;
525
526 spin_lock(&ctl->tree_lock);
527 matched = (ctl->free_space == (block_group->key.offset - used -
528 block_group->bytes_super));
529 spin_unlock(&ctl->tree_lock);
530
531 if (!matched) {
532 __btrfs_remove_free_space_cache(ctl);
533 printk(KERN_ERR "block group %llu has the wrong amount of free "
534 "space\n", block_group->key.objectid);
535 ret = -1;
536 }
537out:
538 if (ret < 0) {
539 /* This cache is bogus, make sure it gets cleared */
540 spin_lock(&block_group->lock);
541 block_group->disk_cache_state = BTRFS_DC_CLEAR;
542 spin_unlock(&block_group->lock);
543 ret = 0;
544
545 printk(KERN_ERR "btrfs: failed to load free space cache "
546 "for block group %llu\n", block_group->key.objectid);
547 }
548
549 iput(inode);
550 return ret;
551}
552
553int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
554 struct btrfs_free_space_ctl *ctl,
555 struct btrfs_block_group_cache *block_group,
556 struct btrfs_trans_handle *trans,
557 struct btrfs_path *path, u64 offset)
558{
559 struct btrfs_free_space_header *header;
560 struct extent_buffer *leaf;
561 struct rb_node *node;
562 struct list_head *pos, *n;
563 struct page **pages;
564 struct page *page;
565 struct extent_state *cached_state = NULL;
566 struct btrfs_free_cluster *cluster = NULL;
567 struct extent_io_tree *unpin = NULL;
568 struct list_head bitmap_list;
569 struct btrfs_key key;
570 u64 start, end, len;
571 u64 bytes = 0;
572 u32 *crc, *checksums;
573 unsigned long first_page_offset;
574 int index = 0, num_pages = 0;
575 int entries = 0;
576 int bitmaps = 0;
577 int ret = -1;
578 bool next_page = false;
579 bool out_of_space = false;
580
581 INIT_LIST_HEAD(&bitmap_list);
582
583 node = rb_first(&ctl->free_space_offset);
584 if (!node)
585 return 0;
586
587 if (!i_size_read(inode))
588 return -1;
589
590 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
591 PAGE_CACHE_SHIFT;
592
593 /* Since the first page has all of our checksums and our generation we
594 * need to calculate the offset into the page at which we can start writing
595 * our entries.
596 */
597 first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
598
599 filemap_write_and_wait(inode->i_mapping);
600 btrfs_wait_ordered_range(inode, inode->i_size &
601 ~(root->sectorsize - 1), (u64)-1);
602
603 /* make sure we don't overflow that first page */
604 if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) {
605 /* this is really the same as running out of space, where we also return 0 */
606 printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n");
607 ret = 0;
608 goto out_update;
609 }
610
611 /* We need a checksum per page. */
612 crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
613 if (!crc)
614 return -1;
615
616 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
617 if (!pages) {
618 kfree(crc);
619 return -1;
620 }
621
622 /* Get the cluster for this block_group if it exists */
623 if (block_group && !list_empty(&block_group->cluster_list))
624 cluster = list_entry(block_group->cluster_list.next,
625 struct btrfs_free_cluster,
626 block_group_list);
627
628 /*
629 * We shouldn't have switched the pinned extents yet so this is the
630 * right one
631 */
632 unpin = root->fs_info->pinned_extents;
633
634 /*
635 * Lock all pages first so we can lock the extent safely.
636 *
637 * NOTE: Because we hold the ref the entire time we're going to write to
638 * the page, find_get_page should never fail, so we don't do a check
639 * after find_get_page at this point. Just putting this here so people
640 * know and don't freak out.
641 */
642 while (index < num_pages) {
643 page = grab_cache_page(inode->i_mapping, index);
644 if (!page) {
645 int i;
646
647 for (i = 0; i < index; i++) {
648 unlock_page(pages[i]);
649 page_cache_release(pages[i]);
650 }
651 goto out_free;
652 }
653 pages[index] = page;
654 index++;
655 }
656
657 index = 0;
658 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
659 0, &cached_state, GFP_NOFS);
660
661 /*
662 * When searching for pinned extents, we need to start at our start
663 * offset.
664 */
665 if (block_group)
666 start = block_group->key.objectid;
667
668 /* Write out the extent entries */
669 do {
670 struct btrfs_free_space_entry *entry;
671 void *addr;
672 unsigned long offset = 0;
673 unsigned long start_offset = 0;
674
675 next_page = false;
676
677 if (index == 0) {
678 start_offset = first_page_offset;
679 offset = start_offset;
680 }
681
682 if (index >= num_pages) {
683 out_of_space = true;
684 break;
685 }
686
687 page = pages[index];
688
689 addr = kmap(page);
690 entry = addr + start_offset;
691
692 memset(addr, 0, PAGE_CACHE_SIZE);
693 while (node && !next_page) {
694 struct btrfs_free_space *e;
695
696 e = rb_entry(node, struct btrfs_free_space, offset_index);
697 entries++;
698
699 entry->offset = cpu_to_le64(e->offset);
700 entry->bytes = cpu_to_le64(e->bytes);
701 if (e->bitmap) {
702 entry->type = BTRFS_FREE_SPACE_BITMAP;
703 list_add_tail(&e->list, &bitmap_list);
704 bitmaps++;
705 } else {
706 entry->type = BTRFS_FREE_SPACE_EXTENT;
707 }
708 node = rb_next(node);
709 if (!node && cluster) {
710 node = rb_first(&cluster->root);
711 cluster = NULL;
712 }
713 offset += sizeof(struct btrfs_free_space_entry);
714 if (offset + sizeof(struct btrfs_free_space_entry) >=
715 PAGE_CACHE_SIZE)
716 next_page = true;
717 entry++;
718 }
719
720 /*
721 * We want to add any pinned extents to our free space cache
722 * so we don't leak the space
723 */
724 while (block_group && !next_page &&
725 (start < block_group->key.objectid +
726 block_group->key.offset)) {
727 ret = find_first_extent_bit(unpin, start, &start, &end,
728 EXTENT_DIRTY);
729 if (ret) {
730 ret = 0;
731 break;
732 }
733
734 /* This pinned extent is out of our range */
735 if (start >= block_group->key.objectid +
736 block_group->key.offset)
737 break;
738
739 len = block_group->key.objectid +
740 block_group->key.offset - start;
741 len = min(len, end + 1 - start);
742
743 entries++;
744 entry->offset = cpu_to_le64(start);
745 entry->bytes = cpu_to_le64(len);
746 entry->type = BTRFS_FREE_SPACE_EXTENT;
747
748 start = end + 1;
749 offset += sizeof(struct btrfs_free_space_entry);
750 if (offset + sizeof(struct btrfs_free_space_entry) >=
751 PAGE_CACHE_SIZE)
752 next_page = true;
753 entry++;
754 }
755 *crc = ~(u32)0;
756 *crc = btrfs_csum_data(root, addr + start_offset, *crc,
757 PAGE_CACHE_SIZE - start_offset);
758 kunmap(page);
759
760 btrfs_csum_final(*crc, (char *)crc);
761 crc++;
762
763 bytes += PAGE_CACHE_SIZE;
764
765 index++;
766 } while (node || next_page);
767
768 /* Write out the bitmaps */
769 list_for_each_safe(pos, n, &bitmap_list) {
770 void *addr;
771 struct btrfs_free_space *entry =
772 list_entry(pos, struct btrfs_free_space, list);
773
774 if (index >= num_pages) {
775 out_of_space = true;
776 break;
777 }
778 page = pages[index];
779
780 addr = kmap(page);
781 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
782 *crc = ~(u32)0;
783 *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
784 kunmap(page);
785 btrfs_csum_final(*crc, (char *)crc);
786 crc++;
787 bytes += PAGE_CACHE_SIZE;
788
789 list_del_init(&entry->list);
790 index++;
791 }
792
793 if (out_of_space) {
794 btrfs_drop_pages(pages, num_pages);
795 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
796 i_size_read(inode) - 1, &cached_state,
797 GFP_NOFS);
798 ret = 0;
799 goto out_free;
800 }
801
802 /* Zero out the rest of the pages just to make sure */
803 while (index < num_pages) {
804 void *addr;
805
806 page = pages[index];
807 addr = kmap(page);
808 memset(addr, 0, PAGE_CACHE_SIZE);
809 kunmap(page);
810 bytes += PAGE_CACHE_SIZE;
811 index++;
812 }
813
814 /* Write the checksums and trans id to the first page */
815 {
816 void *addr;
817 u64 *gen;
818
819 page = pages[0];
820
821 addr = kmap(page);
822 memcpy(addr, checksums, sizeof(u32) * num_pages);
823 gen = addr + (sizeof(u32) * num_pages);
824 *gen = trans->transid;
825 kunmap(page);
826 }
827
828 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
829 bytes, &cached_state);
830 btrfs_drop_pages(pages, num_pages);
831 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
832 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
833
834 if (ret) {
835 ret = 0;
836 goto out_free;
837 }
838
839 BTRFS_I(inode)->generation = trans->transid;
840
841 filemap_write_and_wait(inode->i_mapping);
842
843 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
844 key.offset = offset;
845 key.type = 0;
846
847 ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
848 if (ret < 0) {
849 ret = -1;
850 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
851 EXTENT_DIRTY | EXTENT_DELALLOC |
852 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
853 goto out_free;
854 }
855 leaf = path->nodes[0];
856 if (ret > 0) {
857 struct btrfs_key found_key;
858 BUG_ON(!path->slots[0]);
859 path->slots[0]--;
860 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
861 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
862 found_key.offset != offset) {
863 ret = -1;
864 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
865 EXTENT_DIRTY | EXTENT_DELALLOC |
866 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
867 GFP_NOFS);
868 btrfs_release_path(path);
869 goto out_free;
870 }
871 }
872 header = btrfs_item_ptr(leaf, path->slots[0],
873 struct btrfs_free_space_header);
874 btrfs_set_free_space_entries(leaf, header, entries);
875 btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
876 btrfs_set_free_space_generation(leaf, header, trans->transid);
877 btrfs_mark_buffer_dirty(leaf);
878 btrfs_release_path(path);
879
880 ret = 1;
881
882out_free:
883 kfree(checksums);
884 kfree(pages);
885
886out_update:
887 if (ret != 1) {
888 invalidate_inode_pages2_range(inode->i_mapping, 0, index);
889 BTRFS_I(inode)->generation = 0;
890 }
891 btrfs_update_inode(trans, root, inode);
892 return ret;
893}
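
The cache file that __btrfs_write_out_cache produces starts with a header
page: one u32 crc per page, then a u64 transaction generation, with the free
space entries beginning at first_page_offset. A sketch of that arithmetic,
assuming 4K pages and a 17-byte entry (two u64s plus a type byte):

#include <stdio.h>
#include <stdint.h>

#define CACHE_PAGE_SIZE 4096

int main(void)
{
        int num_pages = 5; /* hypothetical number of pages in the cache file */

        /* page 0: one crc32c per page, then the u64 generation */
        size_t first_page_offset = sizeof(uint32_t) * num_pages + sizeof(uint64_t);

        printf("entries start at byte %zu of page 0\n", first_page_offset);

        /* the writer's guard: at least one entry must still fit in page 0 */
        if (first_page_offset + 17 >= CACHE_PAGE_SIZE)
                printf("free space cache too big for the crc page\n");
        return 0;
}
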
894
895int btrfs_write_out_cache(struct btrfs_root *root,
896 struct btrfs_trans_handle *trans,
897 struct btrfs_block_group_cache *block_group,
898 struct btrfs_path *path)
899{
900 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
901 struct inode *inode;
902 int ret = 0;
903
904 root = root->fs_info->tree_root;
905
906 spin_lock(&block_group->lock);
907 if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
908 spin_unlock(&block_group->lock);
909 return 0;
910 }
911 spin_unlock(&block_group->lock);
912
913 inode = lookup_free_space_inode(root, block_group, path);
914 if (IS_ERR(inode))
915 return 0;
916
917 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
918 path, block_group->key.objectid);
919 if (ret < 0) {
920 spin_lock(&block_group->lock);
921 block_group->disk_cache_state = BTRFS_DC_ERROR;
922 spin_unlock(&block_group->lock);
923 ret = 0;
924
925 printk(KERN_ERR "btrfs: failed to write free space cache "
926 "for block group %llu\n", block_group->key.objectid);
927 }
928
929 iput(inode);
930 return ret;
931}
932
933static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
31 u64 offset) 934 u64 offset)
32{ 935{
33 BUG_ON(offset < bitmap_start); 936 BUG_ON(offset < bitmap_start);
34 offset -= bitmap_start; 937 offset -= bitmap_start;
35 return (unsigned long)(div64_u64(offset, sectorsize)); 938 return (unsigned long)(div_u64(offset, unit));
36} 939}
37 940
38static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize) 941static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
39{ 942{
40 return (unsigned long)(div64_u64(bytes, sectorsize)); 943 return (unsigned long)(div_u64(bytes, unit));
41} 944}
42 945
43static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group, 946static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
44 u64 offset) 947 u64 offset)
45{ 948{
46 u64 bitmap_start; 949 u64 bitmap_start;
47 u64 bytes_per_bitmap; 950 u64 bytes_per_bitmap;
48 951
49 bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize; 952 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
50 bitmap_start = offset - block_group->key.objectid; 953 bitmap_start = offset - ctl->start;
51 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 954 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
52 bitmap_start *= bytes_per_bitmap; 955 bitmap_start *= bytes_per_bitmap;
53 bitmap_start += block_group->key.objectid; 956 bitmap_start += ctl->start;
54 957
55 return bitmap_start; 958 return bitmap_start;
56} 959}
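
The helpers above are plain unit arithmetic: offset_to_bit turns a byte
offset into a bit index inside one bitmap, and offset_to_bitmap rounds an
offset down to the start of the bitmap window that covers it. A userspace
sketch, assuming a 4K unit and one 4K page per bitmap (32768 bits, so each
bitmap covers 128MB):

#include <stdio.h>
#include <stdint.h>

#define UNIT            4096ULL
#define BITS_PER_BITMAP (4096 * 8)

static unsigned long offset_to_bit(uint64_t bitmap_start, uint64_t offset)
{
        return (unsigned long)((offset - bitmap_start) / UNIT);
}

static uint64_t offset_to_bitmap(uint64_t ctl_start, uint64_t offset)
{
        uint64_t bytes_per_bitmap = BITS_PER_BITMAP * UNIT; /* 128MB */

        /* round down to the window containing 'offset' */
        return ctl_start +
               (offset - ctl_start) / bytes_per_bitmap * bytes_per_bitmap;
}

int main(void)
{
        uint64_t start = 1ULL << 30; /* hypothetical block group start */
        uint64_t off   = start + 200ULL * 1024 * 1024;

        uint64_t bmap = offset_to_bitmap(start, off);
        printf("offset sits in the bitmap at %llu, bit %lu\n",
               (unsigned long long)bmap, offset_to_bit(bmap, off));
        return 0;
}
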
@@ -85,10 +988,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
85 * logically. 988 * logically.
86 */ 989 */
87 if (bitmap) { 990 if (bitmap) {
88 WARN_ON(info->bitmap); 991 if (info->bitmap) {
992 WARN_ON_ONCE(1);
993 return -EEXIST;
994 }
89 p = &(*p)->rb_right; 995 p = &(*p)->rb_right;
90 } else { 996 } else {
91 WARN_ON(!info->bitmap); 997 if (!info->bitmap) {
998 WARN_ON_ONCE(1);
999 return -EEXIST;
1000 }
92 p = &(*p)->rb_left; 1001 p = &(*p)->rb_left;
93 } 1002 }
94 } 1003 }
@@ -108,10 +1017,10 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
108 * offset. 1017 * offset.
109 */ 1018 */
110static struct btrfs_free_space * 1019static struct btrfs_free_space *
111tree_search_offset(struct btrfs_block_group_cache *block_group, 1020tree_search_offset(struct btrfs_free_space_ctl *ctl,
112 u64 offset, int bitmap_only, int fuzzy) 1021 u64 offset, int bitmap_only, int fuzzy)
113{ 1022{
114 struct rb_node *n = block_group->free_space_offset.rb_node; 1023 struct rb_node *n = ctl->free_space_offset.rb_node;
115 struct btrfs_free_space *entry, *prev = NULL; 1024 struct btrfs_free_space *entry, *prev = NULL;
116 1025
117 /* find entry that is closest to the 'offset' */ 1026 /* find entry that is closest to the 'offset' */
@@ -207,8 +1116,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
207 break; 1116 break;
208 } 1117 }
209 } 1118 }
210 if (entry->offset + BITS_PER_BITMAP * 1119 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
211 block_group->sectorsize > offset)
212 return entry; 1120 return entry;
213 } else if (entry->offset + entry->bytes > offset) 1121 } else if (entry->offset + entry->bytes > offset)
214 return entry; 1122 return entry;
@@ -219,7 +1127,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
219 while (1) { 1127 while (1) {
220 if (entry->bitmap) { 1128 if (entry->bitmap) {
221 if (entry->offset + BITS_PER_BITMAP * 1129 if (entry->offset + BITS_PER_BITMAP *
222 block_group->sectorsize > offset) 1130 ctl->unit > offset)
223 break; 1131 break;
224 } else { 1132 } else {
225 if (entry->offset + entry->bytes > offset) 1133 if (entry->offset + entry->bytes > offset)
@@ -234,53 +1142,69 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
234 return entry; 1142 return entry;
235} 1143}
236 1144
237static void unlink_free_space(struct btrfs_block_group_cache *block_group, 1145static inline void
1146__unlink_free_space(struct btrfs_free_space_ctl *ctl,
1147 struct btrfs_free_space *info)
1148{
1149 rb_erase(&info->offset_index, &ctl->free_space_offset);
1150 ctl->free_extents--;
1151}
1152
1153static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
238 struct btrfs_free_space *info) 1154 struct btrfs_free_space *info)
239{ 1155{
240 rb_erase(&info->offset_index, &block_group->free_space_offset); 1156 __unlink_free_space(ctl, info);
241 block_group->free_extents--; 1157 ctl->free_space -= info->bytes;
242 block_group->free_space -= info->bytes;
243} 1158}
244 1159
245static int link_free_space(struct btrfs_block_group_cache *block_group, 1160static int link_free_space(struct btrfs_free_space_ctl *ctl,
246 struct btrfs_free_space *info) 1161 struct btrfs_free_space *info)
247{ 1162{
248 int ret = 0; 1163 int ret = 0;
249 1164
250 BUG_ON(!info->bitmap && !info->bytes); 1165 BUG_ON(!info->bitmap && !info->bytes);
251 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 1166 ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
252 &info->offset_index, (info->bitmap != NULL)); 1167 &info->offset_index, (info->bitmap != NULL));
253 if (ret) 1168 if (ret)
254 return ret; 1169 return ret;
255 1170
256 block_group->free_space += info->bytes; 1171 ctl->free_space += info->bytes;
257 block_group->free_extents++; 1172 ctl->free_extents++;
258 return ret; 1173 return ret;
259} 1174}
260 1175
261static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) 1176static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
262{ 1177{
1178 struct btrfs_block_group_cache *block_group = ctl->private;
263 u64 max_bytes; 1179 u64 max_bytes;
264 u64 bitmap_bytes; 1180 u64 bitmap_bytes;
265 u64 extent_bytes; 1181 u64 extent_bytes;
1182 u64 size = block_group->key.offset;
1183 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
1184 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1185
1186 BUG_ON(ctl->total_bitmaps > max_bitmaps);
266 1187
267 /* 1188 /*
268 * The goal is to keep the total amount of memory used per 1gb of space 1189 * The goal is to keep the total amount of memory used per 1gb of space
269 * at or below 32k, so we need to adjust how much memory we allow to be 1190 * at or below 32k, so we need to adjust how much memory we allow to be
270 * used by extent based free space tracking 1191 * used by extent based free space tracking
271 */ 1192 */
272 max_bytes = MAX_CACHE_BYTES_PER_GIG * 1193 if (size < 1024 * 1024 * 1024)
273 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); 1194 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1195 else
1196 max_bytes = MAX_CACHE_BYTES_PER_GIG *
1197 div64_u64(size, 1024 * 1024 * 1024);
274 1198
275 /* 1199 /*
276 * we want to account for 1 more bitmap than what we have so we can make 1200 * we want to account for 1 more bitmap than what we have so we can make
277 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as 1201 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
278 * we add more bitmaps. 1202 * we add more bitmaps.
279 */ 1203 */
280 bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE; 1204 bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
281 1205
282 if (bitmap_bytes >= max_bytes) { 1206 if (bitmap_bytes >= max_bytes) {
283 block_group->extents_thresh = 0; 1207 ctl->extents_thresh = 0;
284 return; 1208 return;
285 } 1209 }
286 1210
@@ -291,47 +1215,43 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
291 extent_bytes = max_bytes - bitmap_bytes; 1215 extent_bytes = max_bytes - bitmap_bytes;
292 extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); 1216 extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
293 1217
294 block_group->extents_thresh = 1218 ctl->extents_thresh =
295 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); 1219 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
296} 1220}
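
recalculate_thresholds splits a per-group memory budget between bitmap pages
and extent entries. A standalone sketch of the math, assuming the kernel's
32K-per-GB budget (MAX_CACHE_BYTES_PER_GIG), 4K pages, and a 64-byte
struct btrfs_free_space; the group size and bitmap count are hypothetical:

#include <stdio.h>
#include <stdint.h>

#define MAX_CACHE_BYTES_PER_GIG (32ULL * 1024)
#define PAGE_BYTES              4096ULL
#define GIG                     (1024ULL * 1024 * 1024)

int main(void)
{
        uint64_t size = 10 * GIG; /* block group size */
        int total_bitmaps = 3;    /* bitmaps currently in use */

        /* small groups still get the full 32K budget */
        uint64_t max_bytes = size < GIG ? MAX_CACHE_BYTES_PER_GIG
                                        : MAX_CACHE_BYTES_PER_GIG * (size / GIG);

        /* reserve room for one more bitmap than we currently have */
        uint64_t bitmap_bytes = (uint64_t)(total_bitmaps + 1) * PAGE_BYTES;
        if (bitmap_bytes >= max_bytes) {
                printf("extents_thresh = 0\n");
                return 0;
        }

        /* the remainder, capped at half the budget, is for extent entries */
        uint64_t extent_bytes = max_bytes - bitmap_bytes;
        if (extent_bytes > max_bytes / 2)
                extent_bytes = max_bytes / 2;

        printf("extents_thresh = %llu entries\n",
               (unsigned long long)(extent_bytes / 64));
        return 0;
}
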
297 1221
298static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, 1222static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
299 struct btrfs_free_space *info, u64 offset, 1223 struct btrfs_free_space *info, u64 offset,
300 u64 bytes) 1224 u64 bytes)
301{ 1225{
302 unsigned long start, end; 1226 unsigned long start, count;
303 unsigned long i;
304 1227
305 start = offset_to_bit(info->offset, block_group->sectorsize, offset); 1228 start = offset_to_bit(info->offset, ctl->unit, offset);
306 end = start + bytes_to_bits(bytes, block_group->sectorsize); 1229 count = bytes_to_bits(bytes, ctl->unit);
307 BUG_ON(end > BITS_PER_BITMAP); 1230 BUG_ON(start + count > BITS_PER_BITMAP);
308 1231
309 for (i = start; i < end; i++) 1232 bitmap_clear(info->bitmap, start, count);
310 clear_bit(i, info->bitmap);
311 1233
312 info->bytes -= bytes; 1234 info->bytes -= bytes;
313 block_group->free_space -= bytes; 1235 ctl->free_space -= bytes;
314} 1236}
315 1237
316static void bitmap_set_bits(struct btrfs_block_group_cache *block_group, 1238static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
317 struct btrfs_free_space *info, u64 offset, 1239 struct btrfs_free_space *info, u64 offset,
318 u64 bytes) 1240 u64 bytes)
319{ 1241{
320 unsigned long start, end; 1242 unsigned long start, count;
321 unsigned long i;
322 1243
323 start = offset_to_bit(info->offset, block_group->sectorsize, offset); 1244 start = offset_to_bit(info->offset, ctl->unit, offset);
324 end = start + bytes_to_bits(bytes, block_group->sectorsize); 1245 count = bytes_to_bits(bytes, ctl->unit);
325 BUG_ON(end > BITS_PER_BITMAP); 1246 BUG_ON(start + count > BITS_PER_BITMAP);
326 1247
327 for (i = start; i < end; i++) 1248 bitmap_set(info->bitmap, start, count);
328 set_bit(i, info->bitmap);
329 1249
330 info->bytes += bytes; 1250 info->bytes += bytes;
331 block_group->free_space += bytes; 1251 ctl->free_space += bytes;
332} 1252}
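
The change above swaps the open-coded set_bit()/clear_bit() loops for the
kernel's bitmap_set()/bitmap_clear() helpers, which work a word at a time.
A naive userspace equivalent with the same effect, bit by bit:

#include <stdio.h>
#include <limits.h>

#define LONG_BITS (sizeof(unsigned long) * CHAR_BIT)

static void bitmap_set(unsigned long *map, unsigned int start, unsigned int count)
{
        for (unsigned int i = start; i < start + count; i++)
                map[i / LONG_BITS] |= 1UL << (i % LONG_BITS);
}

static void bitmap_clear(unsigned long *map, unsigned int start, unsigned int count)
{
        for (unsigned int i = start; i < start + count; i++)
                map[i / LONG_BITS] &= ~(1UL << (i % LONG_BITS));
}

int main(void)
{
        unsigned long map[4] = { 0 };

        bitmap_set(map, 10, 20);  /* 20 units freed starting at bit 10 */
        bitmap_clear(map, 16, 4); /* 4 units allocated back out of the run */
        printf("word 0 = %#lx\n", map[0]);
        return 0;
}
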
333 1253
334static int search_bitmap(struct btrfs_block_group_cache *block_group, 1254static int search_bitmap(struct btrfs_free_space_ctl *ctl,
335 struct btrfs_free_space *bitmap_info, u64 *offset, 1255 struct btrfs_free_space *bitmap_info, u64 *offset,
336 u64 *bytes) 1256 u64 *bytes)
337{ 1257{
@@ -339,9 +1259,9 @@ static int search_bitmap(struct btrfs_block_group_cache *block_group,
339 unsigned long bits, i; 1259 unsigned long bits, i;
340 unsigned long next_zero; 1260 unsigned long next_zero;
341 1261
342 i = offset_to_bit(bitmap_info->offset, block_group->sectorsize, 1262 i = offset_to_bit(bitmap_info->offset, ctl->unit,
343 max_t(u64, *offset, bitmap_info->offset)); 1263 max_t(u64, *offset, bitmap_info->offset));
344 bits = bytes_to_bits(*bytes, block_group->sectorsize); 1264 bits = bytes_to_bits(*bytes, ctl->unit);
345 1265
346 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); 1266 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
347 i < BITS_PER_BITMAP; 1267 i < BITS_PER_BITMAP;
@@ -356,29 +1276,25 @@ static int search_bitmap(struct btrfs_block_group_cache *block_group,
356 } 1276 }
357 1277
358 if (found_bits) { 1278 if (found_bits) {
359 *offset = (u64)(i * block_group->sectorsize) + 1279 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
360 bitmap_info->offset; 1280 *bytes = (u64)(found_bits) * ctl->unit;
361 *bytes = (u64)(found_bits) * block_group->sectorsize;
362 return 0; 1281 return 0;
363 } 1282 }
364 1283
365 return -1; 1284 return -1;
366} 1285}
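
search_bitmap looks for a run of set bits at least as long as the request:
hop to the next set bit, measure the run up to the next zero bit, and stop
at the first run that is big enough. A naive sketch of that walk (the
kernel does the same with the word-at-a-time find_next_bit() and
find_next_zero_bit()):

#include <stdio.h>

#define NBITS 64

static int test_bit(const unsigned char *map, int i)
{
        return map[i / 8] >> (i % 8) & 1;
}

static int search_bitmap(const unsigned char *map, int want, int *found_len)
{
        for (int i = 0; i < NBITS; i++) {
                if (!test_bit(map, i))
                        continue;
                int next_zero = i;
                while (next_zero < NBITS && test_bit(map, next_zero))
                        next_zero++;
                if (next_zero - i >= want) {
                        *found_len = next_zero - i;
                        return i; /* start of a big-enough run */
                }
                i = next_zero; /* skip past this run and keep looking */
        }
        return -1;
}

int main(void)
{
        unsigned char map[NBITS / 8] = { 0 };
        for (int i = 20; i < 29; i++) /* one 9-bit free run */
                map[i / 8] |= 1 << (i % 8);

        int len, start = search_bitmap(map, 8, &len);
        printf("run of %d bits at bit %d\n", len, start);
        return 0;
}
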
367 1286
368static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache 1287static struct btrfs_free_space *
369 *block_group, u64 *offset, 1288find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
370 u64 *bytes, int debug)
371{ 1289{
372 struct btrfs_free_space *entry; 1290 struct btrfs_free_space *entry;
373 struct rb_node *node; 1291 struct rb_node *node;
374 int ret; 1292 int ret;
375 1293
376 if (!block_group->free_space_offset.rb_node) 1294 if (!ctl->free_space_offset.rb_node)
377 return NULL; 1295 return NULL;
378 1296
379 entry = tree_search_offset(block_group, 1297 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
380 offset_to_bitmap(block_group, *offset),
381 0, 1);
382 if (!entry) 1298 if (!entry)
383 return NULL; 1299 return NULL;
384 1300
@@ -388,7 +1304,7 @@ static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
388 continue; 1304 continue;
389 1305
390 if (entry->bitmap) { 1306 if (entry->bitmap) {
391 ret = search_bitmap(block_group, entry, offset, bytes); 1307 ret = search_bitmap(ctl, entry, offset, bytes);
392 if (!ret) 1308 if (!ret)
393 return entry; 1309 return entry;
394 continue; 1310 continue;
@@ -402,23 +1318,28 @@ static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
402 return NULL; 1318 return NULL;
403} 1319}
404 1320
405static void add_new_bitmap(struct btrfs_block_group_cache *block_group, 1321static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
406 struct btrfs_free_space *info, u64 offset) 1322 struct btrfs_free_space *info, u64 offset)
407{ 1323{
408 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; 1324 info->offset = offset_to_bitmap(ctl, offset);
409 int max_bitmaps = (int)div64_u64(block_group->key.offset +
410 bytes_per_bg - 1, bytes_per_bg);
411 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
412
413 info->offset = offset_to_bitmap(block_group, offset);
414 info->bytes = 0; 1325 info->bytes = 0;
415 link_free_space(block_group, info); 1326 link_free_space(ctl, info);
416 block_group->total_bitmaps++; 1327 ctl->total_bitmaps++;
417 1328
418 recalculate_thresholds(block_group); 1329 ctl->op->recalc_thresholds(ctl);
419} 1330}
420 1331
421static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, 1332static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1333 struct btrfs_free_space *bitmap_info)
1334{
1335 unlink_free_space(ctl, bitmap_info);
1336 kfree(bitmap_info->bitmap);
1337 kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1338 ctl->total_bitmaps--;
1339 ctl->op->recalc_thresholds(ctl);
1340}
1341
1342static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
422 struct btrfs_free_space *bitmap_info, 1343 struct btrfs_free_space *bitmap_info,
423 u64 *offset, u64 *bytes) 1344 u64 *offset, u64 *bytes)
424{ 1345{
@@ -427,8 +1348,7 @@ static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_gro
427 int ret; 1348 int ret;
428 1349
429again: 1350again:
430 end = bitmap_info->offset + 1351 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
431 (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
432 1352
433 /* 1353 /*
434 * XXX - this can go away after a few releases. 1354 * XXX - this can go away after a few releases.
@@ -442,29 +1362,23 @@ again:
442 */ 1362 */
443 search_start = *offset; 1363 search_start = *offset;
444 search_bytes = *bytes; 1364 search_bytes = *bytes;
445 ret = search_bitmap(block_group, bitmap_info, &search_start, 1365 search_bytes = min(search_bytes, end - search_start + 1);
446 &search_bytes); 1366 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
447 BUG_ON(ret < 0 || search_start != *offset); 1367 BUG_ON(ret < 0 || search_start != *offset);
448 1368
449 if (*offset > bitmap_info->offset && *offset + *bytes > end) { 1369 if (*offset > bitmap_info->offset && *offset + *bytes > end) {
450 bitmap_clear_bits(block_group, bitmap_info, *offset, 1370 bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1);
451 end - *offset + 1);
452 *bytes -= end - *offset + 1; 1371 *bytes -= end - *offset + 1;
453 *offset = end + 1; 1372 *offset = end + 1;
454 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { 1373 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
455 bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes); 1374 bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes);
456 *bytes = 0; 1375 *bytes = 0;
457 } 1376 }
458 1377
459 if (*bytes) { 1378 if (*bytes) {
460 struct rb_node *next = rb_next(&bitmap_info->offset_index); 1379 struct rb_node *next = rb_next(&bitmap_info->offset_index);
461 if (!bitmap_info->bytes) { 1380 if (!bitmap_info->bytes)
462 unlink_free_space(block_group, bitmap_info); 1381 free_bitmap(ctl, bitmap_info);
463 kfree(bitmap_info->bitmap);
464 kfree(bitmap_info);
465 block_group->total_bitmaps--;
466 recalculate_thresholds(block_group);
467 }
468 1382
469 /* 1383 /*
470 * no entry after this bitmap, but we still have bytes to 1384 * no entry after this bitmap, but we still have bytes to
@@ -491,38 +1405,59 @@ again:
491 */ 1405 */
492 search_start = *offset; 1406 search_start = *offset;
493 search_bytes = *bytes; 1407 search_bytes = *bytes;
494 ret = search_bitmap(block_group, bitmap_info, &search_start, 1408 ret = search_bitmap(ctl, bitmap_info, &search_start,
495 &search_bytes); 1409 &search_bytes);
496 if (ret < 0 || search_start != *offset) 1410 if (ret < 0 || search_start != *offset)
497 return -EAGAIN; 1411 return -EAGAIN;
498 1412
499 goto again; 1413 goto again;
500 } else if (!bitmap_info->bytes) { 1414 } else if (!bitmap_info->bytes)
501 unlink_free_space(block_group, bitmap_info); 1415 free_bitmap(ctl, bitmap_info);
502 kfree(bitmap_info->bitmap);
503 kfree(bitmap_info);
504 block_group->total_bitmaps--;
505 recalculate_thresholds(block_group);
506 }
507 1416
508 return 0; 1417 return 0;
509} 1418}
510 1419
511static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, 1420static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
512 struct btrfs_free_space *info) 1421 struct btrfs_free_space *info, u64 offset,
1422 u64 bytes)
513{ 1423{
514 struct btrfs_free_space *bitmap_info; 1424 u64 bytes_to_set = 0;
515 int added = 0; 1425 u64 end;
516 u64 bytes, offset, end; 1426
517 int ret; 1427 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1428
1429 bytes_to_set = min(end - offset, bytes);
1430
1431 bitmap_set_bits(ctl, info, offset, bytes_to_set);
1432
1433 return bytes_to_set;
1434
1435}
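
add_bytes_to_bitmap can only set bits inside the fixed window one bitmap
covers, so it clamps the insert to min(end - offset, bytes) and the caller
loops (the again: label in insert_into_bitmap below) with the remainder in
the next window. A sketch of that split, assuming 128MB windows:

#include <stdio.h>
#include <stdint.h>

#define BITMAP_BYTES (128ULL * 1024 * 1024) /* BITS_PER_BITMAP * unit */

static uint64_t add_bytes_to_bitmap(uint64_t info_offset, uint64_t offset,
                                    uint64_t bytes)
{
        uint64_t end = info_offset + BITMAP_BYTES;

        /* bitmap_set_bits() would flip the bits here */
        return end - offset < bytes ? end - offset : bytes;
}

int main(void)
{
        uint64_t bitmap_start = 0;
        uint64_t offset = 120ULL * 1024 * 1024; /* insert spans the boundary */
        uint64_t bytes = 16ULL * 1024 * 1024;

        while (bytes) {
                uint64_t set = add_bytes_to_bitmap(bitmap_start, offset, bytes);

                printf("set %lluMB at %lluMB\n",
                       (unsigned long long)(set >> 20),
                       (unsigned long long)(offset >> 20));
                offset += set;
                bytes -= set;
                bitmap_start += BITMAP_BYTES; /* move to the next window */
        }
        return 0;
}
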
1436
1437static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1438 struct btrfs_free_space *info)
1439{
1440 struct btrfs_block_group_cache *block_group = ctl->private;
518 1441
519 /* 1442 /*
520 * If we are below the extents threshold then we can add this as an 1443 * If we are below the extents threshold then we can add this as an
521 * extent, and don't have to deal with the bitmap 1444 * extent, and don't have to deal with the bitmap
522 */ 1445 */
523 if (block_group->free_extents < block_group->extents_thresh && 1446 if (ctl->free_extents < ctl->extents_thresh) {
524 info->bytes > block_group->sectorsize * 4) 1447 /*
525 return 0; 1448 * If this block group has some small extents we don't want to
1449 * use up all of our free slots in the cache with them, we want
1450 * to reserve them for larger extents; however, if we have plenty
1451 * of cache left then go ahead and add them, no sense in adding
1452 * the overhead of a bitmap if we don't have to.
1453 */
1454 if (info->bytes <= block_group->sectorsize * 4) {
1455 if (ctl->free_extents * 2 <= ctl->extents_thresh)
1456 return false;
1457 } else {
1458 return false;
1459 }
1460 }
526 1461
527 /* 1462 /*
528 * some block groups are so tiny they can't be enveloped by a bitmap, so 1463 * some block groups are so tiny they can't be enveloped by a bitmap, so
@@ -530,35 +1465,85 @@ static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
530 */ 1465 */
531 if (BITS_PER_BITMAP * block_group->sectorsize > 1466 if (BITS_PER_BITMAP * block_group->sectorsize >
532 block_group->key.offset) 1467 block_group->key.offset)
533 return 0; 1468 return false;
1469
1470 return true;
1471}
1472
1473static struct btrfs_free_space_op free_space_op = {
1474 .recalc_thresholds = recalculate_thresholds,
1475 .use_bitmap = use_bitmap,
1476};
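
btrfs_free_space_op is a small vtable: the generic ctl code calls
recalc_thresholds and use_bitmap through it, so another user of the same
rb-tree machinery can plug in different policies. A minimal standalone
sketch of the pattern, with made-up policy bodies:

#include <stdio.h>
#include <stdbool.h>

struct ctl;

struct free_space_op {
        void (*recalc_thresholds)(struct ctl *ctl);
        bool (*use_bitmap)(struct ctl *ctl);
};

struct ctl {
        int free_extents;
        int extents_thresh;
        const struct free_space_op *op;
};

static void bg_recalc(struct ctl *ctl)
{
        ctl->extents_thresh = 512; /* stand-in for the real budget math */
}

static bool bg_use_bitmap(struct ctl *ctl)
{
        return ctl->free_extents >= ctl->extents_thresh;
}

static const struct free_space_op block_group_op = {
        .recalc_thresholds = bg_recalc,
        .use_bitmap        = bg_use_bitmap,
};

int main(void)
{
        struct ctl ctl = { .free_extents = 600, .op = &block_group_op };

        ctl.op->recalc_thresholds(&ctl);
        printf("use bitmap: %s\n", ctl.op->use_bitmap(&ctl) ? "yes" : "no");
        return 0;
}
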
1477
1478static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
1479 struct btrfs_free_space *info)
1480{
1481 struct btrfs_free_space *bitmap_info;
1482 struct btrfs_block_group_cache *block_group = NULL;
1483 int added = 0;
1484 u64 bytes, offset, bytes_added;
1485 int ret;
534 1486
535 bytes = info->bytes; 1487 bytes = info->bytes;
536 offset = info->offset; 1488 offset = info->offset;
537 1489
1490 if (!ctl->op->use_bitmap(ctl, info))
1491 return 0;
1492
1493 if (ctl->op == &free_space_op)
1494 block_group = ctl->private;
538again: 1495again:
539 bitmap_info = tree_search_offset(block_group, 1496 /*
540 offset_to_bitmap(block_group, offset), 1497 * Since we link bitmaps right into the cluster we need to see if we
1498 * have a cluster here, and if so and it has our bitmap we need to add
1499 * the free space to that bitmap.
1500 */
1501 if (block_group && !list_empty(&block_group->cluster_list)) {
1502 struct btrfs_free_cluster *cluster;
1503 struct rb_node *node;
1504 struct btrfs_free_space *entry;
1505
1506 cluster = list_entry(block_group->cluster_list.next,
1507 struct btrfs_free_cluster,
1508 block_group_list);
1509 spin_lock(&cluster->lock);
1510 node = rb_first(&cluster->root);
1511 if (!node) {
1512 spin_unlock(&cluster->lock);
1513 goto no_cluster_bitmap;
1514 }
1515
1516 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1517 if (!entry->bitmap) {
1518 spin_unlock(&cluster->lock);
1519 goto no_cluster_bitmap;
1520 }
1521
1522 if (entry->offset == offset_to_bitmap(ctl, offset)) {
1523 bytes_added = add_bytes_to_bitmap(ctl, entry,
1524 offset, bytes);
1525 bytes -= bytes_added;
1526 offset += bytes_added;
1527 }
1528 spin_unlock(&cluster->lock);
1529 if (!bytes) {
1530 ret = 1;
1531 goto out;
1532 }
1533 }
1534
1535no_cluster_bitmap:
1536 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
541 1, 0); 1537 1, 0);
542 if (!bitmap_info) { 1538 if (!bitmap_info) {
543 BUG_ON(added); 1539 BUG_ON(added);
544 goto new_bitmap; 1540 goto new_bitmap;
545 } 1541 }
546 1542
547 end = bitmap_info->offset + 1543 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
548 (u64)(BITS_PER_BITMAP * block_group->sectorsize); 1544 bytes -= bytes_added;
549 1545 offset += bytes_added;
550 if (offset >= bitmap_info->offset && offset + bytes > end) { 1546 added = 0;
551 bitmap_set_bits(block_group, bitmap_info, offset,
552 end - offset);
553 bytes -= end - offset;
554 offset = end;
555 added = 0;
556 } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
557 bitmap_set_bits(block_group, bitmap_info, offset, bytes);
558 bytes = 0;
559 } else {
560 BUG();
561 }
562 1547
563 if (!bytes) { 1548 if (!bytes) {
564 ret = 1; 1549 ret = 1;
@@ -568,19 +1553,19 @@ again:
568 1553
569new_bitmap: 1554new_bitmap:
570 if (info && info->bitmap) { 1555 if (info && info->bitmap) {
571 add_new_bitmap(block_group, info, offset); 1556 add_new_bitmap(ctl, info, offset);
572 added = 1; 1557 added = 1;
573 info = NULL; 1558 info = NULL;
574 goto again; 1559 goto again;
575 } else { 1560 } else {
576 spin_unlock(&block_group->tree_lock); 1561 spin_unlock(&ctl->tree_lock);
577 1562
578 /* no pre-allocated info, allocate a new one */ 1563 /* no pre-allocated info, allocate a new one */
579 if (!info) { 1564 if (!info) {
580 info = kzalloc(sizeof(struct btrfs_free_space), 1565 info = kmem_cache_zalloc(btrfs_free_space_cachep,
581 GFP_NOFS); 1566 GFP_NOFS);
582 if (!info) { 1567 if (!info) {
583 spin_lock(&block_group->tree_lock); 1568 spin_lock(&ctl->tree_lock);
584 ret = -ENOMEM; 1569 ret = -ENOMEM;
585 goto out; 1570 goto out;
586 } 1571 }
@@ -588,7 +1573,7 @@ new_bitmap:
588 1573
589 /* allocate the bitmap */ 1574 /* allocate the bitmap */
590 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 1575 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
591 spin_lock(&block_group->tree_lock); 1576 spin_lock(&ctl->tree_lock);
592 if (!info->bitmap) { 1577 if (!info->bitmap) {
593 ret = -ENOMEM; 1578 ret = -ENOMEM;
594 goto out; 1579 goto out;
@@ -600,77 +1585,94 @@ out:
600 if (info) { 1585 if (info) {
601 if (info->bitmap) 1586 if (info->bitmap)
602 kfree(info->bitmap); 1587 kfree(info->bitmap);
603 kfree(info); 1588 kmem_cache_free(btrfs_free_space_cachep, info);
604 } 1589 }
605 1590
606 return ret; 1591 return ret;
607} 1592}
608 1593
609int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 1594static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
610 u64 offset, u64 bytes) 1595 struct btrfs_free_space *info, bool update_stat)
611{ 1596{
612 struct btrfs_free_space *right_info = NULL; 1597 struct btrfs_free_space *left_info;
613 struct btrfs_free_space *left_info = NULL; 1598 struct btrfs_free_space *right_info;
614 struct btrfs_free_space *info = NULL; 1599 bool merged = false;
615 int ret = 0; 1600 u64 offset = info->offset;
616 1601 u64 bytes = info->bytes;
617 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
618 if (!info)
619 return -ENOMEM;
620
621 info->offset = offset;
622 info->bytes = bytes;
623
624 spin_lock(&block_group->tree_lock);
625 1602
626 /* 1603 /*
627 * first we want to see if there is free space adjacent to the range we 1604 * first we want to see if there is free space adjacent to the range we
628 * are adding, if there is remove that struct and add a new one to 1605 * are adding, if there is remove that struct and add a new one to
629 * cover the entire range 1606 * cover the entire range
630 */ 1607 */
631 right_info = tree_search_offset(block_group, offset + bytes, 0, 0); 1608 right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
632 if (right_info && rb_prev(&right_info->offset_index)) 1609 if (right_info && rb_prev(&right_info->offset_index))
633 left_info = rb_entry(rb_prev(&right_info->offset_index), 1610 left_info = rb_entry(rb_prev(&right_info->offset_index),
634 struct btrfs_free_space, offset_index); 1611 struct btrfs_free_space, offset_index);
635 else 1612 else
636 left_info = tree_search_offset(block_group, offset - 1, 0, 0); 1613 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
637
638 /*
639 * If there was no extent directly to the left or right of this new
640 * extent then we know we're going to have to allocate a new extent, so
641 * before we do that see if we need to drop this into a bitmap
642 */
643 if ((!left_info || left_info->bitmap) &&
644 (!right_info || right_info->bitmap)) {
645 ret = insert_into_bitmap(block_group, info);
646
647 if (ret < 0) {
648 goto out;
649 } else if (ret) {
650 ret = 0;
651 goto out;
652 }
653 }
654 1614
655 if (right_info && !right_info->bitmap) { 1615 if (right_info && !right_info->bitmap) {
656 unlink_free_space(block_group, right_info); 1616 if (update_stat)
1617 unlink_free_space(ctl, right_info);
1618 else
1619 __unlink_free_space(ctl, right_info);
657 info->bytes += right_info->bytes; 1620 info->bytes += right_info->bytes;
658 kfree(right_info); 1621 kmem_cache_free(btrfs_free_space_cachep, right_info);
1622 merged = true;
659 } 1623 }
660 1624
661 if (left_info && !left_info->bitmap && 1625 if (left_info && !left_info->bitmap &&
662 left_info->offset + left_info->bytes == offset) { 1626 left_info->offset + left_info->bytes == offset) {
663 unlink_free_space(block_group, left_info); 1627 if (update_stat)
1628 unlink_free_space(ctl, left_info);
1629 else
1630 __unlink_free_space(ctl, left_info);
664 info->offset = left_info->offset; 1631 info->offset = left_info->offset;
665 info->bytes += left_info->bytes; 1632 info->bytes += left_info->bytes;
666 kfree(left_info); 1633 kmem_cache_free(btrfs_free_space_cachep, left_info);
1634 merged = true;
667 } 1635 }
668 1636
669 ret = link_free_space(block_group, info); 1637 return merged;
1638}
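
try_merge_free_space absorbs an immediate right neighbor (one starting
exactly at offset + bytes) and an immediate left neighbor (one ending
exactly at offset) into the entry being added. A sketch of just the
interval arithmetic, with plain structs standing in for the rb-tree
lookups:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct space { uint64_t offset, bytes; bool present; };

static void try_merge(struct space *info, struct space *left,
                      struct space *right)
{
        if (right->present && right->offset == info->offset + info->bytes) {
                info->bytes += right->bytes; /* unlink and absorb right */
                right->present = false;
        }
        if (left->present && left->offset + left->bytes == info->offset) {
                info->offset = left->offset; /* unlink and absorb left */
                info->bytes += left->bytes;
                left->present = false;
        }
}

int main(void)
{
        struct space left  = { 0,     4096, true };
        struct space info  = { 4096,  8192, true };
        struct space right = { 12288, 4096, true };

        try_merge(&info, &left, &right);
        printf("merged extent: offset=%llu bytes=%llu\n",
               (unsigned long long)info.offset,
               (unsigned long long)info.bytes);
        return 0;
}
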
1639
1640int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
1641 u64 offset, u64 bytes)
1642{
1643 struct btrfs_free_space *info;
1644 int ret = 0;
1645
1646 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
1647 if (!info)
1648 return -ENOMEM;
1649
1650 info->offset = offset;
1651 info->bytes = bytes;
1652
1653 spin_lock(&ctl->tree_lock);
1654
1655 if (try_merge_free_space(ctl, info, true))
1656 goto link;
1657
1658 /*
1659 * If there was no extent directly to the left or right of this new
1660 * extent then we know we're going to have to allocate a new extent, so
1661 * before we do that see if we need to drop this into a bitmap
1662 */
1663 ret = insert_into_bitmap(ctl, info);
1664 if (ret < 0) {
1665 goto out;
1666 } else if (ret) {
1667 ret = 0;
1668 goto out;
1669 }
1670link:
1671 ret = link_free_space(ctl, info);
670 if (ret) 1672 if (ret)
671 kfree(info); 1673 kmem_cache_free(btrfs_free_space_cachep, info);
672out: 1674out:
673 spin_unlock(&block_group->tree_lock); 1675 spin_unlock(&ctl->tree_lock);
674 1676
675 if (ret) { 1677 if (ret) {
676 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); 1678 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
@@ -683,21 +1685,21 @@ out:
683int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 1685int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
684 u64 offset, u64 bytes) 1686 u64 offset, u64 bytes)
685{ 1687{
1688 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
686 struct btrfs_free_space *info; 1689 struct btrfs_free_space *info;
687 struct btrfs_free_space *next_info = NULL; 1690 struct btrfs_free_space *next_info = NULL;
688 int ret = 0; 1691 int ret = 0;
689 1692
690 spin_lock(&block_group->tree_lock); 1693 spin_lock(&ctl->tree_lock);
691 1694
692again: 1695again:
693 info = tree_search_offset(block_group, offset, 0, 0); 1696 info = tree_search_offset(ctl, offset, 0, 0);
694 if (!info) { 1697 if (!info) {
695 /* 1698 /*
696 * oops didn't find an extent that matched the space we wanted 1699 * oops didn't find an extent that matched the space we wanted
697 * to remove, look for a bitmap instead 1700 * to remove, look for a bitmap instead
698 */ 1701 */
699 info = tree_search_offset(block_group, 1702 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
700 offset_to_bitmap(block_group, offset),
701 1, 0); 1703 1, 0);
702 if (!info) { 1704 if (!info) {
703 WARN_ON(1); 1705 WARN_ON(1);
@@ -712,8 +1714,8 @@ again:
712 offset_index); 1714 offset_index);
713 1715
714 if (next_info->bitmap) 1716 if (next_info->bitmap)
715 end = next_info->offset + BITS_PER_BITMAP * 1717 end = next_info->offset +
716 block_group->sectorsize - 1; 1718 BITS_PER_BITMAP * ctl->unit - 1;
717 else 1719 else
718 end = next_info->offset + next_info->bytes; 1720 end = next_info->offset + next_info->bytes;
719 1721
@@ -733,20 +1735,20 @@ again:
733 } 1735 }
734 1736
735 if (info->bytes == bytes) { 1737 if (info->bytes == bytes) {
736 unlink_free_space(block_group, info); 1738 unlink_free_space(ctl, info);
737 if (info->bitmap) { 1739 if (info->bitmap) {
738 kfree(info->bitmap); 1740 kfree(info->bitmap);
739 block_group->total_bitmaps--; 1741 ctl->total_bitmaps--;
740 } 1742 }
741 kfree(info); 1743 kmem_cache_free(btrfs_free_space_cachep, info);
742 goto out_lock; 1744 goto out_lock;
743 } 1745 }
744 1746
745 if (!info->bitmap && info->offset == offset) { 1747 if (!info->bitmap && info->offset == offset) {
746 unlink_free_space(block_group, info); 1748 unlink_free_space(ctl, info);
747 info->offset += bytes; 1749 info->offset += bytes;
748 info->bytes -= bytes; 1750 info->bytes -= bytes;
749 link_free_space(block_group, info); 1751 link_free_space(ctl, info);
750 goto out_lock; 1752 goto out_lock;
751 } 1753 }
752 1754
@@ -760,13 +1762,13 @@ again:
760 * first unlink the old info and then 1762 * first unlink the old info and then
761 * insert it again after the hole we're creating 1763 * insert it again after the hole we're creating
762 */ 1764 */
763 unlink_free_space(block_group, info); 1765 unlink_free_space(ctl, info);
764 if (offset + bytes < info->offset + info->bytes) { 1766 if (offset + bytes < info->offset + info->bytes) {
765 u64 old_end = info->offset + info->bytes; 1767 u64 old_end = info->offset + info->bytes;
766 1768
767 info->offset = offset + bytes; 1769 info->offset = offset + bytes;
768 info->bytes = old_end - info->offset; 1770 info->bytes = old_end - info->offset;
769 ret = link_free_space(block_group, info); 1771 ret = link_free_space(ctl, info);
770 WARN_ON(ret); 1772 WARN_ON(ret);
771 if (ret) 1773 if (ret)
772 goto out_lock; 1774 goto out_lock;
@@ -774,9 +1776,9 @@ again:
774 /* the hole we're creating ends at the end 1776 /* the hole we're creating ends at the end
775 * of the info struct, just free the info 1777 * of the info struct, just free the info
776 */ 1778 */
777 kfree(info); 1779 kmem_cache_free(btrfs_free_space_cachep, info);
778 } 1780 }
779 spin_unlock(&block_group->tree_lock); 1781 spin_unlock(&ctl->tree_lock);
780 1782
781 /* step two, insert a new info struct to cover 1783 /* step two, insert a new info struct to cover
782 * anything before the hole 1784 * anything before the hole
@@ -787,12 +1789,12 @@ again:
787 goto out; 1789 goto out;
788 } 1790 }
789 1791
790 ret = remove_from_bitmap(block_group, info, &offset, &bytes); 1792 ret = remove_from_bitmap(ctl, info, &offset, &bytes);
791 if (ret == -EAGAIN) 1793 if (ret == -EAGAIN)
792 goto again; 1794 goto again;
793 BUG_ON(ret); 1795 BUG_ON(ret);
794out_lock: 1796out_lock:
795 spin_unlock(&block_group->tree_lock); 1797 spin_unlock(&ctl->tree_lock);
796out: 1798out:
797 return ret; 1799 return ret;
798} 1800}
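
When the removed range lands in the middle of an extent, the code above
keeps the tail in place (shrink and re-link the entry) and re-adds the head
as fresh free space. A sketch of that split with hypothetical sizes:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t info_offset = 0, info_bytes = 1024 * 1024; /* one 1MB extent */
        uint64_t offset = 256 * 1024, bytes = 128 * 1024;   /* hole to remove */

        /* tail: everything after the hole stays as the re-linked entry */
        uint64_t old_end = info_offset + info_bytes;
        uint64_t tail_offset = offset + bytes;
        uint64_t tail_bytes = old_end - tail_offset;

        /* head: everything before the hole is added back separately */
        uint64_t head_offset = info_offset;
        uint64_t head_bytes = offset - info_offset;

        printf("head %llu+%llu, tail %llu+%llu\n",
               (unsigned long long)head_offset, (unsigned long long)head_bytes,
               (unsigned long long)tail_offset, (unsigned long long)tail_bytes);
        return 0;
}
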
@@ -800,11 +1802,12 @@ out:
800void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 1802void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
801 u64 bytes) 1803 u64 bytes)
802{ 1804{
1805 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
803 struct btrfs_free_space *info; 1806 struct btrfs_free_space *info;
804 struct rb_node *n; 1807 struct rb_node *n;
805 int count = 0; 1808 int count = 0;
806 1809
807 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) { 1810 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
808 info = rb_entry(n, struct btrfs_free_space, offset_index); 1811 info = rb_entry(n, struct btrfs_free_space, offset_index);
809 if (info->bytes >= bytes) 1812 if (info->bytes >= bytes)
810 count++; 1813 count++;
@@ -819,19 +1822,23 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
819 "\n", count); 1822 "\n", count);
820} 1823}
821 1824
822u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) 1825void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
823{ 1826{
824 struct btrfs_free_space *info; 1827 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
825 struct rb_node *n;
826 u64 ret = 0;
827 1828
828 for (n = rb_first(&block_group->free_space_offset); n; 1829 spin_lock_init(&ctl->tree_lock);
829 n = rb_next(n)) { 1830 ctl->unit = block_group->sectorsize;
830 info = rb_entry(n, struct btrfs_free_space, offset_index); 1831 ctl->start = block_group->key.objectid;
831 ret += info->bytes; 1832 ctl->private = block_group;
832 } 1833 ctl->op = &free_space_op;
833 1834
834 return ret; 1835 /*
1836 * we only want to have 32k of ram per block group for keeping
1837 * track of free space, and if we pass 1/2 of that we want to
1838 * start converting things over to using bitmaps
1839 */
1840 ctl->extents_thresh = ((1024 * 32) / 2) /
1841 sizeof(struct btrfs_free_space);
835} 1842}
836 1843
837/* 1844/*
@@ -845,31 +1852,31 @@ __btrfs_return_cluster_to_free_space(
845 struct btrfs_block_group_cache *block_group, 1852 struct btrfs_block_group_cache *block_group,
846 struct btrfs_free_cluster *cluster) 1853 struct btrfs_free_cluster *cluster)
847{ 1854{
1855 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
848 struct btrfs_free_space *entry; 1856 struct btrfs_free_space *entry;
849 struct rb_node *node; 1857 struct rb_node *node;
850 bool bitmap;
851 1858
852 spin_lock(&cluster->lock); 1859 spin_lock(&cluster->lock);
853 if (cluster->block_group != block_group) 1860 if (cluster->block_group != block_group)
854 goto out; 1861 goto out;
855 1862
856 bitmap = cluster->points_to_bitmap;
857 cluster->block_group = NULL; 1863 cluster->block_group = NULL;
858 cluster->window_start = 0; 1864 cluster->window_start = 0;
859 list_del_init(&cluster->block_group_list); 1865 list_del_init(&cluster->block_group_list);
860 cluster->points_to_bitmap = false;
861
862 if (bitmap)
863 goto out;
864 1866
865 node = rb_first(&cluster->root); 1867 node = rb_first(&cluster->root);
866 while (node) { 1868 while (node) {
1869 bool bitmap;
1870
867 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1871 entry = rb_entry(node, struct btrfs_free_space, offset_index);
868 node = rb_next(&entry->offset_index); 1872 node = rb_next(&entry->offset_index);
869 rb_erase(&entry->offset_index, &cluster->root); 1873 rb_erase(&entry->offset_index, &cluster->root);
870 BUG_ON(entry->bitmap); 1874
871 tree_insert_offset(&block_group->free_space_offset, 1875 bitmap = (entry->bitmap != NULL);
872 entry->offset, &entry->offset_index, 0); 1876 if (!bitmap)
1877 try_merge_free_space(ctl, entry, false);
1878 tree_insert_offset(&ctl->free_space_offset,
1879 entry->offset, &entry->offset_index, bitmap);
873 } 1880 }
874 cluster->root = RB_ROOT; 1881 cluster->root = RB_ROOT;
875 1882
@@ -879,14 +1886,41 @@ out:
879 return 0; 1886 return 0;
880} 1887}
881 1888
882void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 1889void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
883{ 1890{
884 struct btrfs_free_space *info; 1891 struct btrfs_free_space *info;
885 struct rb_node *node; 1892 struct rb_node *node;
1893
1894 while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
1895 info = rb_entry(node, struct btrfs_free_space, offset_index);
1896 if (!info->bitmap) {
1897 unlink_free_space(ctl, info);
1898 kmem_cache_free(btrfs_free_space_cachep, info);
1899 } else {
1900 free_bitmap(ctl, info);
1901 }
1902 if (need_resched()) {
1903 spin_unlock(&ctl->tree_lock);
1904 cond_resched();
1905 spin_lock(&ctl->tree_lock);
1906 }
1907 }
1908}
1909
1910void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
1911{
1912 spin_lock(&ctl->tree_lock);
1913 __btrfs_remove_free_space_cache_locked(ctl);
1914 spin_unlock(&ctl->tree_lock);
1915}
1916
1917void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
1918{
1919 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
886 struct btrfs_free_cluster *cluster; 1920 struct btrfs_free_cluster *cluster;
887 struct list_head *head; 1921 struct list_head *head;
888 1922
889 spin_lock(&block_group->tree_lock); 1923 spin_lock(&ctl->tree_lock);
890 while ((head = block_group->cluster_list.next) != 1924 while ((head = block_group->cluster_list.next) !=
891 &block_group->cluster_list) { 1925 &block_group->cluster_list) {
892 cluster = list_entry(head, struct btrfs_free_cluster, 1926 cluster = list_entry(head, struct btrfs_free_cluster,
@@ -895,62 +1929,46 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
895 WARN_ON(cluster->block_group != block_group); 1929 WARN_ON(cluster->block_group != block_group);
896 __btrfs_return_cluster_to_free_space(block_group, cluster); 1930 __btrfs_return_cluster_to_free_space(block_group, cluster);
897 if (need_resched()) { 1931 if (need_resched()) {
898 spin_unlock(&block_group->tree_lock); 1932 spin_unlock(&ctl->tree_lock);
899 cond_resched();
900 spin_lock(&block_group->tree_lock);
901 }
902 }
903
904 while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
905 info = rb_entry(node, struct btrfs_free_space, offset_index);
906 unlink_free_space(block_group, info);
907 if (info->bitmap)
908 kfree(info->bitmap);
909 kfree(info);
910 if (need_resched()) {
911 spin_unlock(&block_group->tree_lock);
912 cond_resched(); 1933 cond_resched();
913 spin_lock(&block_group->tree_lock); 1934 spin_lock(&ctl->tree_lock);
914 } 1935 }
915 } 1936 }
1937 __btrfs_remove_free_space_cache_locked(ctl);
1938 spin_unlock(&ctl->tree_lock);
916 1939
917 spin_unlock(&block_group->tree_lock);
918} 1940}
919 1941
920u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, 1942u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
921 u64 offset, u64 bytes, u64 empty_size) 1943 u64 offset, u64 bytes, u64 empty_size)
922{ 1944{
1945 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
923 struct btrfs_free_space *entry = NULL; 1946 struct btrfs_free_space *entry = NULL;
924 u64 bytes_search = bytes + empty_size; 1947 u64 bytes_search = bytes + empty_size;
925 u64 ret = 0; 1948 u64 ret = 0;
926 1949
927 spin_lock(&block_group->tree_lock); 1950 spin_lock(&ctl->tree_lock);
928 entry = find_free_space(block_group, &offset, &bytes_search, 0); 1951 entry = find_free_space(ctl, &offset, &bytes_search);
929 if (!entry) 1952 if (!entry)
930 goto out; 1953 goto out;
931 1954
932 ret = offset; 1955 ret = offset;
933 if (entry->bitmap) { 1956 if (entry->bitmap) {
934 bitmap_clear_bits(block_group, entry, offset, bytes); 1957 bitmap_clear_bits(ctl, entry, offset, bytes);
935 if (!entry->bytes) { 1958 if (!entry->bytes)
936 unlink_free_space(block_group, entry); 1959 free_bitmap(ctl, entry);
937 kfree(entry->bitmap);
938 kfree(entry);
939 block_group->total_bitmaps--;
940 recalculate_thresholds(block_group);
941 }
942 } else { 1960 } else {
943 unlink_free_space(block_group, entry); 1961 unlink_free_space(ctl, entry);
944 entry->offset += bytes; 1962 entry->offset += bytes;
945 entry->bytes -= bytes; 1963 entry->bytes -= bytes;
946 if (!entry->bytes) 1964 if (!entry->bytes)
947 kfree(entry); 1965 kmem_cache_free(btrfs_free_space_cachep, entry);
948 else 1966 else
949 link_free_space(block_group, entry); 1967 link_free_space(ctl, entry);
950 } 1968 }
951 1969
952out: 1970out:
953 spin_unlock(&block_group->tree_lock); 1971 spin_unlock(&ctl->tree_lock);
954 1972
955 return ret; 1973 return ret;
956} 1974}
@@ -967,6 +1985,7 @@ int btrfs_return_cluster_to_free_space(
967 struct btrfs_block_group_cache *block_group, 1985 struct btrfs_block_group_cache *block_group,
968 struct btrfs_free_cluster *cluster) 1986 struct btrfs_free_cluster *cluster)
969{ 1987{
1988 struct btrfs_free_space_ctl *ctl;
970 int ret; 1989 int ret;
971 1990
972 /* first, get a safe pointer to the block group */ 1991 /* first, get a safe pointer to the block group */
@@ -985,10 +2004,12 @@ int btrfs_return_cluster_to_free_space(
985 atomic_inc(&block_group->count); 2004 atomic_inc(&block_group->count);
986 spin_unlock(&cluster->lock); 2005 spin_unlock(&cluster->lock);
987 2006
2007 ctl = block_group->free_space_ctl;
2008
988 /* now return any extents the cluster had on it */ 2009 /* now return any extents the cluster had on it */
989 spin_lock(&block_group->tree_lock); 2010 spin_lock(&ctl->tree_lock);
990 ret = __btrfs_return_cluster_to_free_space(block_group, cluster); 2011 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
991 spin_unlock(&block_group->tree_lock); 2012 spin_unlock(&ctl->tree_lock);
992 2013
993 /* finally drop our ref */ 2014 /* finally drop our ref */
994 btrfs_put_block_group(block_group); 2015 btrfs_put_block_group(block_group);
@@ -997,48 +2018,24 @@ int btrfs_return_cluster_to_free_space(
997 2018
998static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, 2019static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
999 struct btrfs_free_cluster *cluster, 2020 struct btrfs_free_cluster *cluster,
2021 struct btrfs_free_space *entry,
1000 u64 bytes, u64 min_start) 2022 u64 bytes, u64 min_start)
1001{ 2023{
1002 struct btrfs_free_space *entry; 2024 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1003 int err; 2025 int err;
1004 u64 search_start = cluster->window_start; 2026 u64 search_start = cluster->window_start;
1005 u64 search_bytes = bytes; 2027 u64 search_bytes = bytes;
1006 u64 ret = 0; 2028 u64 ret = 0;
1007 2029
1008 spin_lock(&block_group->tree_lock);
1009 spin_lock(&cluster->lock);
1010
1011 if (!cluster->points_to_bitmap)
1012 goto out;
1013
1014 if (cluster->block_group != block_group)
1015 goto out;
1016
1017 /*
1018 * search_start is the beginning of the bitmap, but at some point it may
1019 * be a good idea to point to the actual start of the free area in the
1020 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1021 * to 1 to make sure we get the bitmap entry
1022 */
1023 entry = tree_search_offset(block_group,
1024 offset_to_bitmap(block_group, search_start),
1025 1, 0);
1026 if (!entry || !entry->bitmap)
1027 goto out;
1028
1029 search_start = min_start; 2030 search_start = min_start;
1030 search_bytes = bytes; 2031 search_bytes = bytes;
1031 2032
1032 err = search_bitmap(block_group, entry, &search_start, 2033 err = search_bitmap(ctl, entry, &search_start, &search_bytes);
1033 &search_bytes);
1034 if (err) 2034 if (err)
1035 goto out; 2035 return 0;
1036 2036
1037 ret = search_start; 2037 ret = search_start;
1038 bitmap_clear_bits(block_group, entry, ret, bytes); 2038 bitmap_clear_bits(ctl, entry, ret, bytes);
1039out:
1040 spin_unlock(&cluster->lock);
1041 spin_unlock(&block_group->tree_lock);
1042 2039
1043 return ret; 2040 return ret;
1044} 2041}
@@ -1052,14 +2049,11 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1052 struct btrfs_free_cluster *cluster, u64 bytes, 2049 struct btrfs_free_cluster *cluster, u64 bytes,
1053 u64 min_start) 2050 u64 min_start)
1054{ 2051{
2052 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1055 struct btrfs_free_space *entry = NULL; 2053 struct btrfs_free_space *entry = NULL;
1056 struct rb_node *node; 2054 struct rb_node *node;
1057 u64 ret = 0; 2055 u64 ret = 0;
1058 2056
1059 if (cluster->points_to_bitmap)
1060 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1061 min_start);
1062
1063 spin_lock(&cluster->lock); 2057 spin_lock(&cluster->lock);
1064 if (bytes > cluster->max_size) 2058 if (bytes > cluster->max_size)
1065 goto out; 2059 goto out;
@@ -1072,11 +2066,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1072 goto out; 2066 goto out;
1073 2067
1074 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2068 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1075
1076 while(1) { 2069 while(1) {
1077 if (entry->bytes < bytes || entry->offset < min_start) { 2070 if (entry->bytes < bytes ||
1078 struct rb_node *node; 2071 (!entry->bitmap && entry->offset < min_start)) {
1079
1080 node = rb_next(&entry->offset_index); 2072 node = rb_next(&entry->offset_index);
1081 if (!node) 2073 if (!node)
1082 break; 2074 break;
@@ -1084,20 +2076,52 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1084 offset_index); 2076 offset_index);
1085 continue; 2077 continue;
1086 } 2078 }
1087 ret = entry->offset;
1088 2079
1089 entry->offset += bytes; 2080 if (entry->bitmap) {
1090 entry->bytes -= bytes; 2081 ret = btrfs_alloc_from_bitmap(block_group,
2082 cluster, entry, bytes,
2083 min_start);
2084 if (ret == 0) {
2085 node = rb_next(&entry->offset_index);
2086 if (!node)
2087 break;
2088 entry = rb_entry(node, struct btrfs_free_space,
2089 offset_index);
2090 continue;
2091 }
2092 } else {
1091 2093
1092 if (entry->bytes == 0) { 2094 ret = entry->offset;
1093 rb_erase(&entry->offset_index, &cluster->root); 2095
1094 kfree(entry); 2096 entry->offset += bytes;
2097 entry->bytes -= bytes;
1095 } 2098 }
2099
2100 if (entry->bytes == 0)
2101 rb_erase(&entry->offset_index, &cluster->root);
1096 break; 2102 break;
1097 } 2103 }
1098out: 2104out:
1099 spin_unlock(&cluster->lock); 2105 spin_unlock(&cluster->lock);
1100 2106
2107 if (!ret)
2108 return 0;
2109
2110 spin_lock(&ctl->tree_lock);
2111
2112 ctl->free_space -= bytes;
2113 if (entry->bytes == 0) {
2114 ctl->free_extents--;
2115 if (entry->bitmap) {
2116 kfree(entry->bitmap);
2117 ctl->total_bitmaps--;
2118 ctl->op->recalc_thresholds(ctl);
2119 }
2120 kmem_cache_free(btrfs_free_space_cachep, entry);
2121 }
2122
2123 spin_unlock(&ctl->tree_lock);
2124
1101 return ret; 2125 return ret;
1102} 2126}
1103 2127
@@ -1106,6 +2130,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1106 struct btrfs_free_cluster *cluster, 2130 struct btrfs_free_cluster *cluster,
1107 u64 offset, u64 bytes, u64 min_bytes) 2131 u64 offset, u64 bytes, u64 min_bytes)
1108{ 2132{
2133 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1109 unsigned long next_zero; 2134 unsigned long next_zero;
1110 unsigned long i; 2135 unsigned long i;
1111 unsigned long search_bits; 2136 unsigned long search_bits;
@@ -1113,12 +2138,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1113 unsigned long found_bits; 2138 unsigned long found_bits;
1114 unsigned long start = 0; 2139 unsigned long start = 0;
1115 unsigned long total_found = 0; 2140 unsigned long total_found = 0;
2141 int ret;
1116 bool found = false; 2142 bool found = false;
1117 2143
1118 i = offset_to_bit(entry->offset, block_group->sectorsize, 2144 i = offset_to_bit(entry->offset, block_group->sectorsize,
1119 max_t(u64, offset, entry->offset)); 2145 max_t(u64, offset, entry->offset));
1120 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2146 search_bits = bytes_to_bits(bytes, block_group->sectorsize);
1121 total_bits = bytes_to_bits(bytes, block_group->sectorsize); 2147 total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1122 2148
1123again: 2149again:
1124 found_bits = 0; 2150 found_bits = 0;
@@ -1135,7 +2161,7 @@ again:
1135 } 2161 }
1136 2162
1137 if (!found_bits) 2163 if (!found_bits)
1138 return -1; 2164 return -ENOSPC;
1139 2165
1140 if (!found) { 2166 if (!found) {
1141 start = i; 2167 start = i;
@@ -1159,131 +2185,67 @@ again:
1159 2185
1160 cluster->window_start = start * block_group->sectorsize + 2186 cluster->window_start = start * block_group->sectorsize +
1161 entry->offset; 2187 entry->offset;
1162 cluster->points_to_bitmap = true; 2188 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2189 ret = tree_insert_offset(&cluster->root, entry->offset,
2190 &entry->offset_index, 1);
2191 BUG_ON(ret);
1163 2192
1164 return 0; 2193 return 0;
1165} 2194}
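
Note the hunk above also swaps the two conversions, so search_bits is now derived from bytes and total_bits from min_bytes. The underlying index math maps byte offsets onto per-sector bit positions relative to the owning bitmap entry; here is a self-contained sketch assuming the simplest possible helper bodies (the real offset_to_bit()/bytes_to_bits() are not part of this diff, so these definitions are an assumption):

#include <stdint.h>
#include <stdio.h>

/* Assumed-simple models: one bit per sector, indices relative to the
 * bitmap entry's start offset. */
static uint64_t offset_to_bit(uint64_t entry_offset, uint32_t sectorsize,
                              uint64_t offset)
{
        return (offset - entry_offset) / sectorsize;
}

static uint64_t bytes_to_bits(uint64_t bytes, uint32_t sectorsize)
{
        return bytes / sectorsize;
}

int main(void)
{
        /* With 4K sectors, byte 1M inside a bitmap entry starting at 0
         * is bit 256, and a 64K run is 16 consecutive bits. */
        printf("bit=%llu run=%llu\n",
               (unsigned long long)offset_to_bit(0, 4096, 1024 * 1024),
               (unsigned long long)bytes_to_bits(64 * 1024, 4096));
        return 0;
}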
1166 2195
1167/* 2196/*
1168 * here we try to find a cluster of blocks in a block group. The goal 2197 * This searches the block group for just extents to fill the cluster with.
1169 * is to find at least bytes free and up to empty_size + bytes free.
1170 * We might not find them all in one contiguous area.
1171 *
1172 * returns zero and sets up cluster if things worked out, otherwise
1173 * it returns -enospc
1174 */ 2198 */
1175int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 2199static noinline int
1176 struct btrfs_root *root, 2200setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
1177 struct btrfs_block_group_cache *block_group, 2201 struct btrfs_free_cluster *cluster,
1178 struct btrfs_free_cluster *cluster, 2202 struct list_head *bitmaps, u64 offset, u64 bytes,
1179 u64 offset, u64 bytes, u64 empty_size) 2203 u64 min_bytes)
1180{ 2204{
2205 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2206 struct btrfs_free_space *first = NULL;
1181 struct btrfs_free_space *entry = NULL; 2207 struct btrfs_free_space *entry = NULL;
2208 struct btrfs_free_space *prev = NULL;
2209 struct btrfs_free_space *last;
1182 struct rb_node *node; 2210 struct rb_node *node;
1183 struct btrfs_free_space *next;
1184 struct btrfs_free_space *last = NULL;
1185 u64 min_bytes;
1186 u64 window_start; 2211 u64 window_start;
1187 u64 window_free; 2212 u64 window_free;
1188 u64 max_extent = 0; 2213 u64 max_extent;
1189 bool found_bitmap = false; 2214 u64 max_gap = 128 * 1024;
1190 int ret;
1191 2215
1192 /* for metadata, allow allocates with more holes */ 2216 entry = tree_search_offset(ctl, offset, 0, 1);
1193 if (btrfs_test_opt(root, SSD_SPREAD)) { 2217 if (!entry)
1194 min_bytes = bytes + empty_size; 2218 return -ENOSPC;
1195 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1196 /*
1197 * we want to do larger allocations when we are
1198 * flushing out the delayed refs, it helps prevent
1199 * making more work as we go along.
1200 */
1201 if (trans->transaction->delayed_refs.flushing)
1202 min_bytes = max(bytes, (bytes + empty_size) >> 1);
1203 else
1204 min_bytes = max(bytes, (bytes + empty_size) >> 4);
1205 } else
1206 min_bytes = max(bytes, (bytes + empty_size) >> 2);
1207
1208 spin_lock(&block_group->tree_lock);
1209 spin_lock(&cluster->lock);
1210
1211 /* someone already found a cluster, hooray */
1212 if (cluster->block_group) {
1213 ret = 0;
1214 goto out;
1215 }
1216again:
1217 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
1218 if (!entry) {
1219 ret = -ENOSPC;
1220 goto out;
1221 }
1222 2219
1223 /* 2220 /*
1224 * If found_bitmap is true, we exhausted our search for extent entries, 2221 * We don't want bitmaps, so just move along until we find a normal
1225 * and we just want to search all of the bitmaps that we can find, and 2222 * extent entry.
1226 * ignore any extent entries we find.
1227 */ 2223 */
1228 while (entry->bitmap || found_bitmap || 2224 while (entry->bitmap) {
1229 (!entry->bitmap && entry->bytes < min_bytes)) { 2225 if (list_empty(&entry->list))
1230 struct rb_node *node = rb_next(&entry->offset_index); 2226 list_add_tail(&entry->list, bitmaps);
1231 2227 node = rb_next(&entry->offset_index);
1232 if (entry->bitmap && entry->bytes > bytes + empty_size) { 2228 if (!node)
1233 ret = btrfs_bitmap_cluster(block_group, entry, cluster, 2229 return -ENOSPC;
1234 offset, bytes + empty_size,
1235 min_bytes);
1236 if (!ret)
1237 goto got_it;
1238 }
1239
1240 if (!node) {
1241 ret = -ENOSPC;
1242 goto out;
1243 }
1244 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2230 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1245 } 2231 }
1246 2232
1247 /*
1248 * We already searched all the extent entries from the passed in offset
1249 * to the end and didn't find enough space for the cluster, and we also
1250 * didn't find any bitmaps that met our criteria, just go ahead and exit
1251 */
1252 if (found_bitmap) {
1253 ret = -ENOSPC;
1254 goto out;
1255 }
1256
1257 cluster->points_to_bitmap = false;
1258 window_start = entry->offset; 2233 window_start = entry->offset;
1259 window_free = entry->bytes; 2234 window_free = entry->bytes;
1260 last = entry;
1261 max_extent = entry->bytes; 2235 max_extent = entry->bytes;
2236 first = entry;
2237 last = entry;
2238 prev = entry;
1262 2239
1263 while (1) { 2240 while (window_free <= min_bytes) {
1264 /* out window is just right, lets fill it */ 2241 node = rb_next(&entry->offset_index);
1265 if (window_free >= bytes + empty_size) 2242 if (!node)
1266 break; 2243 return -ENOSPC;
1267 2244 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1268 node = rb_next(&last->offset_index);
1269 if (!node) {
1270 if (found_bitmap)
1271 goto again;
1272 ret = -ENOSPC;
1273 goto out;
1274 }
1275 next = rb_entry(node, struct btrfs_free_space, offset_index);
1276 2245
1277 /* 2246 if (entry->bitmap) {
1278 * we found a bitmap, so if this search doesn't result in a 2247 if (list_empty(&entry->list))
1279 * cluster, we know to go and search again for the bitmaps and 2248 list_add_tail(&entry->list, bitmaps);
1280 * start looking for space there
1281 */
1282 if (next->bitmap) {
1283 if (!found_bitmap)
1284 offset = next->offset;
1285 found_bitmap = true;
1286 last = next;
1287 continue; 2249 continue;
1288 } 2250 }
1289 2251
@@ -1291,60 +2253,190 @@ again:
1291 * we haven't filled the empty size and the window is 2253 * we haven't filled the empty size and the window is
1292 * very large. reset and try again 2254 * very large. reset and try again
1293 */ 2255 */
1294 if (next->offset - (last->offset + last->bytes) > 128 * 1024 || 2256 if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
1295 next->offset - window_start > (bytes + empty_size) * 2) { 2257 entry->offset - window_start > (min_bytes * 2)) {
1296 entry = next; 2258 first = entry;
1297 window_start = entry->offset; 2259 window_start = entry->offset;
1298 window_free = entry->bytes; 2260 window_free = entry->bytes;
1299 last = entry; 2261 last = entry;
1300 max_extent = entry->bytes; 2262 max_extent = entry->bytes;
1301 } else { 2263 } else {
1302 last = next; 2264 last = entry;
1303 window_free += next->bytes; 2265 window_free += entry->bytes;
1304 if (entry->bytes > max_extent) 2266 if (entry->bytes > max_extent)
1305 max_extent = entry->bytes; 2267 max_extent = entry->bytes;
1306 } 2268 }
2269 prev = entry;
1307 } 2270 }
1308 2271
1309 cluster->window_start = entry->offset; 2272 cluster->window_start = first->offset;
2273
2274 node = &first->offset_index;
1310 2275
1311 /* 2276 /*
1312 * now we've found our entries, pull them out of the free space 2277 * now we've found our entries, pull them out of the free space
1313 * cache and put them into the cluster rbtree 2278 * cache and put them into the cluster rbtree
1314 *
1315 * The cluster includes an rbtree, but only uses the offset index
1316 * of each free space cache entry.
1317 */ 2279 */
1318 while (1) { 2280 do {
2281 int ret;
2282
2283 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1319 node = rb_next(&entry->offset_index); 2284 node = rb_next(&entry->offset_index);
1320 if (entry->bitmap && node) { 2285 if (entry->bitmap)
1321 entry = rb_entry(node, struct btrfs_free_space,
1322 offset_index);
1323 continue; 2286 continue;
1324 } else if (entry->bitmap && !node) {
1325 break;
1326 }
1327 2287
1328 rb_erase(&entry->offset_index, &block_group->free_space_offset); 2288 rb_erase(&entry->offset_index, &ctl->free_space_offset);
1329 ret = tree_insert_offset(&cluster->root, entry->offset, 2289 ret = tree_insert_offset(&cluster->root, entry->offset,
1330 &entry->offset_index, 0); 2290 &entry->offset_index, 0);
1331 BUG_ON(ret); 2291 BUG_ON(ret);
2292 } while (node && entry != last);
1332 2293
1333 if (!node || entry == last) 2294 cluster->max_size = max_extent;
1334 break;
1335 2295
2296 return 0;
2297}
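
setup_cluster_no_bitmap() above grows a window of extent entries until it holds more than min_bytes of free space, restarting whenever the gap to the next extent exceeds max_gap (128K) or the window spans more than twice min_bytes. A compilable toy model of that windowing decision, flattened onto an array instead of the rbtree and ignoring bitmap entries (both deliberate simplifications):

#include <stddef.h>
#include <stdint.h>

struct toy_extent {
        uint64_t offset;
        uint64_t bytes;
};

/* Returns 1 and sets *first_idx to the window's first extent when a
 * window holding more than min_bytes is found, 0 otherwise. */
static int pick_window(const struct toy_extent *e, size_t n,
                       uint64_t min_bytes, size_t *first_idx)
{
        const uint64_t max_gap = 128 * 1024;
        uint64_t window_start, window_free;
        size_t first = 0, i;

        if (n == 0)
                return 0;
        window_start = e[0].offset;
        window_free = e[0].bytes;

        for (i = 1; window_free <= min_bytes && i < n; i++) {
                uint64_t gap = e[i].offset -
                               (e[i - 1].offset + e[i - 1].bytes);

                if (gap > max_gap ||
                    e[i].offset - window_start > min_bytes * 2) {
                        /* window too sparse or too wide: restart it here */
                        first = i;
                        window_start = e[i].offset;
                        window_free = e[i].bytes;
                } else {
                        window_free += e[i].bytes;
                }
        }
        *first_idx = first;
        return window_free > min_bytes;
}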
2298
2299/*
2300 * This specifically looks for bitmaps that may work in the cluster, we assume
2301 * that we have already failed to find extents that will work.
2302 */
2303static noinline int
2304setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2305 struct btrfs_free_cluster *cluster,
2306 struct list_head *bitmaps, u64 offset, u64 bytes,
2307 u64 min_bytes)
2308{
2309 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2310 struct btrfs_free_space *entry;
2311 struct rb_node *node;
2312 int ret = -ENOSPC;
2313
2314 if (ctl->total_bitmaps == 0)
2315 return -ENOSPC;
2316
2317 /*
2318 * First check our cached list of bitmaps and see if there is an entry
2319 * here that will work.
2320 */
2321 list_for_each_entry(entry, bitmaps, list) {
2322 if (entry->bytes < min_bytes)
2323 continue;
2324 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2325 bytes, min_bytes);
2326 if (!ret)
2327 return 0;
2328 }
2329
2330 /*
2331 * If there are entries on our list but we are still here, then none of
2332 * them worked, so get the next entry after the last entry in the list
2333 * and start the search from there.
2334 */
2335 if (!list_empty(bitmaps)) {
2336 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2337 list);
2338 node = rb_next(&entry->offset_index);
2339 if (!node)
2340 return -ENOSPC;
1336 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2341 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2342 goto search;
1337 } 2343 }
1338 2344
1339 cluster->max_size = max_extent; 2345 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
1340got_it: 2346 if (!entry)
1341 ret = 0; 2347 return -ENOSPC;
1342 atomic_inc(&block_group->count); 2348
1343 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 2349search:
1344 cluster->block_group = block_group; 2350 node = &entry->offset_index;
2351 do {
2352 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2353 node = rb_next(&entry->offset_index);
2354 if (!entry->bitmap)
2355 continue;
2356 if (entry->bytes < min_bytes)
2357 continue;
2358 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2359 bytes, min_bytes);
2360 } while (ret && node);
2361
2362 return ret;
2363}
2364
2365/*
2366 * here we try to find a cluster of blocks in a block group. The goal
2367 * is to find at least bytes free and up to empty_size + bytes free.
2368 * We might not find them all in one contiguous area.
2369 *
2370 * returns zero and sets up cluster if things worked out, otherwise
2371 * it returns -enospc
2372 */
2373int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2374 struct btrfs_root *root,
2375 struct btrfs_block_group_cache *block_group,
2376 struct btrfs_free_cluster *cluster,
2377 u64 offset, u64 bytes, u64 empty_size)
2378{
2379 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2380 struct list_head bitmaps;
2381 struct btrfs_free_space *entry, *tmp;
2382 u64 min_bytes;
2383 int ret;
2384
2385 /* for metadata, allow allocates with more holes */
2386 if (btrfs_test_opt(root, SSD_SPREAD)) {
2387 min_bytes = bytes + empty_size;
2388 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2389 /*
2390 * we want to do larger allocations when we are
2391 * flushing out the delayed refs; it helps prevent
2392 * making more work as we go along.
2393 */
2394 if (trans->transaction->delayed_refs.flushing)
2395 min_bytes = max(bytes, (bytes + empty_size) >> 1);
2396 else
2397 min_bytes = max(bytes, (bytes + empty_size) >> 4);
2398 } else
2399 min_bytes = max(bytes, (bytes + empty_size) >> 2);
2400
2401 spin_lock(&ctl->tree_lock);
2402
2403 /*
2404 * If we know we don't have enough space to make a cluster don't even
2405 * bother doing all the work to try and find one.
2406 */
2407 if (ctl->free_space < min_bytes) {
2408 spin_unlock(&ctl->tree_lock);
2409 return -ENOSPC;
2410 }
2411
2412 spin_lock(&cluster->lock);
2413
2414 /* someone already found a cluster, hooray */
2415 if (cluster->block_group) {
2416 ret = 0;
2417 goto out;
2418 }
2419
2420 INIT_LIST_HEAD(&bitmaps);
2421 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2422 bytes, min_bytes);
2423 if (ret)
2424 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
2425 offset, bytes, min_bytes);
2426
2427 /* Clear our temporary list */
2428 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
2429 list_del_init(&entry->list);
2430
2431 if (!ret) {
2432 atomic_inc(&block_group->count);
2433 list_add_tail(&cluster->block_group_list,
2434 &block_group->cluster_list);
2435 cluster->block_group = block_group;
2436 }
1345out: 2437out:
1346 spin_unlock(&cluster->lock); 2438 spin_unlock(&cluster->lock);
1347 spin_unlock(&block_group->tree_lock); 2439 spin_unlock(&ctl->tree_lock);
1348 2440
1349 return ret; 2441 return ret;
1350} 2442}
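
For reference, the min_bytes heuristic above relaxes the contiguity requirement by allocation type. A worked example for the data branch (the final else): requesting bytes = 1M with empty_size = 4M yields min_bytes = max(1M, 5M >> 2) = 1.25M, so the cluster setup only needs to find 1.25M of reasonably close free space rather than the full 5M. As a standalone expression:

#include <stdint.h>

static inline uint64_t max_u64(uint64_t a, uint64_t b)
{
        return a > b ? a : b;
}

/* Data block groups: min_bytes = max(bytes, (bytes + empty_size) >> 2).
 * E.g. bytes = 1M, empty_size = 4M -> 1310720 (1.25M). */
static uint64_t data_cluster_min_bytes(uint64_t bytes, uint64_t empty_size)
{
        return max_u64(bytes, (bytes + empty_size) >> 2);
}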
@@ -1358,8 +2450,244 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
1358 spin_lock_init(&cluster->refill_lock); 2450 spin_lock_init(&cluster->refill_lock);
1359 cluster->root = RB_ROOT; 2451 cluster->root = RB_ROOT;
1360 cluster->max_size = 0; 2452 cluster->max_size = 0;
1361 cluster->points_to_bitmap = false;
1362 INIT_LIST_HEAD(&cluster->block_group_list); 2453 INIT_LIST_HEAD(&cluster->block_group_list);
1363 cluster->block_group = NULL; 2454 cluster->block_group = NULL;
1364} 2455}
1365 2456
2457int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2458 u64 *trimmed, u64 start, u64 end, u64 minlen)
2459{
2460 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2461 struct btrfs_free_space *entry = NULL;
2462 struct btrfs_fs_info *fs_info = block_group->fs_info;
2463 u64 bytes = 0;
2464 u64 actually_trimmed;
2465 int ret = 0;
2466
2467 *trimmed = 0;
2468
2469 while (start < end) {
2470 spin_lock(&ctl->tree_lock);
2471
2472 if (ctl->free_space < minlen) {
2473 spin_unlock(&ctl->tree_lock);
2474 break;
2475 }
2476
2477 entry = tree_search_offset(ctl, start, 0, 1);
2478 if (!entry)
2479 entry = tree_search_offset(ctl,
2480 offset_to_bitmap(ctl, start),
2481 1, 1);
2482
2483 if (!entry || entry->offset >= end) {
2484 spin_unlock(&ctl->tree_lock);
2485 break;
2486 }
2487
2488 if (entry->bitmap) {
2489 ret = search_bitmap(ctl, entry, &start, &bytes);
2490 if (!ret) {
2491 if (start >= end) {
2492 spin_unlock(&ctl->tree_lock);
2493 break;
2494 }
2495 bytes = min(bytes, end - start);
2496 bitmap_clear_bits(ctl, entry, start, bytes);
2497 if (entry->bytes == 0)
2498 free_bitmap(ctl, entry);
2499 } else {
2500 start = entry->offset + BITS_PER_BITMAP *
2501 block_group->sectorsize;
2502 spin_unlock(&ctl->tree_lock);
2503 ret = 0;
2504 continue;
2505 }
2506 } else {
2507 start = entry->offset;
2508 bytes = min(entry->bytes, end - start);
2509 unlink_free_space(ctl, entry);
2510 kmem_cache_free(btrfs_free_space_cachep, entry);
2511 }
2512
2513 spin_unlock(&ctl->tree_lock);
2514
2515 if (bytes >= minlen) {
2516 int update_ret;
2517 update_ret = btrfs_update_reserved_bytes(block_group,
2518 bytes, 1, 1);
2519
2520 ret = btrfs_error_discard_extent(fs_info->extent_root,
2521 start,
2522 bytes,
2523 &actually_trimmed);
2524
2525 btrfs_add_free_space(block_group, start, bytes);
2526 if (!update_ret)
2527 btrfs_update_reserved_bytes(block_group,
2528 bytes, 0, 1);
2529
2530 if (ret)
2531 break;
2532 *trimmed += actually_trimmed;
2533 }
2534 start += bytes;
2535 bytes = 0;
2536
2537 if (fatal_signal_pending(current)) {
2538 ret = -ERESTARTSYS;
2539 break;
2540 }
2541
2542 cond_resched();
2543 }
2544
2545 return ret;
2546}
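
A hypothetical caller sketch for the new trim entry point, using the signature declared in free-space-cache.h below. The real caller (the FITRIM path) iterates block groups; that iteration and the range_start/range_end/min_len/total_trimmed names are illustrative assumptions here:

u64 group_trimmed = 0;
int ret;

/* Trim everything in [range_start, range_end) within this block group,
 * discarding only free ranges of at least min_len bytes. */
ret = btrfs_trim_block_group(block_group, &group_trimmed,
                             range_start, range_end, min_len);
if (!ret)
        total_trimmed += group_trimmed;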
2547
2548/*
2549 * Find the left-most item in the cache tree, and then return the
2550 * smallest inode number in the item.
2551 *
2552 * Note: the returned inode number may not be the smallest one in
2553 * the tree, if the left-most item is a bitmap.
2554 */
2555u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
2556{
2557 struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
2558 struct btrfs_free_space *entry = NULL;
2559 u64 ino = 0;
2560
2561 spin_lock(&ctl->tree_lock);
2562
2563 if (RB_EMPTY_ROOT(&ctl->free_space_offset))
2564 goto out;
2565
2566 entry = rb_entry(rb_first(&ctl->free_space_offset),
2567 struct btrfs_free_space, offset_index);
2568
2569 if (!entry->bitmap) {
2570 ino = entry->offset;
2571
2572 unlink_free_space(ctl, entry);
2573 entry->offset++;
2574 entry->bytes--;
2575 if (!entry->bytes)
2576 kmem_cache_free(btrfs_free_space_cachep, entry);
2577 else
2578 link_free_space(ctl, entry);
2579 } else {
2580 u64 offset = 0;
2581 u64 count = 1;
2582 int ret;
2583
2584 ret = search_bitmap(ctl, entry, &offset, &count);
2585 BUG_ON(ret);
2586
2587 ino = offset;
2588 bitmap_clear_bits(ctl, entry, offset, 1);
2589 if (entry->bytes == 0)
2590 free_bitmap(ctl, entry);
2591 }
2592out:
2593 spin_unlock(&ctl->tree_lock);
2594
2595 return ino;
2596}
2597
2598struct inode *lookup_free_ino_inode(struct btrfs_root *root,
2599 struct btrfs_path *path)
2600{
2601 struct inode *inode = NULL;
2602
2603 spin_lock(&root->cache_lock);
2604 if (root->cache_inode)
2605 inode = igrab(root->cache_inode);
2606 spin_unlock(&root->cache_lock);
2607 if (inode)
2608 return inode;
2609
2610 inode = __lookup_free_space_inode(root, path, 0);
2611 if (IS_ERR(inode))
2612 return inode;
2613
2614 spin_lock(&root->cache_lock);
2615 if (!btrfs_fs_closing(root->fs_info))
2616 root->cache_inode = igrab(inode);
2617 spin_unlock(&root->cache_lock);
2618
2619 return inode;
2620}
2621
2622int create_free_ino_inode(struct btrfs_root *root,
2623 struct btrfs_trans_handle *trans,
2624 struct btrfs_path *path)
2625{
2626 return __create_free_space_inode(root, trans, path,
2627 BTRFS_FREE_INO_OBJECTID, 0);
2628}
2629
2630int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2631{
2632 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
2633 struct btrfs_path *path;
2634 struct inode *inode;
2635 int ret = 0;
2636 u64 root_gen = btrfs_root_generation(&root->root_item);
2637
2638 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
2639 return 0;
2640
2641 /*
2642 * If we're unmounting then just return, since this does a search on the
2643 * normal root and not the commit root and we could deadlock.
2644 */
2645 if (btrfs_fs_closing(fs_info))
2646 return 0;
2647
2648 path = btrfs_alloc_path();
2649 if (!path)
2650 return 0;
2651
2652 inode = lookup_free_ino_inode(root, path);
2653 if (IS_ERR(inode))
2654 goto out;
2655
2656 if (root_gen != BTRFS_I(inode)->generation)
2657 goto out_put;
2658
2659 ret = __load_free_space_cache(root, inode, ctl, path, 0);
2660
2661 if (ret < 0)
2662 printk(KERN_ERR "btrfs: failed to load free ino cache for "
2663 "root %llu\n", root->root_key.objectid);
2664out_put:
2665 iput(inode);
2666out:
2667 btrfs_free_path(path);
2668 return ret;
2669}
2670
2671int btrfs_write_out_ino_cache(struct btrfs_root *root,
2672 struct btrfs_trans_handle *trans,
2673 struct btrfs_path *path)
2674{
2675 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
2676 struct inode *inode;
2677 int ret;
2678
2679 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
2680 return 0;
2681
2682 inode = lookup_free_ino_inode(root, path);
2683 if (IS_ERR(inode))
2684 return 0;
2685
2686 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2687 if (ret < 0)
2688 printk(KERN_ERR "btrfs: failed to write free ino cache "
2689 "for root %llu\n", root->root_key.objectid);
2690
2691 iput(inode);
2692 return ret;
2693}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 890a8e79011b..8f2613f779ed 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -27,17 +27,75 @@ struct btrfs_free_space {
27 struct list_head list; 27 struct list_head list;
28}; 28};
29 29
30int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 30struct btrfs_free_space_ctl {
31 u64 bytenr, u64 size); 31 spinlock_t tree_lock;
32 struct rb_root free_space_offset;
33 u64 free_space;
34 int extents_thresh;
35 int free_extents;
36 int total_bitmaps;
37 int unit;
38 u64 start;
39 struct btrfs_free_space_op *op;
40 void *private;
41};
42
43struct btrfs_free_space_op {
44 void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl);
45 bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
46 struct btrfs_free_space *info);
47};
48
49struct inode *lookup_free_space_inode(struct btrfs_root *root,
50 struct btrfs_block_group_cache
51 *block_group, struct btrfs_path *path);
52int create_free_space_inode(struct btrfs_root *root,
53 struct btrfs_trans_handle *trans,
54 struct btrfs_block_group_cache *block_group,
55 struct btrfs_path *path);
56
57int btrfs_truncate_free_space_cache(struct btrfs_root *root,
58 struct btrfs_trans_handle *trans,
59 struct btrfs_path *path,
60 struct inode *inode);
61int load_free_space_cache(struct btrfs_fs_info *fs_info,
62 struct btrfs_block_group_cache *block_group);
63int btrfs_write_out_cache(struct btrfs_root *root,
64 struct btrfs_trans_handle *trans,
65 struct btrfs_block_group_cache *block_group,
66 struct btrfs_path *path);
67
68struct inode *lookup_free_ino_inode(struct btrfs_root *root,
69 struct btrfs_path *path);
70int create_free_ino_inode(struct btrfs_root *root,
71 struct btrfs_trans_handle *trans,
72 struct btrfs_path *path);
73int load_free_ino_cache(struct btrfs_fs_info *fs_info,
74 struct btrfs_root *root);
75int btrfs_write_out_ino_cache(struct btrfs_root *root,
76 struct btrfs_trans_handle *trans,
77 struct btrfs_path *path);
78
79void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
80int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
81 u64 bytenr, u64 size);
82static inline int
83btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
84 u64 bytenr, u64 size)
85{
86 return __btrfs_add_free_space(block_group->free_space_ctl,
87 bytenr, size);
88}
32int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 89int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
33 u64 bytenr, u64 size); 90 u64 bytenr, u64 size);
91void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
34void btrfs_remove_free_space_cache(struct btrfs_block_group_cache 92void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
35 *block_group); 93 *block_group);
36u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, 94u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
37 u64 offset, u64 bytes, u64 empty_size); 95 u64 offset, u64 bytes, u64 empty_size);
96u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
38void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 97void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
39 u64 bytes); 98 u64 bytes);
40u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
41int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 99int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 100 struct btrfs_root *root,
43 struct btrfs_block_group_cache *block_group, 101 struct btrfs_block_group_cache *block_group,
@@ -50,4 +108,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
50int btrfs_return_cluster_to_free_space( 108int btrfs_return_cluster_to_free_space(
51 struct btrfs_block_group_cache *block_group, 109 struct btrfs_block_group_cache *block_group,
52 struct btrfs_free_cluster *cluster); 110 struct btrfs_free_cluster *cluster);
111int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
112 u64 *trimmed, u64 start, u64 end, u64 minlen);
53#endif 113#endif
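
The header above splits free-space tracking into a generic btrfs_free_space_ctl plus a btrfs_free_space_op policy, and the two callbacks are the only policy hooks the generic code consults. A minimal hypothetical op that never converts extent entries into bitmaps, mirroring the pinned_free_ino_op pattern used by the inode map code later in this diff:

static void extents_only_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
{
        /* nothing to recalculate: we never create bitmaps */
}

static bool extents_only_use_bitmap(struct btrfs_free_space_ctl *ctl,
                                    struct btrfs_free_space *info)
{
        return false;
}

static struct btrfs_free_space_op extents_only_op = {
        .recalc_thresholds = extents_only_recalc_thresholds,
        .use_bitmap = extents_only_use_bitmap,
};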
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 64f1150bb48d..baa74f3db691 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -130,7 +130,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
130 item_size - (ptr + sub_item_len - item_start)); 130 item_size - (ptr + sub_item_len - item_start));
131 ret = btrfs_truncate_item(trans, root, path, 131 ret = btrfs_truncate_item(trans, root, path,
132 item_size - sub_item_len, 1); 132 item_size - sub_item_len, 1);
133 BUG_ON(ret);
134out: 133out:
135 btrfs_free_path(path); 134 btrfs_free_path(path);
136 return ret; 135 return ret;
@@ -167,7 +166,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
167 166
168 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 167 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
169 ret = btrfs_extend_item(trans, root, path, ins_len); 168 ret = btrfs_extend_item(trans, root, path, ins_len);
170 BUG_ON(ret);
171 ref = btrfs_item_ptr(path->nodes[0], path->slots[0], 169 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
172 struct btrfs_inode_ref); 170 struct btrfs_inode_ref);
173 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); 171 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index c56eb5909172..b4087e0fa871 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -16,11 +16,476 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/delay.h>
20#include <linux/kthread.h>
21#include <linux/pagemap.h>
22
19#include "ctree.h" 23#include "ctree.h"
20#include "disk-io.h" 24#include "disk-io.h"
25#include "free-space-cache.h"
26#include "inode-map.h"
21#include "transaction.h" 27#include "transaction.h"
22 28
23int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) 29static int caching_kthread(void *data)
30{
31 struct btrfs_root *root = data;
32 struct btrfs_fs_info *fs_info = root->fs_info;
33 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
34 struct btrfs_key key;
35 struct btrfs_path *path;
36 struct extent_buffer *leaf;
37 u64 last = (u64)-1;
38 int slot;
39 int ret;
40
41 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
42 return 0;
43
44 path = btrfs_alloc_path();
45 if (!path)
46 return -ENOMEM;
47
48 /* Since the commit root is read-only, we can safely skip locking. */
49 path->skip_locking = 1;
50 path->search_commit_root = 1;
51 path->reada = 2;
52
53 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
54 key.offset = 0;
55 key.type = BTRFS_INODE_ITEM_KEY;
56again:
57 /* need to make sure the commit_root doesn't disappear */
58 mutex_lock(&root->fs_commit_mutex);
59
60 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
61 if (ret < 0)
62 goto out;
63
64 while (1) {
65 if (btrfs_fs_closing(fs_info))
66 goto out;
67
68 leaf = path->nodes[0];
69 slot = path->slots[0];
70 if (slot >= btrfs_header_nritems(leaf)) {
71 ret = btrfs_next_leaf(root, path);
72 if (ret < 0)
73 goto out;
74 else if (ret > 0)
75 break;
76
77 if (need_resched() ||
78 btrfs_transaction_in_commit(fs_info)) {
79 leaf = path->nodes[0];
80
81 if (btrfs_header_nritems(leaf) == 0) {
82 WARN_ON(1);
83 break;
84 }
85
86 /*
87 * Save the key so we can advance forward
88 * in the next search.
89 */
90 btrfs_item_key_to_cpu(leaf, &key, 0);
91 btrfs_release_path(path);
92 root->cache_progress = last;
93 mutex_unlock(&root->fs_commit_mutex);
94 schedule_timeout(1);
95 goto again;
96 } else
97 continue;
98 }
99
100 btrfs_item_key_to_cpu(leaf, &key, slot);
101
102 if (key.type != BTRFS_INODE_ITEM_KEY)
103 goto next;
104
105 if (key.objectid >= root->highest_objectid)
106 break;
107
108 if (last != (u64)-1 && last + 1 != key.objectid) {
109 __btrfs_add_free_space(ctl, last + 1,
110 key.objectid - last - 1);
111 wake_up(&root->cache_wait);
112 }
113
114 last = key.objectid;
115next:
116 path->slots[0]++;
117 }
118
119 if (last < root->highest_objectid - 1) {
120 __btrfs_add_free_space(ctl, last + 1,
121 root->highest_objectid - last - 1);
122 }
123
124 spin_lock(&root->cache_lock);
125 root->cached = BTRFS_CACHE_FINISHED;
126 spin_unlock(&root->cache_lock);
127
128 root->cache_progress = (u64)-1;
129 btrfs_unpin_free_ino(root);
130out:
131 wake_up(&root->cache_wait);
132 mutex_unlock(&root->fs_commit_mutex);
133
134 btrfs_free_path(path);
135
136 return ret;
137}
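
The gap bookkeeping in the scan above publishes each run of unused inode numbers between consecutive inode items. A worked example: with the previous item at objectid 256 and the next at 260, numbers 257..259 are free, so the kthread adds a chunk of key.objectid - last - 1 == 3 starting at last + 1 == 257:

/* last == 256, key.objectid == 260 */
__btrfs_add_free_space(ctl, last + 1,            /* offset 257 */
                       key.objectid - last - 1); /* count 3 (257..259) */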
138
139static void start_caching(struct btrfs_root *root)
140{
141 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
142 struct task_struct *tsk;
143 int ret;
144 u64 objectid;
145
146 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
147 return;
148
149 spin_lock(&root->cache_lock);
150 if (root->cached != BTRFS_CACHE_NO) {
151 spin_unlock(&root->cache_lock);
152 return;
153 }
154
155 root->cached = BTRFS_CACHE_STARTED;
156 spin_unlock(&root->cache_lock);
157
158 ret = load_free_ino_cache(root->fs_info, root);
159 if (ret == 1) {
160 spin_lock(&root->cache_lock);
161 root->cached = BTRFS_CACHE_FINISHED;
162 spin_unlock(&root->cache_lock);
163 return;
164 }
165
166 /*
167 * It can be quite time-consuming to fill the cache by searching
168 * through the subvolume tree, and this can keep the ino allocation
169 * path waiting. Therefore we quickly find the highest inode number
170 * at startup; we then know that any inode number falling in
171 * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID] is free to use.
172 */
173 ret = btrfs_find_free_objectid(root, &objectid);
174 if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
175 __btrfs_add_free_space(ctl, objectid,
176 BTRFS_LAST_FREE_OBJECTID - objectid + 1);
177 }
178
179 tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
180 root->root_key.objectid);
181 BUG_ON(IS_ERR(tsk));
182}
183
184int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
185{
186 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
187 return btrfs_find_free_objectid(root, objectid);
188
189again:
190 *objectid = btrfs_find_ino_for_alloc(root);
191
192 if (*objectid != 0)
193 return 0;
194
195 start_caching(root);
196
197 wait_event(root->cache_wait,
198 root->cached == BTRFS_CACHE_FINISHED ||
199 root->free_ino_ctl->free_space > 0);
200
201 if (root->cached == BTRFS_CACHE_FINISHED &&
202 root->free_ino_ctl->free_space == 0)
203 return -ENOSPC;
204 else
205 goto again;
206}
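
A hypothetical allocation-path sketch using this API (argument lists abbreviated; btrfs_new_inode() takes more parameters than shown): take a number from the cache, and hand it back on failure so it is not lost until the cache is rebuilt:

struct inode *inode;
u64 objectid;
int err;

err = btrfs_find_free_ino(root, &objectid);
if (err)
        return err;

inode = btrfs_new_inode(trans, root, /* ..., */ objectid /* , ... */);
if (IS_ERR(inode)) {
        btrfs_return_ino(root, objectid);  /* recycle the number */
        return PTR_ERR(inode);
}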
207
208void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
209{
210 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
211 struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
212
213 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
214 return;
215
216again:
217 if (root->cached == BTRFS_CACHE_FINISHED) {
218 __btrfs_add_free_space(ctl, objectid, 1);
219 } else {
220 /*
221 * If we are in the process of caching free ino chunks,
222 * we leave the inode number in the pinned tree until a
223 * transaction is committed or the caching work is done,
224 * to avoid adding the same number to the free_ino tree
225 * twice across transactions.
226 */
227
228 mutex_lock(&root->fs_commit_mutex);
229 spin_lock(&root->cache_lock);
230 if (root->cached == BTRFS_CACHE_FINISHED) {
231 spin_unlock(&root->cache_lock);
232 mutex_unlock(&root->fs_commit_mutex);
233 goto again;
234 }
235 spin_unlock(&root->cache_lock);
236
237 start_caching(root);
238
239 if (objectid <= root->cache_progress ||
240 objectid > root->highest_objectid)
241 __btrfs_add_free_space(ctl, objectid, 1);
242 else
243 __btrfs_add_free_space(pinned, objectid, 1);
244
245 mutex_unlock(&root->fs_commit_mutex);
246 }
247}
248
249/*
250 * When a transaction is committed, we'll move those inode numbers which
251 * are smaller than root->cache_progress from pinned tree to free_ino tree,
252 * and others will just be dropped, because the commit root we were
253 * searching has changed.
254 *
255 * Must be called with root->fs_commit_mutex held
256 */
257void btrfs_unpin_free_ino(struct btrfs_root *root)
258{
259 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
260 struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
261 struct btrfs_free_space *info;
262 struct rb_node *n;
263 u64 count;
264
265 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
266 return;
267
268 while (1) {
269 n = rb_first(rbroot);
270 if (!n)
271 break;
272
273 info = rb_entry(n, struct btrfs_free_space, offset_index);
274 BUG_ON(info->bitmap);
275
276 if (info->offset > root->cache_progress)
277 goto free;
278 else if (info->offset + info->bytes > root->cache_progress)
279 count = root->cache_progress - info->offset + 1;
280 else
281 count = info->bytes;
282
283 __btrfs_add_free_space(ctl, info->offset, count);
284free:
285 rb_erase(&info->offset_index, rbroot);
286 kfree(info);
287 }
288}
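
A worked example of the straddling case above: with cache_progress at 100 and a pinned chunk of {offset 95, bytes 11} (covering 95..105), count = 100 - 95 + 1 = 6, so only 95..100 go back to the free_ino tree and 101..105 are dropped:

/* info->offset == 95, info->bytes == 11, root->cache_progress == 100 */
count = root->cache_progress - info->offset + 1;  /* 6 */
__btrfs_add_free_space(ctl, info->offset, count); /* returns 95..100 */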
289
290#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
291#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
292
293/*
294 * The goal is to keep the memory used by the free_ino tree from
295 * exceeding what it would use if we stored bitmaps only.
296 */
297static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
298{
299 struct btrfs_free_space *info;
300 struct rb_node *n;
301 int max_ino;
302 int max_bitmaps;
303
304 n = rb_last(&ctl->free_space_offset);
305 if (!n) {
306 ctl->extents_thresh = INIT_THRESHOLD;
307 return;
308 }
309 info = rb_entry(n, struct btrfs_free_space, offset_index);
310
311 /*
312 * Find the maximum inode number in the filesystem. Note that we
313 * ignore the fact that this can be a bitmap, because we are not
314 * doing a precise calculation.
315 */
316 max_ino = info->bytes - 1;
317
318 max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
319 if (max_bitmaps <= ctl->total_bitmaps) {
320 ctl->extents_thresh = 0;
321 return;
322 }
323
324 ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
325 PAGE_CACHE_SIZE / sizeof(*info);
326}
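
A compilable worked example of the threshold math above, under stated assumptions: PAGE_CACHE_SIZE of 4096 and a 64-byte struct btrfs_free_space (both are build-dependent and chosen only for illustration):

#include <stdio.h>

int main(void)
{
        unsigned long entry_size = 64;   /* assumed sizeof(struct btrfs_free_space) */
        unsigned long page_size = 4096;  /* assumed PAGE_CACHE_SIZE */
        unsigned long init_thresh = ((1024 * 32) / 2) / entry_size;
        unsigned long per_page = page_size / entry_size;

        /* A 16K budget buys 256 cached extent entries before bitmaps
         * kick in, and every bitmap page not yet in use leaves room for
         * 64 more extent entries. */
        printf("INIT_THRESHOLD=%lu extents_per_page=%lu\n",
               init_thresh, per_page);
        return 0;
}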
327
328/*
329 * We don't fall back to a bitmap if we are below the extents
330 * threshold or if this chunk of inode numbers is a big one.
331 */
332static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
333 struct btrfs_free_space *info)
334{
335 if (ctl->free_extents < ctl->extents_thresh ||
336 info->bytes > INODES_PER_BITMAP / 10)
337 return false;
338
339 return true;
340}
341
342static struct btrfs_free_space_op free_ino_op = {
343 .recalc_thresholds = recalculate_thresholds,
344 .use_bitmap = use_bitmap,
345};
346
347static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
348{
349}
350
351static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
352 struct btrfs_free_space *info)
353{
354 /*
355 * We always use extents for two reasons:
356 *
357 * - The pinned tree is only used while the caching work is
358 * in progress.
359 * - It keeps the code simpler. See btrfs_unpin_free_ino().
360 */
361 return false;
362}
363
364static struct btrfs_free_space_op pinned_free_ino_op = {
365 .recalc_thresholds = pinned_recalc_thresholds,
366 .use_bitmap = pinned_use_bitmap,
367};
368
369void btrfs_init_free_ino_ctl(struct btrfs_root *root)
370{
371 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
372 struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
373
374 spin_lock_init(&ctl->tree_lock);
375 ctl->unit = 1;
376 ctl->start = 0;
377 ctl->private = NULL;
378 ctl->op = &free_ino_op;
379
380 /*
381 * Initially we allow the use of 16K of ram to cache chunks of
382 * inode numbers before we resort to bitmaps. This is somewhat
383 * arbitrary, but it will be adjusted in runtime.
384 */
385 ctl->extents_thresh = INIT_THRESHOLD;
386
387 spin_lock_init(&pinned->tree_lock);
388 pinned->unit = 1;
389 pinned->start = 0;
390 pinned->private = NULL;
391 pinned->extents_thresh = 0;
392 pinned->op = &pinned_free_ino_op;
393}
394
395int btrfs_save_ino_cache(struct btrfs_root *root,
396 struct btrfs_trans_handle *trans)
397{
398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
399 struct btrfs_path *path;
400 struct inode *inode;
401 u64 alloc_hint = 0;
402 int ret;
403 int prealloc;
404 bool retry = false;
405
406 /* only fs tree and subvol/snap needs ino cache */
407 if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
408 (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
409 root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
410 return 0;
411
412 /* Don't save inode cache if we are deleting this root */
413 if (btrfs_root_refs(&root->root_item) == 0 &&
414 root != root->fs_info->tree_root)
415 return 0;
416
417 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
418 return 0;
419
420 path = btrfs_alloc_path();
421 if (!path)
422 return -ENOMEM;
423
424again:
425 inode = lookup_free_ino_inode(root, path);
426 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
427 ret = PTR_ERR(inode);
428 goto out;
429 }
430
431 if (IS_ERR(inode)) {
432 BUG_ON(retry);
433 retry = true;
434
435 ret = create_free_ino_inode(root, trans, path);
436 if (ret)
437 goto out;
438 goto again;
439 }
440
441 BTRFS_I(inode)->generation = 0;
442 ret = btrfs_update_inode(trans, root, inode);
443 WARN_ON(ret);
444
445 if (i_size_read(inode) > 0) {
446 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
447 if (ret)
448 goto out_put;
449 }
450
451 spin_lock(&root->cache_lock);
452 if (root->cached != BTRFS_CACHE_FINISHED) {
453 ret = -1;
454 spin_unlock(&root->cache_lock);
455 goto out_put;
456 }
457 spin_unlock(&root->cache_lock);
458
459 spin_lock(&ctl->tree_lock);
460 prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
461 prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
462 prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
463 spin_unlock(&ctl->tree_lock);
464
465 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE;
467
468 ret = btrfs_check_data_free_space(inode, prealloc);
469 if (ret)
470 goto out_put;
471
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint);
474 if (ret)
475 goto out_put;
476 btrfs_free_reserved_data_space(inode, prealloc);
477
478out_put:
479 iput(inode);
480out:
481 if (ret == 0)
482 ret = btrfs_write_out_ino_cache(root, trans, path);
483
484 btrfs_free_path(path);
485 return ret;
486}
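
A worked example of the preallocation sizing above, assuming 4K pages and 64-byte entries (both build-dependent): with 1000 cached extents and 2 bitmaps, the reservation made before the cache is written out comes to ALIGN(64000, 4096) + 2 * 4096 + 8 * 4096 = 65536 + 8192 + 32768 = 106496 bytes:

/* ctl->free_extents == 1000, ctl->total_bitmaps == 2 (illustrative) */
prealloc = ALIGN(1000 * 64, 4096);      /*  65536 */
prealloc += 2 * 4096;                   /* + 8192 */
prealloc += 8 * 4096;                   /* +32768 -> 106496 total */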
487
488static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
24{ 489{
25 struct btrfs_path *path; 490 struct btrfs_path *path;
26 int ret; 491 int ret;
@@ -30,7 +495,8 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
30 int slot; 495 int slot;
31 496
32 path = btrfs_alloc_path(); 497 path = btrfs_alloc_path();
33 BUG_ON(!path); 498 if (!path)
499 return -ENOMEM;
34 500
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID; 501 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1; 502 search_key.type = -1;
@@ -54,15 +520,14 @@ error:
54 return ret; 520 return ret;
55} 521}
56 522
57int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, 523int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
58 struct btrfs_root *root,
59 u64 dirid, u64 *objectid)
60{ 524{
61 int ret; 525 int ret;
62 mutex_lock(&root->objectid_mutex); 526 mutex_lock(&root->objectid_mutex);
63 527
64 if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { 528 if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
65 ret = btrfs_find_highest_inode(root, &root->highest_objectid); 529 ret = btrfs_find_highest_objectid(root,
530 &root->highest_objectid);
66 if (ret) 531 if (ret)
67 goto out; 532 goto out;
68 } 533 }
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
new file mode 100644
index 000000000000..ddb347bfee23
--- /dev/null
+++ b/fs/btrfs/inode-map.h
@@ -0,0 +1,13 @@
1#ifndef __BTRFS_INODE_MAP
2#define __BTRFS_INODE_MAP
3
4void btrfs_init_free_ino_ctl(struct btrfs_root *root);
5void btrfs_unpin_free_ino(struct btrfs_root *root);
6void btrfs_return_ino(struct btrfs_root *root, u64 objectid);
7int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid);
8int btrfs_save_ino_cache(struct btrfs_root *root,
9 struct btrfs_trans_handle *trans);
10
11int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
12
13#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c03864406af3..3601f0aebddf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -37,6 +37,7 @@
37#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/ratelimit.h>
40#include "compat.h" 41#include "compat.h"
41#include "ctree.h" 42#include "ctree.h"
42#include "disk-io.h" 43#include "disk-io.h"
@@ -50,6 +51,8 @@
50#include "tree-log.h" 51#include "tree-log.h"
51#include "compression.h" 52#include "compression.h"
52#include "locking.h" 53#include "locking.h"
54#include "free-space-cache.h"
55#include "inode-map.h"
53 56
54struct btrfs_iget_args { 57struct btrfs_iget_args {
55 u64 ino; 58 u64 ino;
@@ -70,6 +73,7 @@ static struct kmem_cache *btrfs_inode_cachep;
70struct kmem_cache *btrfs_trans_handle_cachep; 73struct kmem_cache *btrfs_trans_handle_cachep;
71struct kmem_cache *btrfs_transaction_cachep; 74struct kmem_cache *btrfs_transaction_cachep;
72struct kmem_cache *btrfs_path_cachep; 75struct kmem_cache *btrfs_path_cachep;
76struct kmem_cache *btrfs_free_space_cachep;
73 77
74#define S_SHIFT 12 78#define S_SHIFT 12
75static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { 79static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -82,7 +86,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
82 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 86 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
83}; 87};
84 88
85static void btrfs_truncate(struct inode *inode); 89static int btrfs_setsize(struct inode *inode, loff_t newsize);
90static int btrfs_truncate(struct inode *inode);
86static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 91static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
87static noinline int cow_file_range(struct inode *inode, 92static noinline int cow_file_range(struct inode *inode,
88 struct page *locked_page, 93 struct page *locked_page,
@@ -90,13 +95,14 @@ static noinline int cow_file_range(struct inode *inode,
90 unsigned long *nr_written, int unlock); 95 unsigned long *nr_written, int unlock);
91 96
92static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 97static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
93 struct inode *inode, struct inode *dir) 98 struct inode *inode, struct inode *dir,
99 const struct qstr *qstr)
94{ 100{
95 int err; 101 int err;
96 102
97 err = btrfs_init_acl(trans, inode, dir); 103 err = btrfs_init_acl(trans, inode, dir);
98 if (!err) 104 if (!err)
99 err = btrfs_xattr_security_init(trans, inode, dir); 105 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
100 return err; 106 return err;
101} 107}
102 108
@@ -108,6 +114,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
108static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, 114static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root, struct inode *inode, 115 struct btrfs_root *root, struct inode *inode,
110 u64 start, size_t size, size_t compressed_size, 116 u64 start, size_t size, size_t compressed_size,
117 int compress_type,
111 struct page **compressed_pages) 118 struct page **compressed_pages)
112{ 119{
113 struct btrfs_key key; 120 struct btrfs_key key;
@@ -122,21 +129,17 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
122 size_t cur_size = size; 129 size_t cur_size = size;
123 size_t datasize; 130 size_t datasize;
124 unsigned long offset; 131 unsigned long offset;
125 int use_compress = 0;
126 132
127 if (compressed_size && compressed_pages) { 133 if (compressed_size && compressed_pages)
128 use_compress = 1;
129 cur_size = compressed_size; 134 cur_size = compressed_size;
130 }
131 135
132 path = btrfs_alloc_path(); 136 path = btrfs_alloc_path();
133 if (!path) 137 if (!path)
134 return -ENOMEM; 138 return -ENOMEM;
135 139
136 path->leave_spinning = 1; 140 path->leave_spinning = 1;
137 btrfs_set_trans_block_group(trans, inode);
138 141
139 key.objectid = inode->i_ino; 142 key.objectid = btrfs_ino(inode);
140 key.offset = start; 143 key.offset = start;
141 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 144 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
142 datasize = btrfs_file_extent_calc_inline_size(cur_size); 145 datasize = btrfs_file_extent_calc_inline_size(cur_size);
@@ -159,7 +162,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
159 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 162 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
160 ptr = btrfs_file_extent_inline_start(ei); 163 ptr = btrfs_file_extent_inline_start(ei);
161 164
162 if (use_compress) { 165 if (compress_type != BTRFS_COMPRESS_NONE) {
163 struct page *cpage; 166 struct page *cpage;
164 int i = 0; 167 int i = 0;
165 while (compressed_size > 0) { 168 while (compressed_size > 0) {
@@ -176,7 +179,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
176 compressed_size -= cur_size; 179 compressed_size -= cur_size;
177 } 180 }
178 btrfs_set_file_extent_compression(leaf, ei, 181 btrfs_set_file_extent_compression(leaf, ei,
179 BTRFS_COMPRESS_ZLIB); 182 compress_type);
180 } else { 183 } else {
181 page = find_get_page(inode->i_mapping, 184 page = find_get_page(inode->i_mapping,
182 start >> PAGE_CACHE_SHIFT); 185 start >> PAGE_CACHE_SHIFT);
@@ -217,7 +220,7 @@ fail:
217static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, 220static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
218 struct btrfs_root *root, 221 struct btrfs_root *root,
219 struct inode *inode, u64 start, u64 end, 222 struct inode *inode, u64 start, u64 end,
220 size_t compressed_size, 223 size_t compressed_size, int compress_type,
221 struct page **compressed_pages) 224 struct page **compressed_pages)
222{ 225{
223 u64 isize = i_size_read(inode); 226 u64 isize = i_size_read(inode);
@@ -250,7 +253,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
250 inline_len = min_t(u64, isize, actual_end); 253 inline_len = min_t(u64, isize, actual_end);
251 ret = insert_inline_extent(trans, root, inode, start, 254 ret = insert_inline_extent(trans, root, inode, start,
252 inline_len, compressed_size, 255 inline_len, compressed_size,
253 compressed_pages); 256 compress_type, compressed_pages);
254 BUG_ON(ret); 257 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start); 258 btrfs_delalloc_release_metadata(inode, end + 1 - start);
256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 259 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
@@ -263,6 +266,7 @@ struct async_extent {
263 u64 compressed_size; 266 u64 compressed_size;
264 struct page **pages; 267 struct page **pages;
265 unsigned long nr_pages; 268 unsigned long nr_pages;
269 int compress_type;
266 struct list_head list; 270 struct list_head list;
267}; 271};
268 272
@@ -280,16 +284,19 @@ static noinline int add_async_extent(struct async_cow *cow,
280 u64 start, u64 ram_size, 284 u64 start, u64 ram_size,
281 u64 compressed_size, 285 u64 compressed_size,
282 struct page **pages, 286 struct page **pages,
283 unsigned long nr_pages) 287 unsigned long nr_pages,
288 int compress_type)
284{ 289{
285 struct async_extent *async_extent; 290 struct async_extent *async_extent;
286 291
287 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 292 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
293 BUG_ON(!async_extent);
288 async_extent->start = start; 294 async_extent->start = start;
289 async_extent->ram_size = ram_size; 295 async_extent->ram_size = ram_size;
290 async_extent->compressed_size = compressed_size; 296 async_extent->compressed_size = compressed_size;
291 async_extent->pages = pages; 297 async_extent->pages = pages;
292 async_extent->nr_pages = nr_pages; 298 async_extent->nr_pages = nr_pages;
299 async_extent->compress_type = compress_type;
293 list_add_tail(&async_extent->list, &cow->extents); 300 list_add_tail(&async_extent->list, &cow->extents);
294 return 0; 301 return 0;
295} 302}
@@ -319,8 +326,6 @@ static noinline int compress_file_range(struct inode *inode,
319 struct btrfs_root *root = BTRFS_I(inode)->root; 326 struct btrfs_root *root = BTRFS_I(inode)->root;
320 struct btrfs_trans_handle *trans; 327 struct btrfs_trans_handle *trans;
321 u64 num_bytes; 328 u64 num_bytes;
322 u64 orig_start;
323 u64 disk_num_bytes;
324 u64 blocksize = root->sectorsize; 329 u64 blocksize = root->sectorsize;
325 u64 actual_end; 330 u64 actual_end;
326 u64 isize = i_size_read(inode); 331 u64 isize = i_size_read(inode);
@@ -334,8 +339,11 @@ static noinline int compress_file_range(struct inode *inode,
334 unsigned long max_uncompressed = 128 * 1024; 339 unsigned long max_uncompressed = 128 * 1024;
335 int i; 340 int i;
336 int will_compress; 341 int will_compress;
342 int compress_type = root->fs_info->compress_type;
337 343
338 orig_start = start; 344 /* if this is a small write inside eof, kick off a defragbot */
345 if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
346 btrfs_add_inode_defrag(NULL, inode);
339 347
340 actual_end = min_t(u64, isize, end + 1); 348 actual_end = min_t(u64, isize, end + 1);
341again: 349again:
@@ -371,7 +379,6 @@ again:
371 total_compressed = min(total_compressed, max_uncompressed); 379 total_compressed = min(total_compressed, max_uncompressed);
372 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 380 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
373 num_bytes = max(blocksize, num_bytes); 381 num_bytes = max(blocksize, num_bytes);
374 disk_num_bytes = num_bytes;
375 total_in = 0; 382 total_in = 0;
376 ret = 0; 383 ret = 0;
377 384
@@ -382,16 +389,22 @@ again:
382 */ 389 */
383 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 390 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
384 (btrfs_test_opt(root, COMPRESS) || 391 (btrfs_test_opt(root, COMPRESS) ||
385 (BTRFS_I(inode)->force_compress))) { 392 (BTRFS_I(inode)->force_compress) ||
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
386 WARN_ON(pages); 394 WARN_ON(pages);
387 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages);
397
398 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress;
388 400
389 ret = btrfs_zlib_compress_pages(inode->i_mapping, start, 401 ret = btrfs_compress_pages(compress_type,
390 total_compressed, pages, 402 inode->i_mapping, start,
391 nr_pages, &nr_pages_ret, 403 total_compressed, pages,
392 &total_in, 404 nr_pages, &nr_pages_ret,
393 &total_compressed, 405 &total_in,
394 max_compressed); 406 &total_compressed,
407 max_compressed);
395 408
396 if (!ret) { 409 if (!ret) {
397 unsigned long offset = total_compressed & 410 unsigned long offset = total_compressed &
@@ -412,9 +425,8 @@ again:
412 } 425 }
413 } 426 }
414 if (start == 0) { 427 if (start == 0) {
415 trans = btrfs_join_transaction(root, 1); 428 trans = btrfs_join_transaction(root);
416 BUG_ON(!trans); 429 BUG_ON(IS_ERR(trans));
417 btrfs_set_trans_block_group(trans, inode);
418 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 430 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
419 431
420 /* lets try to make an inline extent */ 432 /* lets try to make an inline extent */
@@ -423,12 +435,13 @@ again:
423 * to make an uncompressed inline extent. 435 * to make an uncompressed inline extent.
424 */ 436 */
425 ret = cow_file_range_inline(trans, root, inode, 437 ret = cow_file_range_inline(trans, root, inode,
426 start, end, 0, NULL); 438 start, end, 0, 0, NULL);
427 } else { 439 } else {
428 /* try making a compressed inline extent */ 440 /* try making a compressed inline extent */
429 ret = cow_file_range_inline(trans, root, inode, 441 ret = cow_file_range_inline(trans, root, inode,
430 start, end, 442 start, end,
431 total_compressed, pages); 443 total_compressed,
444 compress_type, pages);
432 } 445 }
433 if (ret == 0) { 446 if (ret == 0) {
434 /* 447 /*
@@ -467,7 +480,6 @@ again:
467 if (total_compressed >= total_in) { 480 if (total_compressed >= total_in) {
468 will_compress = 0; 481 will_compress = 0;
469 } else { 482 } else {
470 disk_num_bytes = total_compressed;
471 num_bytes = total_in; 483 num_bytes = total_in;
472 } 484 }
473 } 485 }
@@ -499,9 +511,10 @@ again:
499 * and will submit them to the elevator. 511 * and will submit them to the elevator.
500 */ 512 */
501 add_async_extent(async_cow, start, num_bytes, 513 add_async_extent(async_cow, start, num_bytes,
502 total_compressed, pages, nr_pages_ret); 514 total_compressed, pages, nr_pages_ret,
515 compress_type);
503 516
504 if (start + num_bytes < end && start + num_bytes < actual_end) { 517 if (start + num_bytes < end) {
505 start += num_bytes; 518 start += num_bytes;
506 pages = NULL; 519 pages = NULL;
507 cond_resched(); 520 cond_resched();
@@ -521,7 +534,8 @@ cleanup_and_bail_uncompressed:
521 __set_page_dirty_nobuffers(locked_page); 534 __set_page_dirty_nobuffers(locked_page);
522 /* unlocked later on in the async handlers */ 535 /* unlocked later on in the async handlers */
523 } 536 }
524 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); 537 add_async_extent(async_cow, start, end - start + 1,
538 0, NULL, 0, BTRFS_COMPRESS_NONE);
525 *num_added += 1; 539 *num_added += 1;
526 } 540 }
527 541
@@ -607,7 +621,9 @@ retry:
607 async_extent->start + async_extent->ram_size - 1, 621 async_extent->start + async_extent->ram_size - 1,
608 GFP_NOFS); 622 GFP_NOFS);
609 623
610 trans = btrfs_join_transaction(root, 1); 624 trans = btrfs_join_transaction(root);
625 BUG_ON(IS_ERR(trans));
626 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
611 ret = btrfs_reserve_extent(trans, root, 627 ret = btrfs_reserve_extent(trans, root,
612 async_extent->compressed_size, 628 async_extent->compressed_size,
613 async_extent->compressed_size, 629 async_extent->compressed_size,
@@ -638,7 +654,8 @@ retry:
638 async_extent->start + 654 async_extent->start +
639 async_extent->ram_size - 1, 0); 655 async_extent->ram_size - 1, 0);
640 656
641 em = alloc_extent_map(GFP_NOFS); 657 em = alloc_extent_map();
658 BUG_ON(!em);
642 em->start = async_extent->start; 659 em->start = async_extent->start;
643 em->len = async_extent->ram_size; 660 em->len = async_extent->ram_size;
644 em->orig_start = em->start; 661 em->orig_start = em->start;
@@ -646,6 +663,7 @@ retry:
646 em->block_start = ins.objectid; 663 em->block_start = ins.objectid;
647 em->block_len = ins.offset; 664 em->block_len = ins.offset;
648 em->bdev = root->fs_info->fs_devices->latest_bdev; 665 em->bdev = root->fs_info->fs_devices->latest_bdev;
666 em->compress_type = async_extent->compress_type;
649 set_bit(EXTENT_FLAG_PINNED, &em->flags); 667 set_bit(EXTENT_FLAG_PINNED, &em->flags);
650 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 668 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
651 669
@@ -662,11 +680,13 @@ retry:
662 async_extent->ram_size - 1, 0); 680 async_extent->ram_size - 1, 0);
663 } 681 }
664 682
665 ret = btrfs_add_ordered_extent(inode, async_extent->start, 683 ret = btrfs_add_ordered_extent_compress(inode,
666 ins.objectid, 684 async_extent->start,
667 async_extent->ram_size, 685 ins.objectid,
668 ins.offset, 686 async_extent->ram_size,
669 BTRFS_ORDERED_COMPRESSED); 687 ins.offset,
688 BTRFS_ORDERED_COMPRESSED,
689 async_extent->compress_type);
670 BUG_ON(ret); 690 BUG_ON(ret);
671 691
672 /* 692 /*
@@ -730,6 +750,15 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
730 return alloc_hint; 750 return alloc_hint;
731} 751}
732 752
753static inline bool is_free_space_inode(struct btrfs_root *root,
754 struct inode *inode)
755{
756 if (root == root->fs_info->tree_root ||
757 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
758 return true;
759 return false;
760}
761
733/* 762/*
734 * when extent_io.c finds a delayed allocation range in the file, 763 * when extent_io.c finds a delayed allocation range in the file,
735 * the callbacks end up in this code. The basic idea is to 764 * the callbacks end up in this code. The basic idea is to
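
The is_free_space_inode() helper introduced above gates most of the changes that follow (nolock transaction joins, delalloc list handling, ordered-extent completion). A minimal userspace restatement of the predicate, assuming BTRFS_FREE_INO_OBJECTID is the -12ULL constant from ctree.h and reducing roots to opaque tags:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* sketch only: the objectid mirrors what I take BTRFS_FREE_INO_OBJECTID
 * to be; root identity is reduced to an integer tag */
#define FREE_INO_OBJECTID ((uint64_t)-12)

static bool is_free_space_inode(int root_tag, int tree_root_tag,
				uint64_t location_objectid)
{
	/* per-block-group space cache inodes live in the tree root;
	 * the per-root inode-number cache uses the FREE_INO objectid */
	return root_tag == tree_root_tag ||
	       location_objectid == FREE_INO_OBJECTID;
}

int main(void)
{
	printf("%d %d %d\n",
	       is_free_space_inode(0, 0, 256),			/* 1 */
	       is_free_space_inode(1, 0, FREE_INO_OBJECTID),	/* 1 */
	       is_free_space_inode(1, 0, 256));			/* 0 */
	return 0;
}
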
@@ -757,29 +786,29 @@ static noinline int cow_file_range(struct inode *inode,
757 u64 disk_num_bytes; 786 u64 disk_num_bytes;
758 u64 cur_alloc_size; 787 u64 cur_alloc_size;
759 u64 blocksize = root->sectorsize; 788 u64 blocksize = root->sectorsize;
760 u64 actual_end;
761 u64 isize = i_size_read(inode);
762 struct btrfs_key ins; 789 struct btrfs_key ins;
763 struct extent_map *em; 790 struct extent_map *em;
764 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 791 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
765 int ret = 0; 792 int ret = 0;
766 793
767 trans = btrfs_join_transaction(root, 1); 794 BUG_ON(is_free_space_inode(root, inode));
768 BUG_ON(!trans); 795 trans = btrfs_join_transaction(root);
769 btrfs_set_trans_block_group(trans, inode); 796 BUG_ON(IS_ERR(trans));
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 797 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
771 798
772 actual_end = min_t(u64, isize, end + 1);
773
774 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 799 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
775 num_bytes = max(blocksize, num_bytes); 800 num_bytes = max(blocksize, num_bytes);
776 disk_num_bytes = num_bytes; 801 disk_num_bytes = num_bytes;
777 ret = 0; 802 ret = 0;
778 803
804 /* if this is a small write inside eof, kick off defrag */
805 if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
806 btrfs_add_inode_defrag(trans, inode);
807
779 if (start == 0) { 808 if (start == 0) {
780 /* lets try to make an inline extent */ 809 /* lets try to make an inline extent */
781 ret = cow_file_range_inline(trans, root, inode, 810 ret = cow_file_range_inline(trans, root, inode,
782 start, end, 0, NULL); 811 start, end, 0, 0, NULL);
783 if (ret == 0) { 812 if (ret == 0) {
784 extent_clear_unlock_delalloc(inode, 813 extent_clear_unlock_delalloc(inode,
785 &BTRFS_I(inode)->io_tree, 814 &BTRFS_I(inode)->io_tree,
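
The rounding above, num_bytes = (end - start + blocksize) & ~(blocksize - 1), converts the inclusive byte range [start, end] into a whole number of sectors: end - start is length - 1, so this is the standard (len + bs - 1) & ~(bs - 1) round-up for a power-of-two blocksize. A standalone check of the arithmetic:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* cow_file_range()'s sector round-up, extracted for inspection */
static uint64_t range_to_bytes(uint64_t start, uint64_t end, uint64_t bs)
{
	return (end - start + bs) & ~(bs - 1);
}

int main(void)
{
	/* 5000 bytes ([0, 4999]) on 4 KiB sectors -> two sectors */
	assert(range_to_bytes(0, 4999, 4096) == 8192);
	/* since end >= start the result is already at least one sector,
	 * so the kernel's following max(blocksize, num_bytes) is defensive */
	assert(range_to_bytes(0, 0, 4096) == 4096);
	printf("ok\n");
	return 0;
}
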
@@ -814,7 +843,8 @@ static noinline int cow_file_range(struct inode *inode,
814 (u64)-1, &ins, 1); 843 (u64)-1, &ins, 1);
815 BUG_ON(ret); 844 BUG_ON(ret);
816 845
817 em = alloc_extent_map(GFP_NOFS); 846 em = alloc_extent_map();
847 BUG_ON(!em);
818 em->start = start; 848 em->start = start;
819 em->orig_start = em->start; 849 em->orig_start = em->start;
820 ram_size = ins.offset; 850 ram_size = ins.offset;
@@ -941,6 +971,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
941 1, 0, NULL, GFP_NOFS); 971 1, 0, NULL, GFP_NOFS);
942 while (start < end) { 972 while (start < end) {
943 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 973 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
974 BUG_ON(!async_cow);
944 async_cow->inode = inode; 975 async_cow->inode = inode;
945 async_cow->root = root; 976 async_cow->root = root;
946 async_cow->locked_page = locked_page; 977 async_cow->locked_page = locked_page;
@@ -994,7 +1025,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
994 LIST_HEAD(list); 1025 LIST_HEAD(list);
995 1026
996 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, 1027 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
997 bytenr + num_bytes - 1, &list); 1028 bytenr + num_bytes - 1, &list, 0);
998 if (ret == 0 && list_empty(&list)) 1029 if (ret == 0 && list_empty(&list))
999 return 0; 1030 return 0;
1000 1031
@@ -1035,23 +1066,33 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1035 int type; 1066 int type;
1036 int nocow; 1067 int nocow;
1037 int check_prev = 1; 1068 int check_prev = 1;
1069 bool nolock;
1070 u64 ino = btrfs_ino(inode);
1038 1071
1039 path = btrfs_alloc_path(); 1072 path = btrfs_alloc_path();
1040 BUG_ON(!path); 1073 BUG_ON(!path);
1041 trans = btrfs_join_transaction(root, 1); 1074
1042 BUG_ON(!trans); 1075 nolock = is_free_space_inode(root, inode);
1076
1077 if (nolock)
1078 trans = btrfs_join_transaction_nolock(root);
1079 else
1080 trans = btrfs_join_transaction(root);
1081
1082 BUG_ON(IS_ERR(trans));
1083 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1043 1084
1044 cow_start = (u64)-1; 1085 cow_start = (u64)-1;
1045 cur_offset = start; 1086 cur_offset = start;
1046 while (1) { 1087 while (1) {
1047 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 1088 ret = btrfs_lookup_file_extent(trans, root, path, ino,
1048 cur_offset, 0); 1089 cur_offset, 0);
1049 BUG_ON(ret < 0); 1090 BUG_ON(ret < 0);
1050 if (ret > 0 && path->slots[0] > 0 && check_prev) { 1091 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1051 leaf = path->nodes[0]; 1092 leaf = path->nodes[0];
1052 btrfs_item_key_to_cpu(leaf, &found_key, 1093 btrfs_item_key_to_cpu(leaf, &found_key,
1053 path->slots[0] - 1); 1094 path->slots[0] - 1);
1054 if (found_key.objectid == inode->i_ino && 1095 if (found_key.objectid == ino &&
1055 found_key.type == BTRFS_EXTENT_DATA_KEY) 1096 found_key.type == BTRFS_EXTENT_DATA_KEY)
1056 path->slots[0]--; 1097 path->slots[0]--;
1057 } 1098 }
@@ -1072,7 +1113,7 @@ next_slot:
1072 num_bytes = 0; 1113 num_bytes = 0;
1073 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1114 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1074 1115
1075 if (found_key.objectid > inode->i_ino || 1116 if (found_key.objectid > ino ||
1076 found_key.type > BTRFS_EXTENT_DATA_KEY || 1117 found_key.type > BTRFS_EXTENT_DATA_KEY ||
1077 found_key.offset > end) 1118 found_key.offset > end)
1078 break; 1119 break;
@@ -1107,7 +1148,7 @@ next_slot:
1107 goto out_check; 1148 goto out_check;
1108 if (btrfs_extent_readonly(root, disk_bytenr)) 1149 if (btrfs_extent_readonly(root, disk_bytenr))
1109 goto out_check; 1150 goto out_check;
1110 if (btrfs_cross_ref_exist(trans, root, inode->i_ino, 1151 if (btrfs_cross_ref_exist(trans, root, ino,
1111 found_key.offset - 1152 found_key.offset -
1112 extent_offset, disk_bytenr)) 1153 extent_offset, disk_bytenr))
1113 goto out_check; 1154 goto out_check;
@@ -1144,7 +1185,7 @@ out_check:
1144 goto next_slot; 1185 goto next_slot;
1145 } 1186 }
1146 1187
1147 btrfs_release_path(root, path); 1188 btrfs_release_path(path);
1148 if (cow_start != (u64)-1) { 1189 if (cow_start != (u64)-1) {
1149 ret = cow_file_range(inode, locked_page, cow_start, 1190 ret = cow_file_range(inode, locked_page, cow_start,
1150 found_key.offset - 1, page_started, 1191 found_key.offset - 1, page_started,
@@ -1157,7 +1198,8 @@ out_check:
1157 struct extent_map *em; 1198 struct extent_map *em;
1158 struct extent_map_tree *em_tree; 1199 struct extent_map_tree *em_tree;
1159 em_tree = &BTRFS_I(inode)->extent_tree; 1200 em_tree = &BTRFS_I(inode)->extent_tree;
1160 em = alloc_extent_map(GFP_NOFS); 1201 em = alloc_extent_map();
1202 BUG_ON(!em);
1161 em->start = cur_offset; 1203 em->start = cur_offset;
1162 em->orig_start = em->start; 1204 em->orig_start = em->start;
1163 em->len = num_bytes; 1205 em->len = num_bytes;
@@ -1201,7 +1243,7 @@ out_check:
1201 if (cur_offset > end) 1243 if (cur_offset > end)
1202 break; 1244 break;
1203 } 1245 }
1204 btrfs_release_path(root, path); 1246 btrfs_release_path(path);
1205 1247
1206 if (cur_offset <= end && cow_start == (u64)-1) 1248 if (cur_offset <= end && cow_start == (u64)-1)
1207 cow_start = cur_offset; 1249 cow_start = cur_offset;
@@ -1211,8 +1253,13 @@ out_check:
1211 BUG_ON(ret); 1253 BUG_ON(ret);
1212 } 1254 }
1213 1255
1214 ret = btrfs_end_transaction(trans, root); 1256 if (nolock) {
1215 BUG_ON(ret); 1257 ret = btrfs_end_transaction_nolock(trans, root);
1258 BUG_ON(ret);
1259 } else {
1260 ret = btrfs_end_transaction(trans, root);
1261 BUG_ON(ret);
1262 }
1216 btrfs_free_path(path); 1263 btrfs_free_path(path);
1217 return 0; 1264 return 0;
1218} 1265}
@@ -1234,7 +1281,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1234 ret = run_delalloc_nocow(inode, locked_page, start, end, 1281 ret = run_delalloc_nocow(inode, locked_page, start, end,
1235 page_started, 0, nr_written); 1282 page_started, 0, nr_written);
1236 else if (!btrfs_test_opt(root, COMPRESS) && 1283 else if (!btrfs_test_opt(root, COMPRESS) &&
1237 !(BTRFS_I(inode)->force_compress)) 1284 !(BTRFS_I(inode)->force_compress) &&
1285 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
1238 ret = cow_file_range(inode, locked_page, start, end, 1286 ret = cow_file_range(inode, locked_page, start, end,
1239 page_started, nr_written, 1); 1287 page_started, nr_written, 1);
1240 else 1288 else
@@ -1283,12 +1331,13 @@ static int btrfs_set_bit_hook(struct inode *inode,
1283 1331
1284 /* 1332 /*
1285 * set_bit and clear bit hooks normally require _irqsave/restore 1333 * set_bit and clear bit hooks normally require _irqsave/restore
1286 * but in this case, we are only testeing for the DELALLOC 1334 * but in this case, we are only testing for the DELALLOC
1287 * bit, which is only set or cleared with irqs on 1335 * bit, which is only set or cleared with irqs on
1288 */ 1336 */
1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1337 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1290 struct btrfs_root *root = BTRFS_I(inode)->root; 1338 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start; 1339 u64 len = state->end + 1 - state->start;
1340 bool do_list = !is_free_space_inode(root, inode);
1292 1341
1293 if (*bits & EXTENT_FIRST_DELALLOC) 1342 if (*bits & EXTENT_FIRST_DELALLOC)
1294 *bits &= ~EXTENT_FIRST_DELALLOC; 1343 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1298,7 +1347,7 @@ static int btrfs_set_bit_hook(struct inode *inode,
1298 spin_lock(&root->fs_info->delalloc_lock); 1347 spin_lock(&root->fs_info->delalloc_lock);
1299 BTRFS_I(inode)->delalloc_bytes += len; 1348 BTRFS_I(inode)->delalloc_bytes += len;
1300 root->fs_info->delalloc_bytes += len; 1349 root->fs_info->delalloc_bytes += len;
1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1350 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1351 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1303 &root->fs_info->delalloc_inodes); 1352 &root->fs_info->delalloc_inodes);
1304 } 1353 }
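
The new do_list flag in both hooks keeps free-space inodes out of the fs-wide delalloc list while still counting their bytes. A toy model of that split, with stand-in types in place of the kernel's lists and locks:

#include <stdbool.h>
#include <stdio.h>

struct toy_inode { unsigned long delalloc_bytes; bool on_list; };
static unsigned long fs_delalloc_bytes;

static void set_delalloc(struct toy_inode *i, unsigned long len, bool do_list)
{
	i->delalloc_bytes += len;
	fs_delalloc_bytes += len;
	if (do_list && !i->on_list)
		i->on_list = true;	/* list_add_tail() in the kernel */
}

static void clear_delalloc(struct toy_inode *i, unsigned long len, bool do_list)
{
	i->delalloc_bytes -= len;
	fs_delalloc_bytes -= len;
	if (do_list && i->delalloc_bytes == 0 && i->on_list)
		i->on_list = false;	/* list_del_init() in the kernel */
}

int main(void)
{
	struct toy_inode cache = {0}, file = {0};
	set_delalloc(&file, 4096, true);
	set_delalloc(&cache, 4096, false);	/* counted, never listed */
	printf("fs=%lu file_listed=%d cache_listed=%d\n",
	       fs_delalloc_bytes, file.on_list, cache.on_list);
	clear_delalloc(&file, 4096, true);
	clear_delalloc(&cache, 4096, false);
	return 0;
}
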
@@ -1315,12 +1364,13 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1315{ 1364{
1316 /* 1365 /*
1317 * set_bit and clear bit hooks normally require _irqsave/restore 1366 * set_bit and clear bit hooks normally require _irqsave/restore
1318 * but in this case, we are only testeing for the DELALLOC 1367 * but in this case, we are only testing for the DELALLOC
1319 * bit, which is only set or cleared with irqs on 1368 * bit, which is only set or cleared with irqs on
1320 */ 1369 */
1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1370 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1322 struct btrfs_root *root = BTRFS_I(inode)->root; 1371 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start; 1372 u64 len = state->end + 1 - state->start;
1373 bool do_list = !is_free_space_inode(root, inode);
1324 1374
1325 if (*bits & EXTENT_FIRST_DELALLOC) 1375 if (*bits & EXTENT_FIRST_DELALLOC)
1326 *bits &= ~EXTENT_FIRST_DELALLOC; 1376 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1330,14 +1380,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1330 if (*bits & EXTENT_DO_ACCOUNTING) 1380 if (*bits & EXTENT_DO_ACCOUNTING)
1331 btrfs_delalloc_release_metadata(inode, len); 1381 btrfs_delalloc_release_metadata(inode, len);
1332 1382
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) 1383 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1384 && do_list)
1334 btrfs_free_reserved_data_space(inode, len); 1385 btrfs_free_reserved_data_space(inode, len);
1335 1386
1336 spin_lock(&root->fs_info->delalloc_lock); 1387 spin_lock(&root->fs_info->delalloc_lock);
1337 root->fs_info->delalloc_bytes -= len; 1388 root->fs_info->delalloc_bytes -= len;
1338 BTRFS_I(inode)->delalloc_bytes -= len; 1389 BTRFS_I(inode)->delalloc_bytes -= len;
1339 1390
1340 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1391 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1392 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1393 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1343 } 1394 }
@@ -1372,7 +1423,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1372 1423
1373 if (map_length < length + size) 1424 if (map_length < length + size)
1374 return 1; 1425 return 1;
1375 return 0; 1426 return ret;
1376} 1427}
1377 1428
1378/* 1429/*
@@ -1426,15 +1477,21 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1426 1477
1427 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1478 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1428 1479
1429 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1480 if (is_free_space_inode(root, inode))
1481 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
1482 else
1483 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1430 BUG_ON(ret); 1484 BUG_ON(ret);
1431 1485
1432 if (!(rw & REQ_WRITE)) { 1486 if (!(rw & REQ_WRITE)) {
1433 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1487 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1434 return btrfs_submit_compressed_read(inode, bio, 1488 return btrfs_submit_compressed_read(inode, bio,
1435 mirror_num, bio_flags); 1489 mirror_num, bio_flags);
1436 } else if (!skip_sum) 1490 } else if (!skip_sum) {
1437 btrfs_lookup_bio_sums(root, inode, bio, NULL); 1491 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1492 if (ret)
1493 return ret;
1494 }
1438 goto mapit; 1495 goto mapit;
1439 } else if (!skip_sum) { 1496 } else if (!skip_sum) {
1440 /* csum items have already been cloned */ 1497 /* csum items have already been cloned */
@@ -1462,8 +1519,6 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1462{ 1519{
1463 struct btrfs_ordered_sum *sum; 1520 struct btrfs_ordered_sum *sum;
1464 1521
1465 btrfs_set_trans_block_group(trans, inode);
1466
1467 list_for_each_entry(sum, list, list) { 1522 list_for_each_entry(sum, list, list) {
1468 btrfs_csum_file_blocks(trans, 1523 btrfs_csum_file_blocks(trans,
1469 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1524 BTRFS_I(inode)->root->fs_info->csum_root, sum);
@@ -1534,6 +1589,7 @@ out:
1534out_page: 1589out_page:
1535 unlock_page(page); 1590 unlock_page(page);
1536 page_cache_release(page); 1591 page_cache_release(page);
1592 kfree(fixup);
1537} 1593}
1538 1594
1539/* 1595/*
@@ -1605,7 +1661,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1605 &hint, 0); 1661 &hint, 0);
1606 BUG_ON(ret); 1662 BUG_ON(ret);
1607 1663
1608 ins.objectid = inode->i_ino; 1664 ins.objectid = btrfs_ino(inode);
1609 ins.offset = file_pos; 1665 ins.offset = file_pos;
1610 ins.type = BTRFS_EXTENT_DATA_KEY; 1666 ins.type = BTRFS_EXTENT_DATA_KEY;
1611 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); 1667 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
@@ -1636,7 +1692,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1636 ins.type = BTRFS_EXTENT_ITEM_KEY; 1692 ins.type = BTRFS_EXTENT_ITEM_KEY;
1637 ret = btrfs_alloc_reserved_file_extent(trans, root, 1693 ret = btrfs_alloc_reserved_file_extent(trans, root,
1638 root->root_key.objectid, 1694 root->root_key.objectid,
1639 inode->i_ino, file_pos, &ins); 1695 btrfs_ino(inode), file_pos, &ins);
1640 BUG_ON(ret); 1696 BUG_ON(ret);
1641 btrfs_free_path(path); 1697 btrfs_free_path(path);
1642 1698
@@ -1660,8 +1716,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1660 struct btrfs_ordered_extent *ordered_extent = NULL; 1716 struct btrfs_ordered_extent *ordered_extent = NULL;
1661 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1717 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1662 struct extent_state *cached_state = NULL; 1718 struct extent_state *cached_state = NULL;
1663 int compressed = 0; 1719 int compress_type = 0;
1664 int ret; 1720 int ret;
1721 bool nolock;
1665 1722
1666 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 1723 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1667 end - start + 1); 1724 end - start + 1);
@@ -1669,12 +1726,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1669 return 0; 1726 return 0;
1670 BUG_ON(!ordered_extent); 1727 BUG_ON(!ordered_extent);
1671 1728
1729 nolock = is_free_space_inode(root, inode);
1730
1672 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1731 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1673 BUG_ON(!list_empty(&ordered_extent->list)); 1732 BUG_ON(!list_empty(&ordered_extent->list));
1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1733 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1675 if (!ret) { 1734 if (!ret) {
1676 trans = btrfs_join_transaction(root, 1); 1735 if (nolock)
1677 btrfs_set_trans_block_group(trans, inode); 1736 trans = btrfs_join_transaction_nolock(root);
1737 else
1738 trans = btrfs_join_transaction(root);
1739 BUG_ON(IS_ERR(trans));
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1740 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1679 ret = btrfs_update_inode(trans, root, inode); 1741 ret = btrfs_update_inode(trans, root, inode);
1680 BUG_ON(ret); 1742 BUG_ON(ret);
@@ -1686,27 +1748,31 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1686 ordered_extent->file_offset + ordered_extent->len - 1, 1748 ordered_extent->file_offset + ordered_extent->len - 1,
1687 0, &cached_state, GFP_NOFS); 1749 0, &cached_state, GFP_NOFS);
1688 1750
1689 trans = btrfs_join_transaction(root, 1); 1751 if (nolock)
1690 btrfs_set_trans_block_group(trans, inode); 1752 trans = btrfs_join_transaction_nolock(root);
1753 else
1754 trans = btrfs_join_transaction(root);
1755 BUG_ON(IS_ERR(trans));
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1756 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1692 1757
1693 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1758 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1694 compressed = 1; 1759 compress_type = ordered_extent->compress_type;
1695 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1760 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1696 BUG_ON(compressed); 1761 BUG_ON(compress_type);
1697 ret = btrfs_mark_extent_written(trans, inode, 1762 ret = btrfs_mark_extent_written(trans, inode,
1698 ordered_extent->file_offset, 1763 ordered_extent->file_offset,
1699 ordered_extent->file_offset + 1764 ordered_extent->file_offset +
1700 ordered_extent->len); 1765 ordered_extent->len);
1701 BUG_ON(ret); 1766 BUG_ON(ret);
1702 } else { 1767 } else {
1768 BUG_ON(root == root->fs_info->tree_root);
1703 ret = insert_reserved_file_extent(trans, inode, 1769 ret = insert_reserved_file_extent(trans, inode,
1704 ordered_extent->file_offset, 1770 ordered_extent->file_offset,
1705 ordered_extent->start, 1771 ordered_extent->start,
1706 ordered_extent->disk_len, 1772 ordered_extent->disk_len,
1707 ordered_extent->len, 1773 ordered_extent->len,
1708 ordered_extent->len, 1774 ordered_extent->len,
1709 compressed, 0, 0, 1775 compress_type, 0, 0,
1710 BTRFS_FILE_EXTENT_REG); 1776 BTRFS_FILE_EXTENT_REG);
1711 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1777 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1712 ordered_extent->file_offset, 1778 ordered_extent->file_offset,
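
Replacing the old compressed boolean with compress_type records which algorithm produced the extent now that LZO exists alongside zlib; BTRFS_COMPRESS_NONE is 0, so truth tests such as BUG_ON(compress_type) keep their old boolean meaning. A sketch using what I take to be the compression.h values:

#include <stdio.h>

enum {
	BTRFS_COMPRESS_NONE = 0,
	BTRFS_COMPRESS_ZLIB = 1,
	BTRFS_COMPRESS_LZO  = 2,
};

static const char *name(int t)
{
	switch (t) {
	case BTRFS_COMPRESS_ZLIB: return "zlib";
	case BTRFS_COMPRESS_LZO:  return "lzo";
	default:		   return "none";
	}
}

int main(void)
{
	int compress_type = BTRFS_COMPRESS_LZO;
	/* a bool could only say "compressed"; the type says how */
	printf("%s (truthy=%d)\n", name(compress_type), !!compress_type);
	return 0;
}
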
@@ -1720,13 +1786,22 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1720 add_pending_csums(trans, inode, ordered_extent->file_offset, 1786 add_pending_csums(trans, inode, ordered_extent->file_offset,
1721 &ordered_extent->list); 1787 &ordered_extent->list);
1722 1788
1723 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1789 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1724 ret = btrfs_update_inode(trans, root, inode); 1790 if (!ret) {
1725 BUG_ON(ret); 1791 ret = btrfs_update_inode(trans, root, inode);
1792 BUG_ON(ret);
1793 }
1794 ret = 0;
1726out: 1795out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1796 if (nolock) {
1728 if (trans) 1797 if (trans)
1729 btrfs_end_transaction(trans, root); 1798 btrfs_end_transaction_nolock(trans, root);
1799 } else {
1800 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1801 if (trans)
1802 btrfs_end_transaction(trans, root);
1803 }
1804
1730 /* once for us */ 1805 /* once for us */
1731 btrfs_put_ordered_extent(ordered_extent); 1806 btrfs_put_ordered_extent(ordered_extent);
1732 /* once for the tree */ 1807 /* once for the tree */
@@ -1738,6 +1813,8 @@ out:
1738static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1813static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1739 struct extent_state *state, int uptodate) 1814 struct extent_state *state, int uptodate)
1740{ 1815{
1816 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
1817
1741 ClearPagePrivate2(page); 1818 ClearPagePrivate2(page);
1742 return btrfs_finish_ordered_io(page->mapping->host, start, end); 1819 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1743} 1820}
@@ -1793,7 +1870,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1793 } 1870 }
1794 read_unlock(&em_tree->lock); 1871 read_unlock(&em_tree->lock);
1795 1872
1796 if (!em || IS_ERR(em)) { 1873 if (IS_ERR_OR_NULL(em)) {
1797 kfree(failrec); 1874 kfree(failrec);
1798 return -EIO; 1875 return -EIO;
1799 } 1876 }
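
The switch from !em || IS_ERR(em) to IS_ERR_OR_NULL(em) folds both failure shapes into one test. A userspace re-creation of the kernel's err.h idiom (MAX_ERRNO and the pointer encoding are as I recall them; treat this as a sketch):

#include <stdio.h>

#define MAX_ERRNO 4095
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long IS_ERR(const void *p)
{
	/* error codes occupy the top page of the address space */
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}
static inline long IS_ERR_OR_NULL(const void *p)
{
	return !p || IS_ERR(p);
}

int main(void)
{
	void *ok = &ok, *null = NULL, *err = ERR_PTR(-12 /* -ENOMEM */);
	printf("%ld %ld %ld\n", IS_ERR_OR_NULL(ok),
	       IS_ERR_OR_NULL(null), IS_ERR_OR_NULL(err));	/* 0 1 1 */
	return 0;
}
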
@@ -1802,6 +1879,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1802 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 1879 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1803 logical = em->block_start; 1880 logical = em->block_start;
1804 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 1881 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1882 extent_set_compress_type(&failrec->bio_flags,
1883 em->compress_type);
1805 } 1884 }
1806 failrec->logical = logical; 1885 failrec->logical = logical;
1807 free_extent_map(em); 1886 free_extent_map(em);
@@ -1846,10 +1925,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1846 else 1925 else
1847 rw = READ; 1926 rw = READ;
1848 1927
1849 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1928 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1850 failrec->last_mirror, 1929 failrec->last_mirror,
1851 failrec->bio_flags, 0); 1930 failrec->bio_flags, 0);
1852 return 0; 1931 return ret;
1853} 1932}
1854 1933
1855/* 1934/*
@@ -1865,7 +1944,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1865 1944
1866 private = 0; 1945 private = 0;
1867 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 1946 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1868 (u64)-1, 1, EXTENT_DIRTY)) { 1947 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1869 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, 1948 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1870 start, &private_failure); 1949 start, &private_failure);
1871 if (ret == 0) { 1950 if (ret == 0) {
@@ -1907,7 +1986,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1907 } 1986 }
1908 1987
1909 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 1988 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
1910 return 0; 1989 goto good;
1911 1990
1912 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 1991 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1913 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 1992 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -1940,12 +2019,11 @@ good:
1940 return 0; 2019 return 0;
1941 2020
1942zeroit: 2021zeroit:
1943 if (printk_ratelimit()) { 2022 printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u "
1944 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " 2023 "private %llu\n",
1945 "private %llu\n", page->mapping->host->i_ino, 2024 (unsigned long long)btrfs_ino(page->mapping->host),
1946 (unsigned long long)start, csum, 2025 (unsigned long long)start, csum,
1947 (unsigned long long)private); 2026 (unsigned long long)private);
1948 }
1949 memset(kaddr + offset, 1, end - start + 1); 2027 memset(kaddr + offset, 1, end - start + 1);
1950 flush_dcache_page(page); 2028 flush_dcache_page(page);
1951 kunmap_atomic(kaddr, KM_USER0); 2029 kunmap_atomic(kaddr, KM_USER0);
@@ -2161,8 +2239,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2161 insert = 1; 2239 insert = 1;
2162#endif 2240#endif
2163 insert = 1; 2241 insert = 1;
2164 } else {
2165 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2166 } 2242 }
2167 2243
2168 if (!BTRFS_I(inode)->orphan_meta_reserved) { 2244 if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2182,7 +2258,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2182 2258
2183 /* insert an orphan item to track this unlinked/truncated file */ 2259 /* insert an orphan item to track this unlinked/truncated file */
2184 if (insert >= 1) { 2260 if (insert >= 1) {
2185 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); 2261 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2186 BUG_ON(ret); 2262 BUG_ON(ret);
2187 } 2263 }
2188 2264
@@ -2219,7 +2295,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2219 spin_unlock(&root->orphan_lock); 2295 spin_unlock(&root->orphan_lock);
2220 2296
2221 if (trans && delete_item) { 2297 if (trans && delete_item) {
2222 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2298 ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
2223 BUG_ON(ret); 2299 BUG_ON(ret);
2224 } 2300 }
2225 2301
@@ -2233,21 +2309,23 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2233 * this cleans up any orphans that may be left on the list from the last use 2309 * this cleans up any orphans that may be left on the list from the last use
2234 * of this root. 2310 * of this root.
2235 */ 2311 */
2236void btrfs_orphan_cleanup(struct btrfs_root *root) 2312int btrfs_orphan_cleanup(struct btrfs_root *root)
2237{ 2313{
2238 struct btrfs_path *path; 2314 struct btrfs_path *path;
2239 struct extent_buffer *leaf; 2315 struct extent_buffer *leaf;
2240 struct btrfs_item *item;
2241 struct btrfs_key key, found_key; 2316 struct btrfs_key key, found_key;
2242 struct btrfs_trans_handle *trans; 2317 struct btrfs_trans_handle *trans;
2243 struct inode *inode; 2318 struct inode *inode;
2244 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2319 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2245 2320
2246 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2321 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2247 return; 2322 return 0;
2248 2323
2249 path = btrfs_alloc_path(); 2324 path = btrfs_alloc_path();
2250 BUG_ON(!path); 2325 if (!path) {
2326 ret = -ENOMEM;
2327 goto out;
2328 }
2251 path->reada = -1; 2329 path->reada = -1;
2252 2330
2253 key.objectid = BTRFS_ORPHAN_OBJECTID; 2331 key.objectid = BTRFS_ORPHAN_OBJECTID;
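
The cmpxchg() at the top of btrfs_orphan_cleanup() makes the function idempotent: only the caller that moves orphan_cleanup_state from 0 to ORPHAN_CLEANUP_STARTED proceeds; everyone else sees a nonzero old value and returns at once. The same gate in portable C11 atomics:

#include <stdatomic.h>
#include <stdio.h>

enum { ORPHAN_CLEANUP_STARTED = 1 };
static _Atomic int cleanup_state;	/* zero-initialized, like the kernel field */

static int try_start_cleanup(void)
{
	int expected = 0;
	/* succeeds only if the state was still 0 */
	return atomic_compare_exchange_strong(&cleanup_state, &expected,
					      ORPHAN_CLEANUP_STARTED);
}

int main(void)
{
	int first = try_start_cleanup();	/* 1: we won the race */
	int again = try_start_cleanup();	/* 0: already started */
	printf("first=%d again=%d\n", first, again);
	return 0;
}
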
@@ -2256,18 +2334,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2256 2334
2257 while (1) { 2335 while (1) {
2258 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2336 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2259 if (ret < 0) { 2337 if (ret < 0)
2260 printk(KERN_ERR "Error searching slot for orphan: %d" 2338 goto out;
2261 "\n", ret);
2262 break;
2263 }
2264 2339
2265 /* 2340 /*
2266 * if ret == 0 means we found what we were searching for, which 2341 * if ret == 0 means we found what we were searching for, which
2267 * is weird, but possible, so only screw with path if we didnt 2342 * is weird, but possible, so only screw with path if we didn't
2268 * find the key and see if we have stuff that matches 2343 * find the key and see if we have stuff that matches
2269 */ 2344 */
2270 if (ret > 0) { 2345 if (ret > 0) {
2346 ret = 0;
2271 if (path->slots[0] == 0) 2347 if (path->slots[0] == 0)
2272 break; 2348 break;
2273 path->slots[0]--; 2349 path->slots[0]--;
@@ -2275,7 +2351,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2275 2351
2276 /* pull out the item */ 2352 /* pull out the item */
2277 leaf = path->nodes[0]; 2353 leaf = path->nodes[0];
2278 item = btrfs_item_nr(leaf, path->slots[0]);
2279 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2354 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2280 2355
2281 /* make sure the item matches what we want */ 2356 /* make sure the item matches what we want */
@@ -2285,7 +2360,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2285 break; 2360 break;
2286 2361
2287 /* release the path since we're done with it */ 2362 /* release the path since we're done with it */
2288 btrfs_release_path(root, path); 2363 btrfs_release_path(path);
2289 2364
2290 /* 2365 /*
2291 * this is where we are basically btrfs_lookup, without the 2366 * this is where we are basically btrfs_lookup, without the
@@ -2296,7 +2371,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2296 found_key.type = BTRFS_INODE_ITEM_KEY; 2371 found_key.type = BTRFS_INODE_ITEM_KEY;
2297 found_key.offset = 0; 2372 found_key.offset = 0;
2298 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2373 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2299 BUG_ON(IS_ERR(inode)); 2374 if (IS_ERR(inode)) {
2375 ret = PTR_ERR(inode);
2376 goto out;
2377 }
2300 2378
2301 /* 2379 /*
2302 * add this inode to the orphan list so btrfs_orphan_del does 2380 * add this inode to the orphan list so btrfs_orphan_del does
@@ -2314,6 +2392,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2314 */ 2392 */
2315 if (is_bad_inode(inode)) { 2393 if (is_bad_inode(inode)) {
2316 trans = btrfs_start_transaction(root, 0); 2394 trans = btrfs_start_transaction(root, 0);
2395 if (IS_ERR(trans)) {
2396 ret = PTR_ERR(trans);
2397 goto out;
2398 }
2317 btrfs_orphan_del(trans, inode); 2399 btrfs_orphan_del(trans, inode);
2318 btrfs_end_transaction(trans, root); 2400 btrfs_end_transaction(trans, root);
2319 iput(inode); 2401 iput(inode);
@@ -2322,17 +2404,22 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2322 2404
2323 /* if we have links, this was a truncate, lets do that */ 2405 /* if we have links, this was a truncate, lets do that */
2324 if (inode->i_nlink) { 2406 if (inode->i_nlink) {
2407 if (!S_ISREG(inode->i_mode)) {
2408 WARN_ON(1);
2409 iput(inode);
2410 continue;
2411 }
2325 nr_truncate++; 2412 nr_truncate++;
2326 btrfs_truncate(inode); 2413 ret = btrfs_truncate(inode);
2327 } else { 2414 } else {
2328 nr_unlink++; 2415 nr_unlink++;
2329 } 2416 }
2330 2417
2331 /* this will do delete_inode and everything for us */ 2418 /* this will do delete_inode and everything for us */
2332 iput(inode); 2419 iput(inode);
2420 if (ret)
2421 goto out;
2333 } 2422 }
2334 btrfs_free_path(path);
2335
2336 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2423 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2337 2424
2338 if (root->orphan_block_rsv) 2425 if (root->orphan_block_rsv)
@@ -2340,14 +2427,21 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2340 (u64)-1); 2427 (u64)-1);
2341 2428
2342 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2429 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2343 trans = btrfs_join_transaction(root, 1); 2430 trans = btrfs_join_transaction(root);
2344 btrfs_end_transaction(trans, root); 2431 if (!IS_ERR(trans))
2432 btrfs_end_transaction(trans, root);
2345 } 2433 }
2346 2434
2347 if (nr_unlink) 2435 if (nr_unlink)
2348 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2436 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2349 if (nr_truncate) 2437 if (nr_truncate)
2350 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2438 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2439
2440out:
2441 if (ret)
2442 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
2443 btrfs_free_path(path);
2444 return ret;
2351} 2445}
2352 2446
2353/* 2447/*
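
With the void-to-int conversion, btrfs_orphan_cleanup() funnels every failure through one out label that logs once and frees the path, instead of BUG()ing per step. The control-flow shape, with placeholder resources (malloc/free stand in for btrfs_alloc_path()/btrfs_free_path(), and the latter accepts NULL just as free() does):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int cleanup(int simulate_search_error)
{
	int ret = 0;
	char *path = malloc(64);

	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	if (simulate_search_error) {	/* e.g. btrfs_search_slot() < 0 */
		ret = -EIO;
		goto out;
	}
out:
	if (ret)
		fprintf(stderr, "could not do orphan cleanup %d\n", ret);
	free(path);
	return ret;
}

int main(void)
{
	return cleanup(1) ? 1 : 0;
}
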
@@ -2413,12 +2507,17 @@ static void btrfs_read_locked_inode(struct inode *inode)
2413 struct btrfs_root *root = BTRFS_I(inode)->root; 2507 struct btrfs_root *root = BTRFS_I(inode)->root;
2414 struct btrfs_key location; 2508 struct btrfs_key location;
2415 int maybe_acls; 2509 int maybe_acls;
2416 u64 alloc_group_block;
2417 u32 rdev; 2510 u32 rdev;
2418 int ret; 2511 int ret;
2512 bool filled = false;
2513
2514 ret = btrfs_fill_inode(inode, &rdev);
2515 if (!ret)
2516 filled = true;
2419 2517
2420 path = btrfs_alloc_path(); 2518 path = btrfs_alloc_path();
2421 BUG_ON(!path); 2519 BUG_ON(!path);
2520 path->leave_spinning = 1;
2422 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2521 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
2423 2522
2424 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 2523 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -2426,8 +2525,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
2426 goto make_bad; 2525 goto make_bad;
2427 2526
2428 leaf = path->nodes[0]; 2527 leaf = path->nodes[0];
2528
2529 if (filled)
2530 goto cache_acl;
2531
2429 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2532 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2430 struct btrfs_inode_item); 2533 struct btrfs_inode_item);
2534 if (!leaf->map_token)
2535 map_private_extent_buffer(leaf, (unsigned long)inode_item,
2536 sizeof(struct btrfs_inode_item),
2537 &leaf->map_token, &leaf->kaddr,
2538 &leaf->map_start, &leaf->map_len,
2539 KM_USER1);
2431 2540
2432 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2541 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2433 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2542 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
@@ -2456,21 +2565,22 @@ static void btrfs_read_locked_inode(struct inode *inode)
2456 2565
2457 BTRFS_I(inode)->index_cnt = (u64)-1; 2566 BTRFS_I(inode)->index_cnt = (u64)-1;
2458 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2567 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2459 2568cache_acl:
2460 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2461
2462 /* 2569 /*
2463 * try to precache a NULL acl entry for files that don't have 2570 * try to precache a NULL acl entry for files that don't have
2464 * any xattrs or acls 2571 * any xattrs or acls
2465 */ 2572 */
2466 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); 2573 maybe_acls = acls_after_inode_item(leaf, path->slots[0],
2574 btrfs_ino(inode));
2467 if (!maybe_acls) 2575 if (!maybe_acls)
2468 cache_no_acl(inode); 2576 cache_no_acl(inode);
2469 2577
2470 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2578 if (leaf->map_token) {
2471 alloc_group_block, 0); 2579 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2580 leaf->map_token = NULL;
2581 }
2582
2472 btrfs_free_path(path); 2583 btrfs_free_path(path);
2473 inode_item = NULL;
2474 2584
2475 switch (inode->i_mode & S_IFMT) { 2585 switch (inode->i_mode & S_IFMT) {
2476 case S_IFREG: 2586 case S_IFREG:
@@ -2514,6 +2624,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2514 struct btrfs_inode_item *item, 2624 struct btrfs_inode_item *item,
2515 struct inode *inode) 2625 struct inode *inode)
2516{ 2626{
2627 if (!leaf->map_token)
2628 map_private_extent_buffer(leaf, (unsigned long)item,
2629 sizeof(struct btrfs_inode_item),
2630 &leaf->map_token, &leaf->kaddr,
2631 &leaf->map_start, &leaf->map_len,
2632 KM_USER1);
2633
2517 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2634 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2518 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2635 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2519 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2636 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2541,7 +2658,12 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2541 btrfs_set_inode_transid(leaf, item, trans->transid); 2658 btrfs_set_inode_transid(leaf, item, trans->transid);
2542 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2659 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2543 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2660 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2544 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); 2661 btrfs_set_inode_block_group(leaf, item, 0);
2662
2663 if (leaf->map_token) {
2664 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2665 leaf->map_token = NULL;
2666 }
2545} 2667}
2546 2668
2547/* 2669/*
@@ -2555,11 +2677,28 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2555 struct extent_buffer *leaf; 2677 struct extent_buffer *leaf;
2556 int ret; 2678 int ret;
2557 2679
2680 /*
2681 * If the inode is a free space inode, we can deadlock during commit
2682 * if we put it into the delayed code.
2683 *
2684 * The data relocation inode should also be directly updated
2685 * without delay
2686 */
2687 if (!is_free_space_inode(root, inode)
2688 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2689 ret = btrfs_delayed_update_inode(trans, root, inode);
2690 if (!ret)
2691 btrfs_set_inode_last_trans(trans, inode);
2692 return ret;
2693 }
2694
2558 path = btrfs_alloc_path(); 2695 path = btrfs_alloc_path();
2559 BUG_ON(!path); 2696 if (!path)
2697 return -ENOMEM;
2698
2560 path->leave_spinning = 1; 2699 path->leave_spinning = 1;
2561 ret = btrfs_lookup_inode(trans, root, path, 2700 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
2562 &BTRFS_I(inode)->location, 1); 2701 1);
2563 if (ret) { 2702 if (ret) {
2564 if (ret > 0) 2703 if (ret > 0)
2565 ret = -ENOENT; 2704 ret = -ENOENT;
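
The comment above encodes a small decision table: inodes normally go through the delayed-inode machinery, but the free-space cache inodes (commit-time deadlock risk) and the data relocation tree bypass it and write the item directly. Restated as a standalone predicate with stand-in inputs:

#include <stdbool.h>
#include <stdio.h>

enum update_path { UPDATE_DELAYED, UPDATE_DIRECT };

static enum update_path pick_update_path(bool free_space_inode,
					 bool data_reloc_root)
{
	if (!free_space_inode && !data_reloc_root)
		return UPDATE_DELAYED;	/* btrfs_delayed_update_inode() */
	return UPDATE_DIRECT;		/* search + fill_inode_item() */
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_update_path(false, false),	/* 0: delayed */
	       pick_update_path(true, false),	/* 1: direct */
	       pick_update_path(false, true));	/* 1: direct */
	return 0;
}
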
@@ -2569,7 +2708,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2569 btrfs_unlock_up_safe(path, 1); 2708 btrfs_unlock_up_safe(path, 1);
2570 leaf = path->nodes[0]; 2709 leaf = path->nodes[0];
2571 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2710 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2572 struct btrfs_inode_item); 2711 struct btrfs_inode_item);
2573 2712
2574 fill_inode_item(trans, leaf, inode_item, inode); 2713 fill_inode_item(trans, leaf, inode_item, inode);
2575 btrfs_mark_buffer_dirty(leaf); 2714 btrfs_mark_buffer_dirty(leaf);
@@ -2580,16 +2719,15 @@ failed:
2580 return ret; 2719 return ret;
2581} 2720}
2582 2721
2583
2584/* 2722/*
2585 * unlink helper that gets used here in inode.c and in the tree logging 2723 * unlink helper that gets used here in inode.c and in the tree logging
2586 * recovery code. It removes a link in a directory with a given name, and 2724 * recovery code. It removes a link in a directory with a given name, and
2587 * also drops the back refs in the inode to the directory 2725 * also drops the back refs in the inode to the directory
2588 */ 2726 */
2589int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2727static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2590 struct btrfs_root *root, 2728 struct btrfs_root *root,
2591 struct inode *dir, struct inode *inode, 2729 struct inode *dir, struct inode *inode,
2592 const char *name, int name_len) 2730 const char *name, int name_len)
2593{ 2731{
2594 struct btrfs_path *path; 2732 struct btrfs_path *path;
2595 int ret = 0; 2733 int ret = 0;
@@ -2597,15 +2735,17 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2597 struct btrfs_dir_item *di; 2735 struct btrfs_dir_item *di;
2598 struct btrfs_key key; 2736 struct btrfs_key key;
2599 u64 index; 2737 u64 index;
2738 u64 ino = btrfs_ino(inode);
2739 u64 dir_ino = btrfs_ino(dir);
2600 2740
2601 path = btrfs_alloc_path(); 2741 path = btrfs_alloc_path();
2602 if (!path) { 2742 if (!path) {
2603 ret = -ENOMEM; 2743 ret = -ENOMEM;
2604 goto err; 2744 goto out;
2605 } 2745 }
2606 2746
2607 path->leave_spinning = 1; 2747 path->leave_spinning = 1;
2608 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2748 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
2609 name, name_len, -1); 2749 name, name_len, -1);
2610 if (IS_ERR(di)) { 2750 if (IS_ERR(di)) {
2611 ret = PTR_ERR(di); 2751 ret = PTR_ERR(di);
@@ -2620,38 +2760,29 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2620 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2760 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2621 if (ret) 2761 if (ret)
2622 goto err; 2762 goto err;
2623 btrfs_release_path(root, path); 2763 btrfs_release_path(path);
2624 2764
2625 ret = btrfs_del_inode_ref(trans, root, name, name_len, 2765 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
2626 inode->i_ino, 2766 dir_ino, &index);
2627 dir->i_ino, &index);
2628 if (ret) { 2767 if (ret) {
2629 printk(KERN_INFO "btrfs failed to delete reference to %.*s, " 2768 printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2630 "inode %lu parent %lu\n", name_len, name, 2769 "inode %llu parent %llu\n", name_len, name,
2631 inode->i_ino, dir->i_ino); 2770 (unsigned long long)ino, (unsigned long long)dir_ino);
2632 goto err; 2771 goto err;
2633 } 2772 }
2634 2773
2635 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 2774 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
2636 index, name, name_len, -1); 2775 if (ret)
2637 if (IS_ERR(di)) {
2638 ret = PTR_ERR(di);
2639 goto err;
2640 }
2641 if (!di) {
2642 ret = -ENOENT;
2643 goto err; 2776 goto err;
2644 }
2645 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2646 btrfs_release_path(root, path);
2647 2777
2648 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2778 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2649 inode, dir->i_ino); 2779 inode, dir_ino);
2650 BUG_ON(ret != 0 && ret != -ENOENT); 2780 BUG_ON(ret != 0 && ret != -ENOENT);
2651 2781
2652 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2782 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2653 dir, index); 2783 dir, index);
2654 BUG_ON(ret); 2784 if (ret == -ENOENT)
2785 ret = 0;
2655err: 2786err:
2656 btrfs_free_path(path); 2787 btrfs_free_path(path);
2657 if (ret) 2788 if (ret)
@@ -2660,22 +2791,36 @@ err:
2660 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2791 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2661 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2792 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2662 btrfs_update_inode(trans, root, dir); 2793 btrfs_update_inode(trans, root, dir);
2663 btrfs_drop_nlink(inode);
2664 ret = btrfs_update_inode(trans, root, inode);
2665out: 2794out:
2666 return ret; 2795 return ret;
2667} 2796}
2668 2797
2798int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2799 struct btrfs_root *root,
2800 struct inode *dir, struct inode *inode,
2801 const char *name, int name_len)
2802{
2803 int ret;
2804 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
2805 if (!ret) {
2806 btrfs_drop_nlink(inode);
2807 ret = btrfs_update_inode(trans, root, inode);
2808 }
2809 return ret;
2810}
2811
2812
2669/* helper to check if there is any shared block in the path */ 2813/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root, 2814static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path) 2815 struct btrfs_path *path)
2672{ 2816{
2673 struct extent_buffer *eb; 2817 struct extent_buffer *eb;
2674 int level; 2818 int level;
2675 int ret;
2676 u64 refs = 1; 2819 u64 refs = 1;
2677 2820
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2821 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2822 int ret;
2823
2679 if (!path->nodes[level]) 2824 if (!path->nodes[level])
2680 break; 2825 break;
2681 eb = path->nodes[level]; 2826 eb = path->nodes[level];
@@ -2709,12 +2854,14 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2709 int check_link = 1; 2854 int check_link = 1;
2710 int err = -ENOSPC; 2855 int err = -ENOSPC;
2711 int ret; 2856 int ret;
2857 u64 ino = btrfs_ino(inode);
2858 u64 dir_ino = btrfs_ino(dir);
2712 2859
2713 trans = btrfs_start_transaction(root, 10); 2860 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2861 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans; 2862 return trans;
2716 2863
2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 2864 if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2718 return ERR_PTR(-ENOSPC); 2865 return ERR_PTR(-ENOSPC);
2719 2866
2720 /* check if someone else holds a reference */ 2867 /* check if someone else holds a reference */
@@ -2755,7 +2902,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2755 } else { 2902 } else {
2756 check_link = 0; 2903 check_link = 0;
2757 } 2904 }
2758 btrfs_release_path(root, path); 2905 btrfs_release_path(path);
2759 2906
2760 ret = btrfs_lookup_inode(trans, root, path, 2907 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0); 2908 &BTRFS_I(inode)->location, 0);
@@ -2769,11 +2916,11 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2769 } else { 2916 } else {
2770 check_link = 0; 2917 check_link = 0;
2771 } 2918 }
2772 btrfs_release_path(root, path); 2919 btrfs_release_path(path);
2773 2920
2774 if (ret == 0 && S_ISREG(inode->i_mode)) { 2921 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path, 2922 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0); 2923 ino, (u64)-1, 0);
2777 if (ret < 0) { 2924 if (ret < 0) {
2778 err = ret; 2925 err = ret;
2779 goto out; 2926 goto out;
@@ -2781,7 +2928,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2781 BUG_ON(ret == 0); 2928 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path)) 2929 if (check_path_shared(root, path))
2783 goto out; 2930 goto out;
2784 btrfs_release_path(root, path); 2931 btrfs_release_path(path);
2785 } 2932 }
2786 2933
2787 if (!check_link) { 2934 if (!check_link) {
@@ -2789,7 +2936,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2789 goto out; 2936 goto out;
2790 } 2937 }
2791 2938
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2939 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0); 2940 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) { 2941 if (IS_ERR(di)) {
2795 err = PTR_ERR(di); 2942 err = PTR_ERR(di);
@@ -2802,11 +2949,11 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2802 err = 0; 2949 err = 0;
2803 goto out; 2950 goto out;
2804 } 2951 }
2805 btrfs_release_path(root, path); 2952 btrfs_release_path(path);
2806 2953
2807 ref = btrfs_lookup_inode_ref(trans, root, path, 2954 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len, 2955 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0); 2956 ino, dir_ino, 0);
2810 if (IS_ERR(ref)) { 2957 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref); 2958 err = PTR_ERR(ref);
2812 goto out; 2959 goto out;
@@ -2815,9 +2962,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2815 if (check_path_shared(root, path)) 2962 if (check_path_shared(root, path))
2816 goto out; 2963 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref); 2964 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path); 2965 btrfs_release_path(path);
2819 2966
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index, 2967 /*
2968 * This is a commit root search; if we can look up the inode item and
2969 * other related items in the commit root, it means the transaction of
2970 * dir/file creation has been committed, and the dir index item whose
2971 * insertion we delayed has also been inserted into the commit root. So
2972 * we needn't worry about the delayed insertion of the dir index item
2973 * here.
2974 */
2975 di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0); 2976 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) { 2977 if (IS_ERR(di)) {
2823 err = PTR_ERR(di); 2978 err = PTR_ERR(di);
@@ -2862,8 +3017,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2862 if (IS_ERR(trans)) 3017 if (IS_ERR(trans))
2863 return PTR_ERR(trans); 3018 return PTR_ERR(trans);
2864 3019
2865 btrfs_set_trans_block_group(trans, dir);
2866
2867 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 3020 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2868 3021
2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3022 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
@@ -2892,47 +3045,41 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2892 struct btrfs_key key; 3045 struct btrfs_key key;
2893 u64 index; 3046 u64 index;
2894 int ret; 3047 int ret;
3048 u64 dir_ino = btrfs_ino(dir);
2895 3049
2896 path = btrfs_alloc_path(); 3050 path = btrfs_alloc_path();
2897 if (!path) 3051 if (!path)
2898 return -ENOMEM; 3052 return -ENOMEM;
2899 3053
2900 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 3054 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
2901 name, name_len, -1); 3055 name, name_len, -1);
2902 BUG_ON(!di || IS_ERR(di)); 3056 BUG_ON(IS_ERR_OR_NULL(di));
2903 3057
2904 leaf = path->nodes[0]; 3058 leaf = path->nodes[0];
2905 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3059 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2906 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 3060 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
2907 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3061 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2908 BUG_ON(ret); 3062 BUG_ON(ret);
2909 btrfs_release_path(root, path); 3063 btrfs_release_path(path);
2910 3064
2911 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 3065 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
2912 objectid, root->root_key.objectid, 3066 objectid, root->root_key.objectid,
2913 dir->i_ino, &index, name, name_len); 3067 dir_ino, &index, name, name_len);
2914 if (ret < 0) { 3068 if (ret < 0) {
2915 BUG_ON(ret != -ENOENT); 3069 BUG_ON(ret != -ENOENT);
2916 di = btrfs_search_dir_index_item(root, path, dir->i_ino, 3070 di = btrfs_search_dir_index_item(root, path, dir_ino,
2917 name, name_len); 3071 name, name_len);
2918 BUG_ON(!di || IS_ERR(di)); 3072 BUG_ON(IS_ERR_OR_NULL(di));
2919 3073
2920 leaf = path->nodes[0]; 3074 leaf = path->nodes[0];
2921 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3075 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2922 btrfs_release_path(root, path); 3076 btrfs_release_path(path);
2923 index = key.offset; 3077 index = key.offset;
2924 } 3078 }
3079 btrfs_release_path(path);
2925 3080
2926 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 3081 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
2927 index, name, name_len, -1);
2928 BUG_ON(!di || IS_ERR(di));
2929
2930 leaf = path->nodes[0];
2931 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2932 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
2933 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2934 BUG_ON(ret); 3082 BUG_ON(ret);
2935 btrfs_release_path(root, path);
2936 3083
2937 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3084 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2938 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3085 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -2952,16 +3099,14 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2952 unsigned long nr = 0; 3099 unsigned long nr = 0;
2953 3100
2954 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 3101 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2955 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 3102 btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
2956 return -ENOTEMPTY; 3103 return -ENOTEMPTY;
2957 3104
2958 trans = __unlink_start_trans(dir, dentry); 3105 trans = __unlink_start_trans(dir, dentry);
2959 if (IS_ERR(trans)) 3106 if (IS_ERR(trans))
2960 return PTR_ERR(trans); 3107 return PTR_ERR(trans);
2961 3108
2962 btrfs_set_trans_block_group(trans, dir); 3109 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
2963
2964 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
2965 err = btrfs_unlink_subvol(trans, root, dir, 3110 err = btrfs_unlink_subvol(trans, root, dir,
2966 BTRFS_I(inode)->location.objectid, 3111 BTRFS_I(inode)->location.objectid,
2967 dentry->d_name.name, 3112 dentry->d_name.name,
@@ -2986,178 +3131,6 @@ out:
2986 return err; 3131 return err;
2987} 3132}
2988 3133
2989#if 0
2990/*
2991 * when truncating bytes in a file, it is possible to avoid reading
2992 * the leaves that contain only checksum items. This can be the
2993 * majority of the IO required to delete a large file, but it must
2994 * be done carefully.
2995 *
2996 * The keys in the level just above the leaves are checked to make sure
2997 * the lowest key in a given leaf is a csum key, and starts at an offset
2998 * after the new size.
2999 *
3000 * Then the key for the next leaf is checked to make sure it also has
3001 * a checksum item for the same file. If it does, we know our target leaf
3002 * contains only checksum items, and it can be safely freed without reading
3003 * it.
3004 *
3005 * This is just an optimization targeted at large files. It may do
3006 * nothing. It will return 0 unless things went badly.
3007 */
3008static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
3009 struct btrfs_root *root,
3010 struct btrfs_path *path,
3011 struct inode *inode, u64 new_size)
3012{
3013 struct btrfs_key key;
3014 int ret;
3015 int nritems;
3016 struct btrfs_key found_key;
3017 struct btrfs_key other_key;
3018 struct btrfs_leaf_ref *ref;
3019 u64 leaf_gen;
3020 u64 leaf_start;
3021
3022 path->lowest_level = 1;
3023 key.objectid = inode->i_ino;
3024 key.type = BTRFS_CSUM_ITEM_KEY;
3025 key.offset = new_size;
3026again:
3027 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3028 if (ret < 0)
3029 goto out;
3030
3031 if (path->nodes[1] == NULL) {
3032 ret = 0;
3033 goto out;
3034 }
3035 ret = 0;
3036 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
3037 nritems = btrfs_header_nritems(path->nodes[1]);
3038
3039 if (!nritems)
3040 goto out;
3041
3042 if (path->slots[1] >= nritems)
3043 goto next_node;
3044
3045 /* did we find a key greater than anything we want to delete? */
3046 if (found_key.objectid > inode->i_ino ||
3047 (found_key.objectid == inode->i_ino && found_key.type > key.type))
3048 goto out;
3049
3050 /* we check the next key in the node to make sure the leaf contains
3051 * only checksum items. This comparison doesn't work if our
3052 * leaf is the last one in the node
3053 */
3054 if (path->slots[1] + 1 >= nritems) {
3055next_node:
3056 /* search forward from the last key in the node, this
3057 * will bring us into the next node in the tree
3058 */
3059 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
3060
3061 /* unlikely, but we inc below, so check to be safe */
3062 if (found_key.offset == (u64)-1)
3063 goto out;
3064
3065 /* search_forward needs a path with locks held, do the
3066 * search again for the original key. It is possible
3067 * this will race with a balance and return a path that
3068 * we could modify, but this drop is just an optimization
3069 * and is allowed to miss some leaves.
3070 */
3071 btrfs_release_path(root, path);
3072 found_key.offset++;
3073
3074 /* setup a max key for search_forward */
3075 other_key.offset = (u64)-1;
3076 other_key.type = key.type;
3077 other_key.objectid = key.objectid;
3078
3079 path->keep_locks = 1;
3080 ret = btrfs_search_forward(root, &found_key, &other_key,
3081 path, 0, 0);
3082 path->keep_locks = 0;
3083 if (ret || found_key.objectid != key.objectid ||
3084 found_key.type != key.type) {
3085 ret = 0;
3086 goto out;
3087 }
3088
3089 key.offset = found_key.offset;
3090 btrfs_release_path(root, path);
3091 cond_resched();
3092 goto again;
3093 }
3094
3095 /* we know there's one more slot after us in the tree,
3096 * read that key so we can verify it is also a checksum item
3097 */
3098 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
3099
3100 if (found_key.objectid < inode->i_ino)
3101 goto next_key;
3102
3103 if (found_key.type != key.type || found_key.offset < new_size)
3104 goto next_key;
3105
3106 /*
3107 * if the key for the next leaf isn't a csum key from this objectid,
3108 * we can't be sure there aren't good items inside this leaf.
3109 * Bail out
3110 */
3111 if (other_key.objectid != inode->i_ino || other_key.type != key.type)
3112 goto out;
3113
3114 leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
3115 leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
3116 /*
3117 * it is safe to delete this leaf, it contains only
3118 * csum items from this inode at an offset >= new_size
3119 */
3120 ret = btrfs_del_leaf(trans, root, path, leaf_start);
3121 BUG_ON(ret);
3122
3123 if (root->ref_cows && leaf_gen < trans->transid) {
3124 ref = btrfs_alloc_leaf_ref(root, 0);
3125 if (ref) {
3126 ref->root_gen = root->root_key.offset;
3127 ref->bytenr = leaf_start;
3128 ref->owner = 0;
3129 ref->generation = leaf_gen;
3130 ref->nritems = 0;
3131
3132 btrfs_sort_leaf_ref(ref);
3133
3134 ret = btrfs_add_leaf_ref(root, ref, 0);
3135 WARN_ON(ret);
3136 btrfs_free_leaf_ref(root, ref);
3137 } else {
3138 WARN_ON(1);
3139 }
3140 }
3141next_key:
3142 btrfs_release_path(root, path);
3143
3144 if (other_key.objectid == inode->i_ino &&
3145 other_key.type == key.type && other_key.offset > key.offset) {
3146 key.offset = other_key.offset;
3147 cond_resched();
3148 goto again;
3149 }
3150 ret = 0;
3151out:
3152 /* fixup any changes we've made to the path */
3153 path->lowest_level = 0;
3154 path->keep_locks = 0;
3155 btrfs_release_path(root, path);
3156 return ret;
3157}
3158
3159#endif
3160
3161/* 3134/*
3162 * this can truncate away extent items, csum items and directory items. 3135 * this can truncate away extent items, csum items and directory items.
3163 * It starts at a high offset and removes keys until it can't find 3136 * It starts at a high offset and removes keys until it can't find
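The hunks that follow show the key setup for that walk: the search key is pinned to the inode's objectid with the largest possible type and offset, so each btrfs_search_slot() lands just past the inode's last remaining item. A minimal sketch of the descending pattern, using only helpers visible in this diff (item inspection, deletion batching and error unwinding are elided; this is not the full function):

static int truncate_walk_sketch(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_path *path, u64 ino)
{
        struct btrfs_key key;
        int ret;

        key.objectid = ino;     /* all items for this inode ... */
        key.type = (u8)-1;      /* ... at the largest key type ... */
        key.offset = (u64)-1;   /* ... and the largest offset */

        while (1) {
                /* ins_len -1, cow 1: position for deletion under COW */
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0)
                        return ret;
                if (ret > 0) {
                        /* the synthetic key never exists; step back to
                         * the last item that sorts before it */
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }
                /* examine/delete items here, lowering key.offset so the
                 * next search resumes below what was just removed */
                btrfs_release_path(path);
        }
        return 0;
}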
@@ -3193,17 +3166,27 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3193 int encoding; 3166 int encoding;
3194 int ret; 3167 int ret;
3195 int err = 0; 3168 int err = 0;
3169 u64 ino = btrfs_ino(inode);
3196 3170
3197 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3171 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3198 3172
3199 if (root->ref_cows) 3173 if (root->ref_cows || root == root->fs_info->tree_root)
3200 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3174 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
3201 3175
3176 /*
3177 * This function is also used to drop the items in the log tree before
3178 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
3179 * it is used to drop the loged items. So we shouldn't kill the delayed
3180 * items.
3181 */
3182 if (min_type == 0 && root == BTRFS_I(inode)->root)
3183 btrfs_kill_delayed_inode_items(inode);
3184
3202 path = btrfs_alloc_path(); 3185 path = btrfs_alloc_path();
3203 BUG_ON(!path); 3186 BUG_ON(!path);
3204 path->reada = -1; 3187 path->reada = -1;
3205 3188
3206 key.objectid = inode->i_ino; 3189 key.objectid = ino;
3207 key.offset = (u64)-1; 3190 key.offset = (u64)-1;
3208 key.type = (u8)-1; 3191 key.type = (u8)-1;
3209 3192
@@ -3231,7 +3214,7 @@ search_again:
3231 found_type = btrfs_key_type(&found_key); 3214 found_type = btrfs_key_type(&found_key);
3232 encoding = 0; 3215 encoding = 0;
3233 3216
3234 if (found_key.objectid != inode->i_ino) 3217 if (found_key.objectid != ino)
3235 break; 3218 break;
3236 3219
3237 if (found_type < min_type) 3220 if (found_type < min_type)
@@ -3321,7 +3304,6 @@ search_again:
3321 btrfs_file_extent_calc_inline_size(size); 3304 btrfs_file_extent_calc_inline_size(size);
3322 ret = btrfs_truncate_item(trans, root, path, 3305 ret = btrfs_truncate_item(trans, root, path,
3323 size, 1); 3306 size, 1);
3324 BUG_ON(ret);
3325 } else if (root->ref_cows) { 3307 } else if (root->ref_cows) {
3326 inode_sub_bytes(inode, item_end + 1 - 3308 inode_sub_bytes(inode, item_end + 1 -
3327 found_key.offset); 3309 found_key.offset);
@@ -3344,12 +3326,13 @@ delete:
3344 } else { 3326 } else {
3345 break; 3327 break;
3346 } 3328 }
3347 if (found_extent && root->ref_cows) { 3329 if (found_extent && (root->ref_cows ||
3330 root == root->fs_info->tree_root)) {
3348 btrfs_set_path_blocking(path); 3331 btrfs_set_path_blocking(path);
3349 ret = btrfs_free_extent(trans, root, extent_start, 3332 ret = btrfs_free_extent(trans, root, extent_start,
3350 extent_num_bytes, 0, 3333 extent_num_bytes, 0,
3351 btrfs_header_owner(leaf), 3334 btrfs_header_owner(leaf),
3352 inode->i_ino, extent_offset); 3335 ino, extent_offset);
3353 BUG_ON(ret); 3336 BUG_ON(ret);
3354 } 3337 }
3355 3338
@@ -3358,7 +3341,9 @@ delete:
3358 3341
3359 if (path->slots[0] == 0 || 3342 if (path->slots[0] == 0 ||
3360 path->slots[0] != pending_del_slot) { 3343 path->slots[0] != pending_del_slot) {
3361 if (root->ref_cows) { 3344 if (root->ref_cows &&
3345 BTRFS_I(inode)->location.objectid !=
3346 BTRFS_FREE_INO_OBJECTID) {
3362 err = -EAGAIN; 3347 err = -EAGAIN;
3363 goto out; 3348 goto out;
3364 } 3349 }
@@ -3369,7 +3354,7 @@ delete:
3369 BUG_ON(ret); 3354 BUG_ON(ret);
3370 pending_del_nr = 0; 3355 pending_del_nr = 0;
3371 } 3356 }
3372 btrfs_release_path(root, path); 3357 btrfs_release_path(path);
3373 goto search_again; 3358 goto search_again;
3374 } else { 3359 } else {
3375 path->slots[0]--; 3360 path->slots[0]--;
@@ -3485,7 +3470,13 @@ out:
3485 return ret; 3470 return ret;
3486} 3471}
3487 3472
3488int btrfs_cont_expand(struct inode *inode, loff_t size) 3473/*
3474 * This function puts in dummy file extents for the area we're creating a hole
3475 * for. So if we are truncating this file to a larger size we need to insert
3476 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
3477 * the range between oldsize and size
3478 */
3479int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3489{ 3480{
3490 struct btrfs_trans_handle *trans; 3481 struct btrfs_trans_handle *trans;
3491 struct btrfs_root *root = BTRFS_I(inode)->root; 3482 struct btrfs_root *root = BTRFS_I(inode)->root;
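The hunks below show the loop this function runs between the sector-aligned hole_start and block_end: every mapping that is not preallocated gets an explicit on-disk hole, i.e. a file extent whose disk_bytenr is 0. A condensed sketch of that loop body, with the locking, per-iteration transaction start/end and extent-cache dropping elided:

        u64 cur_offset = hole_start;
        while (cur_offset < block_end) {
                struct extent_map *em;
                u64 last_byte, hole_size;

                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      block_end - cur_offset, 0);
                last_byte = min(extent_map_end(em), block_end);
                last_byte = (last_byte + mask) & ~mask; /* sector align */

                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
                        hole_size = last_byte - cur_offset;
                        /* disk_bytenr 0 == explicit hole on disk */
                        err = btrfs_insert_file_extent(trans, root,
                                                       btrfs_ino(inode),
                                                       cur_offset, 0, 0,
                                                       hole_size, 0,
                                                       hole_size, 0, 0, 0);
                        if (err)
                                break;
                }
                free_extent_map(em);
                cur_offset = last_byte;
        }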
@@ -3493,7 +3484,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3493 struct extent_map *em = NULL; 3484 struct extent_map *em = NULL;
3494 struct extent_state *cached_state = NULL; 3485 struct extent_state *cached_state = NULL;
3495 u64 mask = root->sectorsize - 1; 3486 u64 mask = root->sectorsize - 1;
3496 u64 hole_start = (inode->i_size + mask) & ~mask; 3487 u64 hole_start = (oldsize + mask) & ~mask;
3497 u64 block_end = (size + mask) & ~mask; 3488 u64 block_end = (size + mask) & ~mask;
3498 u64 last_byte; 3489 u64 last_byte;
3499 u64 cur_offset; 3490 u64 cur_offset;
@@ -3521,7 +3512,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3521 while (1) { 3512 while (1) {
3522 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 3513 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
3523 block_end - cur_offset, 0); 3514 block_end - cur_offset, 0);
3524 BUG_ON(IS_ERR(em) || !em); 3515 BUG_ON(IS_ERR_OR_NULL(em));
3525 last_byte = min(extent_map_end(em), block_end); 3516 last_byte = min(extent_map_end(em), block_end);
3526 last_byte = (last_byte + mask) & ~mask; 3517 last_byte = (last_byte + mask) & ~mask;
3527 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3518 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3533,18 +3524,19 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3533 err = PTR_ERR(trans); 3524 err = PTR_ERR(trans);
3534 break; 3525 break;
3535 } 3526 }
3536 btrfs_set_trans_block_group(trans, inode);
3537 3527
3538 err = btrfs_drop_extents(trans, inode, cur_offset, 3528 err = btrfs_drop_extents(trans, inode, cur_offset,
3539 cur_offset + hole_size, 3529 cur_offset + hole_size,
3540 &hint_byte, 1); 3530 &hint_byte, 1);
3541 BUG_ON(err); 3531 if (err)
3532 break;
3542 3533
3543 err = btrfs_insert_file_extent(trans, root, 3534 err = btrfs_insert_file_extent(trans, root,
3544 inode->i_ino, cur_offset, 0, 3535 btrfs_ino(inode), cur_offset, 0,
3545 0, hole_size, 0, hole_size, 3536 0, hole_size, 0, hole_size,
3546 0, 0, 0); 3537 0, 0, 0);
3547 BUG_ON(err); 3538 if (err)
3539 break;
3548 3540
3549 btrfs_drop_extent_cache(inode, hole_start, 3541 btrfs_drop_extent_cache(inode, hole_start,
3550 last_byte - 1, 0); 3542 last_byte - 1, 0);
@@ -3564,94 +3556,58 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3564 return err; 3556 return err;
3565} 3557}
3566 3558
3567static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) 3559static int btrfs_setsize(struct inode *inode, loff_t newsize)
3568{ 3560{
3569 struct btrfs_root *root = BTRFS_I(inode)->root; 3561 loff_t oldsize = i_size_read(inode);
3570 struct btrfs_trans_handle *trans;
3571 unsigned long nr;
3572 int ret; 3562 int ret;
3573 3563
3574 if (attr->ia_size == inode->i_size) 3564 if (newsize == oldsize)
3575 return 0; 3565 return 0;
3576 3566
3577 if (attr->ia_size > inode->i_size) { 3567 if (newsize > oldsize) {
3578 unsigned long limit; 3568 i_size_write(inode, newsize);
3579 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 3569 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3580 if (attr->ia_size > inode->i_sb->s_maxbytes) 3570 truncate_pagecache(inode, oldsize, newsize);
3581 return -EFBIG; 3571 ret = btrfs_cont_expand(inode, oldsize, newsize);
3582 if (limit != RLIM_INFINITY && attr->ia_size > limit) {
3583 send_sig(SIGXFSZ, current, 0);
3584 return -EFBIG;
3585 }
3586 }
3587
3588 trans = btrfs_start_transaction(root, 5);
3589 if (IS_ERR(trans))
3590 return PTR_ERR(trans);
3591
3592 btrfs_set_trans_block_group(trans, inode);
3593
3594 ret = btrfs_orphan_add(trans, inode);
3595 BUG_ON(ret);
3596
3597 nr = trans->blocks_used;
3598 btrfs_end_transaction(trans, root);
3599 btrfs_btree_balance_dirty(root, nr);
3600
3601 if (attr->ia_size > inode->i_size) {
3602 ret = btrfs_cont_expand(inode, attr->ia_size);
3603 if (ret) { 3572 if (ret) {
3604 btrfs_truncate(inode); 3573 btrfs_setsize(inode, oldsize);
3605 return ret; 3574 return ret;
3606 } 3575 }
3607 3576
3608 i_size_write(inode, attr->ia_size); 3577 mark_inode_dirty(inode);
3609 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3578 } else {
3610 3579
3611 trans = btrfs_start_transaction(root, 0); 3580 /*
3612 BUG_ON(IS_ERR(trans)); 3581 * We're truncating a file that used to have good data down to
3613 btrfs_set_trans_block_group(trans, inode); 3582 * zero. Make sure it gets into the ordered flush list so that
3614 trans->block_rsv = root->orphan_block_rsv; 3583 * any new writes get down to disk quickly.
3615 BUG_ON(!trans->block_rsv); 3584 */
3585 if (newsize == 0)
3586 BTRFS_I(inode)->ordered_data_close = 1;
3616 3587
3617 ret = btrfs_update_inode(trans, root, inode); 3588 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3618 BUG_ON(ret); 3589 truncate_setsize(inode, newsize);
3619 if (inode->i_nlink > 0) { 3590 ret = btrfs_truncate(inode);
3620 ret = btrfs_orphan_del(trans, inode);
3621 BUG_ON(ret);
3622 }
3623 nr = trans->blocks_used;
3624 btrfs_end_transaction(trans, root);
3625 btrfs_btree_balance_dirty(root, nr);
3626 return 0;
3627 } 3591 }
3628 3592
3629 /* 3593 return ret;
3630 * We're truncating a file that used to have good data down to
3631 * zero. Make sure it gets into the ordered flush list so that
3632 * any new writes get down to disk quickly.
3633 */
3634 if (attr->ia_size == 0)
3635 BTRFS_I(inode)->ordered_data_close = 1;
3636
3637 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3638 ret = vmtruncate(inode, attr->ia_size);
3639 BUG_ON(ret);
3640
3641 return 0;
3642} 3594}
3643 3595
3644static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3596static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3645{ 3597{
3646 struct inode *inode = dentry->d_inode; 3598 struct inode *inode = dentry->d_inode;
3599 struct btrfs_root *root = BTRFS_I(inode)->root;
3647 int err; 3600 int err;
3648 3601
3602 if (btrfs_root_readonly(root))
3603 return -EROFS;
3604
3649 err = inode_change_ok(inode, attr); 3605 err = inode_change_ok(inode, attr);
3650 if (err) 3606 if (err)
3651 return err; 3607 return err;
3652 3608
3653 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3609 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3654 err = btrfs_setattr_size(inode, attr); 3610 err = btrfs_setsize(inode, attr->ia_size);
3655 if (err) 3611 if (err)
3656 return err; 3612 return err;
3657 } 3613 }
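One subtlety in the rewritten btrfs_setsize() above: the grow path bumps i_size and truncates the page cache before the holes are actually inserted, so on failure it calls itself with the old size to roll the in-core state back. A sketch isolating just that path, using the names from the hunk:

        if (newsize > oldsize) {
                i_size_write(inode, newsize);
                btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
                truncate_pagecache(inode, oldsize, newsize);

                ret = btrfs_cont_expand(inode, oldsize, newsize);
                if (ret) {
                        /* undo the speculative size bump */
                        btrfs_setsize(inode, oldsize);
                        return ret;
                }
                mark_inode_dirty(inode);
        }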
@@ -3674,8 +3630,11 @@ void btrfs_evict_inode(struct inode *inode)
3674 unsigned long nr; 3630 unsigned long nr;
3675 int ret; 3631 int ret;
3676 3632
3633 trace_btrfs_inode_evict(inode);
3634
3677 truncate_inode_pages(&inode->i_data, 0); 3635 truncate_inode_pages(&inode->i_data, 0);
3678 if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0) 3636 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3637 is_free_space_inode(root, inode)))
3679 goto no_delete; 3638 goto no_delete;
3680 3639
3681 if (is_bad_inode(inode)) { 3640 if (is_bad_inode(inode)) {
@@ -3698,9 +3657,8 @@ void btrfs_evict_inode(struct inode *inode)
3698 btrfs_i_size_write(inode, 0); 3657 btrfs_i_size_write(inode, 0);
3699 3658
3700 while (1) { 3659 while (1) {
3701 trans = btrfs_start_transaction(root, 0); 3660 trans = btrfs_join_transaction(root);
3702 BUG_ON(IS_ERR(trans)); 3661 BUG_ON(IS_ERR(trans));
3703 btrfs_set_trans_block_group(trans, inode);
3704 trans->block_rsv = root->orphan_block_rsv; 3662 trans->block_rsv = root->orphan_block_rsv;
3705 3663
3706 ret = btrfs_block_rsv_check(trans, root, 3664 ret = btrfs_block_rsv_check(trans, root,
@@ -3728,6 +3686,10 @@ void btrfs_evict_inode(struct inode *inode)
3728 BUG_ON(ret); 3686 BUG_ON(ret);
3729 } 3687 }
3730 3688
3689 if (!(root == root->fs_info->tree_root ||
3690 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3691 btrfs_return_ino(root, btrfs_ino(inode));
3692
3731 nr = trans->blocks_used; 3693 nr = trans->blocks_used;
3732 btrfs_end_transaction(trans, root); 3694 btrfs_end_transaction(trans, root);
3733 btrfs_btree_balance_dirty(root, nr); 3695 btrfs_btree_balance_dirty(root, nr);
@@ -3753,12 +3715,12 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
3753 path = btrfs_alloc_path(); 3715 path = btrfs_alloc_path();
3754 BUG_ON(!path); 3716 BUG_ON(!path);
3755 3717
3756 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, 3718 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
3757 namelen, 0); 3719 namelen, 0);
3758 if (IS_ERR(di)) 3720 if (IS_ERR(di))
3759 ret = PTR_ERR(di); 3721 ret = PTR_ERR(di);
3760 3722
3761 if (!di || IS_ERR(di)) 3723 if (IS_ERR_OR_NULL(di))
3762 goto out_err; 3724 goto out_err;
3763 3725
3764 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 3726 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
@@ -3806,7 +3768,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
3806 3768
3807 leaf = path->nodes[0]; 3769 leaf = path->nodes[0];
3808 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 3770 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
3809 if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || 3771 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
3810 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 3772 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
3811 goto out; 3773 goto out;
3812 3774
@@ -3816,7 +3778,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
3816 if (ret) 3778 if (ret)
3817 goto out; 3779 goto out;
3818 3780
3819 btrfs_release_path(root->fs_info->tree_root, path); 3781 btrfs_release_path(path);
3820 3782
3821 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 3783 new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
3822 if (IS_ERR(new_root)) { 3784 if (IS_ERR(new_root)) {
@@ -3845,11 +3807,12 @@ static void inode_tree_add(struct inode *inode)
3845 struct btrfs_inode *entry; 3807 struct btrfs_inode *entry;
3846 struct rb_node **p; 3808 struct rb_node **p;
3847 struct rb_node *parent; 3809 struct rb_node *parent;
3810 u64 ino = btrfs_ino(inode);
3848again: 3811again:
3849 p = &root->inode_tree.rb_node; 3812 p = &root->inode_tree.rb_node;
3850 parent = NULL; 3813 parent = NULL;
3851 3814
3852 if (hlist_unhashed(&inode->i_hash)) 3815 if (inode_unhashed(inode))
3853 return; 3816 return;
3854 3817
3855 spin_lock(&root->inode_lock); 3818 spin_lock(&root->inode_lock);
@@ -3857,9 +3820,9 @@ again:
3857 parent = *p; 3820 parent = *p;
3858 entry = rb_entry(parent, struct btrfs_inode, rb_node); 3821 entry = rb_entry(parent, struct btrfs_inode, rb_node);
3859 3822
3860 if (inode->i_ino < entry->vfs_inode.i_ino) 3823 if (ino < btrfs_ino(&entry->vfs_inode))
3861 p = &parent->rb_left; 3824 p = &parent->rb_left;
3862 else if (inode->i_ino > entry->vfs_inode.i_ino) 3825 else if (ino > btrfs_ino(&entry->vfs_inode))
3863 p = &parent->rb_right; 3826 p = &parent->rb_right;
3864 else { 3827 else {
3865 WARN_ON(!(entry->vfs_inode.i_state & 3828 WARN_ON(!(entry->vfs_inode.i_state &
@@ -3888,7 +3851,14 @@ static void inode_tree_del(struct inode *inode)
3888 } 3851 }
3889 spin_unlock(&root->inode_lock); 3852 spin_unlock(&root->inode_lock);
3890 3853
3891 if (empty && btrfs_root_refs(&root->root_item) == 0) { 3854 /*
3855 * Free space cache has inodes in the tree root, but the tree root has a
3856 * root_refs of 0, so this could end up dropping the tree root as a
3857 * snapshot, so we need the extra root != root->fs_info->tree_root check to
3858 * make sure we don't drop it.
3859 */
3860 if (empty && btrfs_root_refs(&root->root_item) == 0 &&
3861 root != root->fs_info->tree_root) {
3892 synchronize_srcu(&root->fs_info->subvol_srcu); 3862 synchronize_srcu(&root->fs_info->subvol_srcu);
3893 spin_lock(&root->inode_lock); 3863 spin_lock(&root->inode_lock);
3894 empty = RB_EMPTY_ROOT(&root->inode_tree); 3864 empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -3916,9 +3886,9 @@ again:
3916 prev = node; 3886 prev = node;
3917 entry = rb_entry(node, struct btrfs_inode, rb_node); 3887 entry = rb_entry(node, struct btrfs_inode, rb_node);
3918 3888
3919 if (objectid < entry->vfs_inode.i_ino) 3889 if (objectid < btrfs_ino(&entry->vfs_inode))
3920 node = node->rb_left; 3890 node = node->rb_left;
3921 else if (objectid > entry->vfs_inode.i_ino) 3891 else if (objectid > btrfs_ino(&entry->vfs_inode))
3922 node = node->rb_right; 3892 node = node->rb_right;
3923 else 3893 else
3924 break; 3894 break;
@@ -3926,7 +3896,7 @@ again:
3926 if (!node) { 3896 if (!node) {
3927 while (prev) { 3897 while (prev) {
3928 entry = rb_entry(prev, struct btrfs_inode, rb_node); 3898 entry = rb_entry(prev, struct btrfs_inode, rb_node);
3929 if (objectid <= entry->vfs_inode.i_ino) { 3899 if (objectid <= btrfs_ino(&entry->vfs_inode)) {
3930 node = prev; 3900 node = prev;
3931 break; 3901 break;
3932 } 3902 }
@@ -3935,7 +3905,7 @@ again:
3935 } 3905 }
3936 while (node) { 3906 while (node) {
3937 entry = rb_entry(node, struct btrfs_inode, rb_node); 3907 entry = rb_entry(node, struct btrfs_inode, rb_node);
3938 objectid = entry->vfs_inode.i_ino + 1; 3908 objectid = btrfs_ino(&entry->vfs_inode) + 1;
3939 inode = igrab(&entry->vfs_inode); 3909 inode = igrab(&entry->vfs_inode);
3940 if (inode) { 3910 if (inode) {
3941 spin_unlock(&root->inode_lock); 3911 spin_unlock(&root->inode_lock);
@@ -3973,7 +3943,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
3973static int btrfs_find_actor(struct inode *inode, void *opaque) 3943static int btrfs_find_actor(struct inode *inode, void *opaque)
3974{ 3944{
3975 struct btrfs_iget_args *args = opaque; 3945 struct btrfs_iget_args *args = opaque;
3976 return args->ino == inode->i_ino && 3946 return args->ino == btrfs_ino(inode) &&
3977 args->root == BTRFS_I(inode)->root; 3947 args->root == BTRFS_I(inode)->root;
3978} 3948}
3979 3949
@@ -4008,7 +3978,6 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
4008 BTRFS_I(inode)->root = root; 3978 BTRFS_I(inode)->root = root;
4009 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3979 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
4010 btrfs_read_locked_inode(inode); 3980 btrfs_read_locked_inode(inode);
4011
4012 inode_tree_add(inode); 3981 inode_tree_add(inode);
4013 unlock_new_inode(inode); 3982 unlock_new_inode(inode);
4014 if (new) 3983 if (new)
@@ -4049,8 +4018,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4049 int index; 4018 int index;
4050 int ret; 4019 int ret;
4051 4020
4052 dentry->d_op = &btrfs_dentry_operations;
4053
4054 if (dentry->d_name.len > BTRFS_NAME_LEN) 4021 if (dentry->d_name.len > BTRFS_NAME_LEN)
4055 return ERR_PTR(-ENAMETOOLONG); 4022 return ERR_PTR(-ENAMETOOLONG);
4056 4023
@@ -4082,17 +4049,19 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4082 } 4049 }
4083 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4050 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
4084 4051
4085 if (root != sub_root) { 4052 if (!IS_ERR(inode) && root != sub_root) {
4086 down_read(&root->fs_info->cleanup_work_sem); 4053 down_read(&root->fs_info->cleanup_work_sem);
4087 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4054 if (!(inode->i_sb->s_flags & MS_RDONLY))
4088 btrfs_orphan_cleanup(sub_root); 4055 ret = btrfs_orphan_cleanup(sub_root);
4089 up_read(&root->fs_info->cleanup_work_sem); 4056 up_read(&root->fs_info->cleanup_work_sem);
4057 if (ret)
4058 inode = ERR_PTR(ret);
4090 } 4059 }
4091 4060
4092 return inode; 4061 return inode;
4093} 4062}
4094 4063
4095static int btrfs_dentry_delete(struct dentry *dentry) 4064static int btrfs_dentry_delete(const struct dentry *dentry)
4096{ 4065{
4097 struct btrfs_root *root; 4066 struct btrfs_root *root;
4098 4067
@@ -4119,7 +4088,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4119 return d_splice_alias(inode, dentry); 4088 return d_splice_alias(inode, dentry);
4120} 4089}
4121 4090
4122static unsigned char btrfs_filetype_table[] = { 4091unsigned char btrfs_filetype_table[] = {
4123 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 4092 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
4124}; 4093};
4125 4094
@@ -4133,11 +4102,11 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4133 struct btrfs_key key; 4102 struct btrfs_key key;
4134 struct btrfs_key found_key; 4103 struct btrfs_key found_key;
4135 struct btrfs_path *path; 4104 struct btrfs_path *path;
4105 struct list_head ins_list;
4106 struct list_head del_list;
4136 int ret; 4107 int ret;
4137 u32 nritems;
4138 struct extent_buffer *leaf; 4108 struct extent_buffer *leaf;
4139 int slot; 4109 int slot;
4140 int advance;
4141 unsigned char d_type; 4110 unsigned char d_type;
4142 int over = 0; 4111 int over = 0;
4143 u32 di_cur; 4112 u32 di_cur;
@@ -4147,6 +4116,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4147 char tmp_name[32]; 4116 char tmp_name[32];
4148 char *name_ptr; 4117 char *name_ptr;
4149 int name_len; 4118 int name_len;
4119 int is_curr = 0; /* filp->f_pos points to the current index? */
4150 4120
4151 /* FIXME, use a real flag for deciding about the key type */ 4121 /* FIXME, use a real flag for deciding about the key type */
4152 if (root->fs_info->tree_root == root) 4122 if (root->fs_info->tree_root == root)
@@ -4154,9 +4124,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4154 4124
4155 /* special case for "." */ 4125 /* special case for "." */
4156 if (filp->f_pos == 0) { 4126 if (filp->f_pos == 0) {
4157 over = filldir(dirent, ".", 1, 4127 over = filldir(dirent, ".", 1, 1, btrfs_ino(inode), DT_DIR);
4158 1, inode->i_ino,
4159 DT_DIR);
4160 if (over) 4128 if (over)
4161 return 0; 4129 return 0;
4162 filp->f_pos = 1; 4130 filp->f_pos = 1;
@@ -4171,36 +4139,37 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4171 filp->f_pos = 2; 4139 filp->f_pos = 2;
4172 } 4140 }
4173 path = btrfs_alloc_path(); 4141 path = btrfs_alloc_path();
4174 path->reada = 2; 4142 if (!path)
4143 return -ENOMEM;
4144
4145 path->reada = 1;
4146
4147 if (key_type == BTRFS_DIR_INDEX_KEY) {
4148 INIT_LIST_HEAD(&ins_list);
4149 INIT_LIST_HEAD(&del_list);
4150 btrfs_get_delayed_items(inode, &ins_list, &del_list);
4151 }
4175 4152
4176 btrfs_set_key_type(&key, key_type); 4153 btrfs_set_key_type(&key, key_type);
4177 key.offset = filp->f_pos; 4154 key.offset = filp->f_pos;
4178 key.objectid = inode->i_ino; 4155 key.objectid = btrfs_ino(inode);
4179 4156
4180 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4157 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4181 if (ret < 0) 4158 if (ret < 0)
4182 goto err; 4159 goto err;
4183 advance = 0;
4184 4160
4185 while (1) { 4161 while (1) {
4186 leaf = path->nodes[0]; 4162 leaf = path->nodes[0];
4187 nritems = btrfs_header_nritems(leaf);
4188 slot = path->slots[0]; 4163 slot = path->slots[0];
4189 if (advance || slot >= nritems) { 4164 if (slot >= btrfs_header_nritems(leaf)) {
4190 if (slot >= nritems - 1) { 4165 ret = btrfs_next_leaf(root, path);
4191 ret = btrfs_next_leaf(root, path); 4166 if (ret < 0)
4192 if (ret) 4167 goto err;
4193 break; 4168 else if (ret > 0)
4194 leaf = path->nodes[0]; 4169 break;
4195 nritems = btrfs_header_nritems(leaf); 4170 continue;
4196 slot = path->slots[0];
4197 } else {
4198 slot++;
4199 path->slots[0]++;
4200 }
4201 } 4171 }
4202 4172
4203 advance = 1;
4204 item = btrfs_item_nr(leaf, slot); 4173 item = btrfs_item_nr(leaf, slot);
4205 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4174 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4206 4175
@@ -4209,9 +4178,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4209 if (btrfs_key_type(&found_key) != key_type) 4178 if (btrfs_key_type(&found_key) != key_type)
4210 break; 4179 break;
4211 if (found_key.offset < filp->f_pos) 4180 if (found_key.offset < filp->f_pos)
4212 continue; 4181 goto next;
4182 if (key_type == BTRFS_DIR_INDEX_KEY &&
4183 btrfs_should_delete_dir_index(&del_list,
4184 found_key.offset))
4185 goto next;
4213 4186
4214 filp->f_pos = found_key.offset; 4187 filp->f_pos = found_key.offset;
4188 is_curr = 1;
4215 4189
4216 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 4190 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
4217 di_cur = 0; 4191 di_cur = 0;
@@ -4220,6 +4194,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4220 while (di_cur < di_total) { 4194 while (di_cur < di_total) {
4221 struct btrfs_key location; 4195 struct btrfs_key location;
4222 4196
4197 if (verify_dir_item(root, leaf, di))
4198 break;
4199
4223 name_len = btrfs_dir_name_len(leaf, di); 4200 name_len = btrfs_dir_name_len(leaf, di);
4224 if (name_len <= sizeof(tmp_name)) { 4201 if (name_len <= sizeof(tmp_name)) {
4225 name_ptr = tmp_name; 4202 name_ptr = tmp_name;
@@ -4259,6 +4236,17 @@ skip:
4259 di_cur += di_len; 4236 di_cur += di_len;
4260 di = (struct btrfs_dir_item *)((char *)di + di_len); 4237 di = (struct btrfs_dir_item *)((char *)di + di_len);
4261 } 4238 }
4239next:
4240 path->slots[0]++;
4241 }
4242
4243 if (key_type == BTRFS_DIR_INDEX_KEY) {
4244 if (is_curr)
4245 filp->f_pos++;
4246 ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
4247 &ins_list);
4248 if (ret)
4249 goto nopos;
4262 } 4250 }
4263 4251
4264 /* Reached end of directory/root. Bump pos past the last item. */ 4252 /* Reached end of directory/root. Bump pos past the last item. */
@@ -4273,6 +4261,8 @@ skip:
4273nopos: 4261nopos:
4274 ret = 0; 4262 ret = 0;
4275err: 4263err:
4264 if (key_type == BTRFS_DIR_INDEX_KEY)
4265 btrfs_put_delayed_items(&ins_list, &del_list);
4276 btrfs_free_path(path); 4266 btrfs_free_path(path);
4277 return ret; 4267 return ret;
4278} 4268}
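The delayed-item plumbing added above turns readdir into a two-source merge: on-disk dir index items are emitted unless a pending delayed deletion shadows them, then the delayed (not yet committed) insertions past f_pos are emitted. A compressed sketch of that control flow; next_on_disk_index() is a hypothetical stand-in for the btrfs_search_slot()/btrfs_next_leaf() walk shown in the hunks:

        /* 1) snapshot the in-memory delayed insertions and deletions */
        INIT_LIST_HEAD(&ins_list);
        INIT_LIST_HEAD(&del_list);
        btrfs_get_delayed_items(inode, &ins_list, &del_list);

        /* 2) walk the on-disk index, skipping shadowed entries */
        while (next_on_disk_index(path, &found_key) == 0) {
                if (btrfs_should_delete_dir_index(&del_list,
                                                  found_key.offset))
                        continue;
                filp->f_pos = found_key.offset;
                is_curr = 1;
                /* filldir() the on-disk entry here */
        }

        /* 3) emit delayed insertions after the last on-disk entry */
        if (is_curr)
                filp->f_pos++;
        ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
                                              &ins_list);

        /* 4) always drop the references taken in step 1 */
        btrfs_put_delayed_items(&ins_list, &del_list);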
@@ -4282,14 +4272,25 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4282 struct btrfs_root *root = BTRFS_I(inode)->root; 4272 struct btrfs_root *root = BTRFS_I(inode)->root;
4283 struct btrfs_trans_handle *trans; 4273 struct btrfs_trans_handle *trans;
4284 int ret = 0; 4274 int ret = 0;
4275 bool nolock = false;
4285 4276
4286 if (BTRFS_I(inode)->dummy_inode) 4277 if (BTRFS_I(inode)->dummy_inode)
4287 return 0; 4278 return 0;
4288 4279
4280 if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode))
4281 nolock = true;
4282
4289 if (wbc->sync_mode == WB_SYNC_ALL) { 4283 if (wbc->sync_mode == WB_SYNC_ALL) {
4290 trans = btrfs_join_transaction(root, 1); 4284 if (nolock)
4291 btrfs_set_trans_block_group(trans, inode); 4285 trans = btrfs_join_transaction_nolock(root);
4292 ret = btrfs_commit_transaction(trans, root); 4286 else
4287 trans = btrfs_join_transaction(root);
4288 if (IS_ERR(trans))
4289 return PTR_ERR(trans);
4290 if (nolock)
4291 ret = btrfs_end_transaction_nolock(trans, root);
4292 else
4293 ret = btrfs_commit_transaction(trans, root);
4293 } 4294 }
4294 return ret; 4295 return ret;
4295} 4296}
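The nolock variant exists because the free-space-cache inode can be written back while the filesystem is closing and the normal transaction machinery is being torn down. A sketch of the selection logic, assuming is_free_space_inode() and btrfs_fs_closing() behave as used in this hunk:

        bool nolock = btrfs_fs_closing(root->fs_info) &&
                      is_free_space_inode(root, inode);

        trans = nolock ? btrfs_join_transaction_nolock(root)
                       : btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        /* the nolock path must not commit; it only ends the handle */
        ret = nolock ? btrfs_end_transaction_nolock(trans, root)
                     : btrfs_commit_transaction(trans, root);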
@@ -4300,7 +4301,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4300 * FIXME, needs more benchmarking...there are no reasons other than performance 4301 * FIXME, needs more benchmarking...there are no reasons other than performance
4301 * to keep or drop this code. 4302 * to keep or drop this code.
4302 */ 4303 */
4303void btrfs_dirty_inode(struct inode *inode) 4304void btrfs_dirty_inode(struct inode *inode, int flags)
4304{ 4305{
4305 struct btrfs_root *root = BTRFS_I(inode)->root; 4306 struct btrfs_root *root = BTRFS_I(inode)->root;
4306 struct btrfs_trans_handle *trans; 4307 struct btrfs_trans_handle *trans;
@@ -4309,8 +4310,8 @@ void btrfs_dirty_inode(struct inode *inode)
4309 if (BTRFS_I(inode)->dummy_inode) 4310 if (BTRFS_I(inode)->dummy_inode)
4310 return; 4311 return;
4311 4312
4312 trans = btrfs_join_transaction(root, 1); 4313 trans = btrfs_join_transaction(root);
4313 btrfs_set_trans_block_group(trans, inode); 4314 BUG_ON(IS_ERR(trans));
4314 4315
4315 ret = btrfs_update_inode(trans, root, inode); 4316 ret = btrfs_update_inode(trans, root, inode);
4316 if (ret && ret == -ENOSPC) { 4317 if (ret && ret == -ENOSPC) {
@@ -4318,25 +4319,24 @@ void btrfs_dirty_inode(struct inode *inode)
4318 btrfs_end_transaction(trans, root); 4319 btrfs_end_transaction(trans, root);
4319 trans = btrfs_start_transaction(root, 1); 4320 trans = btrfs_start_transaction(root, 1);
4320 if (IS_ERR(trans)) { 4321 if (IS_ERR(trans)) {
4321 if (printk_ratelimit()) { 4322 printk_ratelimited(KERN_ERR "btrfs: fail to "
4322 printk(KERN_ERR "btrfs: fail to " 4323 "dirty inode %llu error %ld\n",
4323 "dirty inode %lu error %ld\n", 4324 (unsigned long long)btrfs_ino(inode),
4324 inode->i_ino, PTR_ERR(trans)); 4325 PTR_ERR(trans));
4325 }
4326 return; 4326 return;
4327 } 4327 }
4328 btrfs_set_trans_block_group(trans, inode);
4329 4328
4330 ret = btrfs_update_inode(trans, root, inode); 4329 ret = btrfs_update_inode(trans, root, inode);
4331 if (ret) { 4330 if (ret) {
4332 if (printk_ratelimit()) { 4331 printk_ratelimited(KERN_ERR "btrfs: fail to "
4333 printk(KERN_ERR "btrfs: fail to " 4332 "dirty inode %llu error %d\n",
4334 "dirty inode %lu error %d\n", 4333 (unsigned long long)btrfs_ino(inode),
4335 inode->i_ino, ret); 4334 ret);
4336 }
4337 } 4335 }
4338 } 4336 }
4339 btrfs_end_transaction(trans, root); 4337 btrfs_end_transaction(trans, root);
4338 if (BTRFS_I(inode)->delayed_node)
4339 btrfs_balance_delayed_items(root);
4340} 4340}
4341 4341
4342/* 4342/*
@@ -4352,7 +4352,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
4352 struct extent_buffer *leaf; 4352 struct extent_buffer *leaf;
4353 int ret; 4353 int ret;
4354 4354
4355 key.objectid = inode->i_ino; 4355 key.objectid = btrfs_ino(inode);
4356 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 4356 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
4357 key.offset = (u64)-1; 4357 key.offset = (u64)-1;
4358 4358
@@ -4384,7 +4384,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
4384 leaf = path->nodes[0]; 4384 leaf = path->nodes[0];
4385 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4385 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4386 4386
4387 if (found_key.objectid != inode->i_ino || 4387 if (found_key.objectid != btrfs_ino(inode) ||
4388 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { 4388 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
4389 BTRFS_I(inode)->index_cnt = 2; 4389 BTRFS_I(inode)->index_cnt = 2;
4390 goto out; 4390 goto out;
@@ -4405,9 +4405,12 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
4405 int ret = 0; 4405 int ret = 0;
4406 4406
4407 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 4407 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
4408 ret = btrfs_set_inode_index_count(dir); 4408 ret = btrfs_inode_delayed_dir_index_count(dir);
4409 if (ret) 4409 if (ret) {
4410 return ret; 4410 ret = btrfs_set_inode_index_count(dir);
4411 if (ret)
4412 return ret;
4413 }
4411 } 4414 }
4412 4415
4413 *index = BTRFS_I(dir)->index_cnt; 4416 *index = BTRFS_I(dir)->index_cnt;
@@ -4420,8 +4423,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4420 struct btrfs_root *root, 4423 struct btrfs_root *root,
4421 struct inode *dir, 4424 struct inode *dir,
4422 const char *name, int name_len, 4425 const char *name, int name_len,
4423 u64 ref_objectid, u64 objectid, 4426 u64 ref_objectid, u64 objectid, int mode,
4424 u64 alloc_hint, int mode, u64 *index) 4427 u64 *index)
4425{ 4428{
4426 struct inode *inode; 4429 struct inode *inode;
4427 struct btrfs_inode_item *inode_item; 4430 struct btrfs_inode_item *inode_item;
@@ -4438,12 +4441,23 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4438 BUG_ON(!path); 4441 BUG_ON(!path);
4439 4442
4440 inode = new_inode(root->fs_info->sb); 4443 inode = new_inode(root->fs_info->sb);
4441 if (!inode) 4444 if (!inode) {
4445 btrfs_free_path(path);
4442 return ERR_PTR(-ENOMEM); 4446 return ERR_PTR(-ENOMEM);
4447 }
4448
4449 /*
4450 * we have to initialize this early, so we can reclaim the inode
4451 * number if we fail afterwards in this function.
4452 */
4453 inode->i_ino = objectid;
4443 4454
4444 if (dir) { 4455 if (dir) {
4456 trace_btrfs_inode_request(dir);
4457
4445 ret = btrfs_set_inode_index(dir, index); 4458 ret = btrfs_set_inode_index(dir, index);
4446 if (ret) { 4459 if (ret) {
4460 btrfs_free_path(path);
4447 iput(inode); 4461 iput(inode);
4448 return ERR_PTR(ret); 4462 return ERR_PTR(ret);
4449 } 4463 }
@@ -4456,14 +4470,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4456 BTRFS_I(inode)->index_cnt = 2; 4470 BTRFS_I(inode)->index_cnt = 2;
4457 BTRFS_I(inode)->root = root; 4471 BTRFS_I(inode)->root = root;
4458 BTRFS_I(inode)->generation = trans->transid; 4472 BTRFS_I(inode)->generation = trans->transid;
4473 inode->i_generation = BTRFS_I(inode)->generation;
4459 btrfs_set_inode_space_info(root, inode); 4474 btrfs_set_inode_space_info(root, inode);
4460 4475
4461 if (mode & S_IFDIR) 4476 if (mode & S_IFDIR)
4462 owner = 0; 4477 owner = 0;
4463 else 4478 else
4464 owner = 1; 4479 owner = 1;
4465 BTRFS_I(inode)->block_group =
4466 btrfs_find_block_group(root, 0, alloc_hint, owner);
4467 4480
4468 key[0].objectid = objectid; 4481 key[0].objectid = objectid;
4469 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4482 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -4482,7 +4495,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4482 goto fail; 4495 goto fail;
4483 4496
4484 inode_init_owner(inode, dir, mode); 4497 inode_init_owner(inode, dir, mode);
4485 inode->i_ino = objectid;
4486 inode_set_bytes(inode, 0); 4498 inode_set_bytes(inode, 0);
4487 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4499 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
4488 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4500 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -4509,12 +4521,17 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4509 if ((mode & S_IFREG)) { 4521 if ((mode & S_IFREG)) {
4510 if (btrfs_test_opt(root, NODATASUM)) 4522 if (btrfs_test_opt(root, NODATASUM))
4511 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4523 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4512 if (btrfs_test_opt(root, NODATACOW)) 4524 if (btrfs_test_opt(root, NODATACOW) ||
4525 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4513 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4526 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4514 } 4527 }
4515 4528
4516 insert_inode_hash(inode); 4529 insert_inode_hash(inode);
4517 inode_tree_add(inode); 4530 inode_tree_add(inode);
4531
4532 trace_btrfs_inode_new(inode);
4533 btrfs_set_inode_last_trans(trans, inode);
4534
4518 return inode; 4535 return inode;
4519fail: 4536fail:
4520 if (dir) 4537 if (dir)
@@ -4542,29 +4559,29 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4542 int ret = 0; 4559 int ret = 0;
4543 struct btrfs_key key; 4560 struct btrfs_key key;
4544 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 4561 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
4562 u64 ino = btrfs_ino(inode);
4563 u64 parent_ino = btrfs_ino(parent_inode);
4545 4564
4546 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 4565 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
4547 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 4566 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
4548 } else { 4567 } else {
4549 key.objectid = inode->i_ino; 4568 key.objectid = ino;
4550 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 4569 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
4551 key.offset = 0; 4570 key.offset = 0;
4552 } 4571 }
4553 4572
4554 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 4573 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
4555 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 4574 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
4556 key.objectid, root->root_key.objectid, 4575 key.objectid, root->root_key.objectid,
4557 parent_inode->i_ino, 4576 parent_ino, index, name, name_len);
4558 index, name, name_len);
4559 } else if (add_backref) { 4577 } else if (add_backref) {
4560 ret = btrfs_insert_inode_ref(trans, root, 4578 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
4561 name, name_len, inode->i_ino, 4579 parent_ino, index);
4562 parent_inode->i_ino, index);
4563 } 4580 }
4564 4581
4565 if (ret == 0) { 4582 if (ret == 0) {
4566 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4583 ret = btrfs_insert_dir_item(trans, root, name, name_len,
4567 parent_inode->i_ino, &key, 4584 parent_inode, &key,
4568 btrfs_inode_type(inode), index); 4585 btrfs_inode_type(inode), index);
4569 BUG_ON(ret); 4586 BUG_ON(ret);
4570 4587
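btrfs_add_link() has to special-case subvolume roots: an "inode" whose number is BTRFS_FIRST_FREE_OBJECTID is the top directory of another tree, so its back reference is a root ref in the tree of tree roots rather than an inode ref. A sketch of that selection, with names from the hunk above (the real function only adds the inode ref when add_backref is set):

        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
                /* dir entry points at a subvolume root: the key is the
                 * subvolume's root key, the backref a root ref */
                memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
                ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
                                         key.objectid,
                                         root->root_key.objectid,
                                         parent_ino, index, name, name_len);
        } else {
                /* plain inode: key is (ino, INODE_ITEM, 0), the backref
                 * an inode ref in the same tree */
                key.objectid = ino;
                btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
                key.offset = 0;
                ret = btrfs_insert_inode_ref(trans, root, name, name_len,
                                             ino, parent_ino, index);
        }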
@@ -4577,12 +4594,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4577} 4594}
4578 4595
4579static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4596static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4580 struct dentry *dentry, struct inode *inode, 4597 struct inode *dir, struct dentry *dentry,
4581 int backref, u64 index) 4598 struct inode *inode, int backref, u64 index)
4582{ 4599{
4583 int err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4600 int err = btrfs_add_link(trans, dir, inode,
4584 inode, dentry->d_name.name, 4601 dentry->d_name.name, dentry->d_name.len,
4585 dentry->d_name.len, backref, index); 4602 backref, index);
4586 if (!err) { 4603 if (!err) {
4587 d_instantiate(dentry, inode); 4604 d_instantiate(dentry, inode);
4588 return 0; 4605 return 0;
@@ -4607,10 +4624,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4607 if (!new_valid_dev(rdev)) 4624 if (!new_valid_dev(rdev))
4608 return -EINVAL; 4625 return -EINVAL;
4609 4626
4610 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4611 if (err)
4612 return err;
4613
4614 /* 4627 /*
4615 * 2 for inode item and ref 4628 * 2 for inode item and ref
4616 * 2 for dir items 4629 * 2 for dir items
@@ -4620,24 +4633,25 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4620 if (IS_ERR(trans)) 4633 if (IS_ERR(trans))
4621 return PTR_ERR(trans); 4634 return PTR_ERR(trans);
4622 4635
4623 btrfs_set_trans_block_group(trans, dir); 4636 err = btrfs_find_free_ino(root, &objectid);
4637 if (err)
4638 goto out_unlock;
4624 4639
4625 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4640 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4626 dentry->d_name.len, 4641 dentry->d_name.len, btrfs_ino(dir), objectid,
4627 dentry->d_parent->d_inode->i_ino, objectid, 4642 mode, &index);
4628 BTRFS_I(dir)->block_group, mode, &index); 4643 if (IS_ERR(inode)) {
4629 err = PTR_ERR(inode); 4644 err = PTR_ERR(inode);
4630 if (IS_ERR(inode))
4631 goto out_unlock; 4645 goto out_unlock;
4646 }
4632 4647
4633 err = btrfs_init_inode_security(trans, inode, dir); 4648 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4634 if (err) { 4649 if (err) {
4635 drop_inode = 1; 4650 drop_inode = 1;
4636 goto out_unlock; 4651 goto out_unlock;
4637 } 4652 }
4638 4653
4639 btrfs_set_trans_block_group(trans, inode); 4654 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4640 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4641 if (err) 4655 if (err)
4642 drop_inode = 1; 4656 drop_inode = 1;
4643 else { 4657 else {
@@ -4645,8 +4659,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4645 init_special_inode(inode, inode->i_mode, rdev); 4659 init_special_inode(inode, inode->i_mode, rdev);
4646 btrfs_update_inode(trans, root, inode); 4660 btrfs_update_inode(trans, root, inode);
4647 } 4661 }
4648 btrfs_update_inode_block_group(trans, inode);
4649 btrfs_update_inode_block_group(trans, dir);
4650out_unlock: 4662out_unlock:
4651 nr = trans->blocks_used; 4663 nr = trans->blocks_used;
4652 btrfs_end_transaction_throttle(trans, root); 4664 btrfs_end_transaction_throttle(trans, root);
@@ -4670,9 +4682,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4670 u64 objectid; 4682 u64 objectid;
4671 u64 index = 0; 4683 u64 index = 0;
4672 4684
4673 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4674 if (err)
4675 return err;
4676 /* 4685 /*
4677 * 2 for inode item and ref 4686 * 2 for inode item and ref
4678 * 2 for dir items 4687 * 2 for dir items
@@ -4682,25 +4691,25 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4682 if (IS_ERR(trans)) 4691 if (IS_ERR(trans))
4683 return PTR_ERR(trans); 4692 return PTR_ERR(trans);
4684 4693
4685 btrfs_set_trans_block_group(trans, dir); 4694 err = btrfs_find_free_ino(root, &objectid);
4695 if (err)
4696 goto out_unlock;
4686 4697
4687 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4698 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4688 dentry->d_name.len, 4699 dentry->d_name.len, btrfs_ino(dir), objectid,
4689 dentry->d_parent->d_inode->i_ino, 4700 mode, &index);
4690 objectid, BTRFS_I(dir)->block_group, mode, 4701 if (IS_ERR(inode)) {
4691 &index); 4702 err = PTR_ERR(inode);
4692 err = PTR_ERR(inode);
4693 if (IS_ERR(inode))
4694 goto out_unlock; 4703 goto out_unlock;
4704 }
4695 4705
4696 err = btrfs_init_inode_security(trans, inode, dir); 4706 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4697 if (err) { 4707 if (err) {
4698 drop_inode = 1; 4708 drop_inode = 1;
4699 goto out_unlock; 4709 goto out_unlock;
4700 } 4710 }
4701 4711
4702 btrfs_set_trans_block_group(trans, inode); 4712 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4703 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4704 if (err) 4713 if (err)
4705 drop_inode = 1; 4714 drop_inode = 1;
4706 else { 4715 else {
@@ -4710,8 +4719,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4710 inode->i_op = &btrfs_file_inode_operations; 4719 inode->i_op = &btrfs_file_inode_operations;
4711 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4720 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4712 } 4721 }
4713 btrfs_update_inode_block_group(trans, inode);
4714 btrfs_update_inode_block_group(trans, dir);
4715out_unlock: 4722out_unlock:
4716 nr = trans->blocks_used; 4723 nr = trans->blocks_used;
4717 btrfs_end_transaction_throttle(trans, root); 4724 btrfs_end_transaction_throttle(trans, root);
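Note the ordering change common to btrfs_mknod(), btrfs_create() and btrfs_mkdir(): the transaction is reserved first and only then is an objectid pulled from the free-ino cache, so a failure after allocation can hand the number back (see btrfs_return_ino() in the evict path earlier in this diff). A condensed sketch of the shared sequence:

        trans = btrfs_start_transaction(root, 5); /* inode+ref, dirs, parent */
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        err = btrfs_find_free_ino(root, &objectid);
        if (err)
                goto out_unlock;

        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len, btrfs_ino(dir),
                                objectid, mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto out_unlock;
        }

        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (!err)
                err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);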
@@ -4734,41 +4741,42 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4734 int err; 4741 int err;
4735 int drop_inode = 0; 4742 int drop_inode = 0;
4736 4743
4737 if (inode->i_nlink == 0)
4738 return -ENOENT;
4739
4740 /* do not allow sys_link's with other subvols of the same device */ 4744 /* do not allow sys_link's with other subvols of the same device */
4741 if (root->objectid != BTRFS_I(inode)->root->objectid) 4745 if (root->objectid != BTRFS_I(inode)->root->objectid)
4742 return -EPERM; 4746 return -EXDEV;
4743 4747
4744 btrfs_inc_nlink(inode); 4748 if (inode->i_nlink == ~0U)
4749 return -EMLINK;
4745 4750
4746 err = btrfs_set_inode_index(dir, &index); 4751 err = btrfs_set_inode_index(dir, &index);
4747 if (err) 4752 if (err)
4748 goto fail; 4753 goto fail;
4749 4754
4750 /* 4755 /*
4751 * 1 item for inode ref 4756 * 2 items for inode and inode ref
4752 * 2 items for dir items 4757 * 2 items for dir items
4758 * 1 item for parent inode
4753 */ 4759 */
4754 trans = btrfs_start_transaction(root, 3); 4760 trans = btrfs_start_transaction(root, 5);
4755 if (IS_ERR(trans)) { 4761 if (IS_ERR(trans)) {
4756 err = PTR_ERR(trans); 4762 err = PTR_ERR(trans);
4757 goto fail; 4763 goto fail;
4758 } 4764 }
4759 4765
4760 btrfs_set_trans_block_group(trans, dir); 4766 btrfs_inc_nlink(inode);
4761 atomic_inc(&inode->i_count); 4767 inode->i_ctime = CURRENT_TIME;
4768 ihold(inode);
4762 4769
4763 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4770 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
4764 4771
4765 if (err) { 4772 if (err) {
4766 drop_inode = 1; 4773 drop_inode = 1;
4767 } else { 4774 } else {
4768 btrfs_update_inode_block_group(trans, dir); 4775 struct dentry *parent = dget_parent(dentry);
4769 err = btrfs_update_inode(trans, root, inode); 4776 err = btrfs_update_inode(trans, root, inode);
4770 BUG_ON(err); 4777 BUG_ON(err);
4771 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); 4778 btrfs_log_new_name(trans, inode, NULL, parent);
4779 dput(parent);
4772 } 4780 }
4773 4781
4774 nr = trans->blocks_used; 4782 nr = trans->blocks_used;
@@ -4793,10 +4801,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4793 u64 index = 0; 4801 u64 index = 0;
4794 unsigned long nr = 1; 4802 unsigned long nr = 1;
4795 4803
4796 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4797 if (err)
4798 return err;
4799
4800 /* 4804 /*
4801 * 2 items for inode and ref 4805 * 2 items for inode and ref
4802 * 2 items for dir items 4806 * 2 items for dir items
@@ -4805,13 +4809,14 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4805 trans = btrfs_start_transaction(root, 5); 4809 trans = btrfs_start_transaction(root, 5);
4806 if (IS_ERR(trans)) 4810 if (IS_ERR(trans))
4807 return PTR_ERR(trans); 4811 return PTR_ERR(trans);
4808 btrfs_set_trans_block_group(trans, dir); 4812
4813 err = btrfs_find_free_ino(root, &objectid);
4814 if (err)
4815 goto out_fail;
4809 4816
4810 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4817 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4811 dentry->d_name.len, 4818 dentry->d_name.len, btrfs_ino(dir), objectid,
4812 dentry->d_parent->d_inode->i_ino, objectid, 4819 S_IFDIR | mode, &index);
4813 BTRFS_I(dir)->block_group, S_IFDIR | mode,
4814 &index);
4815 if (IS_ERR(inode)) { 4820 if (IS_ERR(inode)) {
4816 err = PTR_ERR(inode); 4821 err = PTR_ERR(inode);
4817 goto out_fail; 4822 goto out_fail;
@@ -4819,29 +4824,25 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4819 4824
4820 drop_on_err = 1; 4825 drop_on_err = 1;
4821 4826
4822 err = btrfs_init_inode_security(trans, inode, dir); 4827 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4823 if (err) 4828 if (err)
4824 goto out_fail; 4829 goto out_fail;
4825 4830
4826 inode->i_op = &btrfs_dir_inode_operations; 4831 inode->i_op = &btrfs_dir_inode_operations;
4827 inode->i_fop = &btrfs_dir_file_operations; 4832 inode->i_fop = &btrfs_dir_file_operations;
4828 btrfs_set_trans_block_group(trans, inode);
4829 4833
4830 btrfs_i_size_write(inode, 0); 4834 btrfs_i_size_write(inode, 0);
4831 err = btrfs_update_inode(trans, root, inode); 4835 err = btrfs_update_inode(trans, root, inode);
4832 if (err) 4836 if (err)
4833 goto out_fail; 4837 goto out_fail;
4834 4838
4835 err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4839 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
4836 inode, dentry->d_name.name, 4840 dentry->d_name.len, 0, index);
4837 dentry->d_name.len, 0, index);
4838 if (err) 4841 if (err)
4839 goto out_fail; 4842 goto out_fail;
4840 4843
4841 d_instantiate(dentry, inode); 4844 d_instantiate(dentry, inode);
4842 drop_on_err = 0; 4845 drop_on_err = 0;
4843 btrfs_update_inode_block_group(trans, inode);
4844 btrfs_update_inode_block_group(trans, dir);
4845 4846
4846out_fail: 4847out_fail:
4847 nr = trans->blocks_used; 4848 nr = trans->blocks_used;
@@ -4886,19 +4887,23 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4886 size_t max_size; 4887 size_t max_size;
4887 unsigned long inline_size; 4888 unsigned long inline_size;
4888 unsigned long ptr; 4889 unsigned long ptr;
4890 int compress_type;
4889 4891
4890 WARN_ON(pg_offset != 0); 4892 WARN_ON(pg_offset != 0);
4893 compress_type = btrfs_file_extent_compression(leaf, item);
4891 max_size = btrfs_file_extent_ram_bytes(leaf, item); 4894 max_size = btrfs_file_extent_ram_bytes(leaf, item);
4892 inline_size = btrfs_file_extent_inline_item_len(leaf, 4895 inline_size = btrfs_file_extent_inline_item_len(leaf,
4893 btrfs_item_nr(leaf, path->slots[0])); 4896 btrfs_item_nr(leaf, path->slots[0]));
4894 tmp = kmalloc(inline_size, GFP_NOFS); 4897 tmp = kmalloc(inline_size, GFP_NOFS);
4898 if (!tmp)
4899 return -ENOMEM;
4895 ptr = btrfs_file_extent_inline_start(item); 4900 ptr = btrfs_file_extent_inline_start(item);
4896 4901
4897 read_extent_buffer(leaf, tmp, ptr, inline_size); 4902 read_extent_buffer(leaf, tmp, ptr, inline_size);
4898 4903
4899 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 4904 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
4900 ret = btrfs_zlib_decompress(tmp, page, extent_offset, 4905 ret = btrfs_decompress(compress_type, tmp, page,
4901 inline_size, max_size); 4906 extent_offset, inline_size, max_size);
4902 if (ret) { 4907 if (ret) {
4903 char *kaddr = kmap_atomic(page, KM_USER0); 4908 char *kaddr = kmap_atomic(page, KM_USER0);
4904 unsigned long copy_size = min_t(u64, 4909 unsigned long copy_size = min_t(u64,
@@ -4929,7 +4934,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4929 u64 bytenr; 4934 u64 bytenr;
4930 u64 extent_start = 0; 4935 u64 extent_start = 0;
4931 u64 extent_end = 0; 4936 u64 extent_end = 0;
4932 u64 objectid = inode->i_ino; 4937 u64 objectid = btrfs_ino(inode);
4933 u32 found_type; 4938 u32 found_type;
4934 struct btrfs_path *path = NULL; 4939 struct btrfs_path *path = NULL;
4935 struct btrfs_root *root = BTRFS_I(inode)->root; 4940 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4940,7 +4945,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4940 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4945 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4941 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4946 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4942 struct btrfs_trans_handle *trans = NULL; 4947 struct btrfs_trans_handle *trans = NULL;
4943 int compressed; 4948 int compress_type;
4944 4949
4945again: 4950again:
4946 read_lock(&em_tree->lock); 4951 read_lock(&em_tree->lock);
@@ -4957,7 +4962,7 @@ again:
4957 else 4962 else
4958 goto out; 4963 goto out;
4959 } 4964 }
4960 em = alloc_extent_map(GFP_NOFS); 4965 em = alloc_extent_map();
4961 if (!em) { 4966 if (!em) {
4962 err = -ENOMEM; 4967 err = -ENOMEM;
4963 goto out; 4968 goto out;
@@ -4970,7 +4975,15 @@ again:
4970 4975
4971 if (!path) { 4976 if (!path) {
4972 path = btrfs_alloc_path(); 4977 path = btrfs_alloc_path();
4973 BUG_ON(!path); 4978 if (!path) {
4979 err = -ENOMEM;
4980 goto out;
4981 }
4982 /*
4983 * Chances are we'll be called again, so go ahead and do
4984 * readahead
4985 */
4986 path->reada = 1;
4974 } 4987 }
4975 4988
4976 ret = btrfs_lookup_file_extent(trans, root, path, 4989 ret = btrfs_lookup_file_extent(trans, root, path,
@@ -4999,7 +5012,7 @@ again:
4999 5012
5000 found_type = btrfs_file_extent_type(leaf, item); 5013 found_type = btrfs_file_extent_type(leaf, item);
5001 extent_start = found_key.offset; 5014 extent_start = found_key.offset;
5002 compressed = btrfs_file_extent_compression(leaf, item); 5015 compress_type = btrfs_file_extent_compression(leaf, item);
5003 if (found_type == BTRFS_FILE_EXTENT_REG || 5016 if (found_type == BTRFS_FILE_EXTENT_REG ||
5004 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5017 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5005 extent_end = extent_start + 5018 extent_end = extent_start +
@@ -5045,8 +5058,9 @@ again:
5045 em->block_start = EXTENT_MAP_HOLE; 5058 em->block_start = EXTENT_MAP_HOLE;
5046 goto insert; 5059 goto insert;
5047 } 5060 }
5048 if (compressed) { 5061 if (compress_type != BTRFS_COMPRESS_NONE) {
5049 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5062 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5063 em->compress_type = compress_type;
5050 em->block_start = bytenr; 5064 em->block_start = bytenr;
5051 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5065 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
5052 item); 5066 item);
@@ -5080,12 +5094,14 @@ again:
5080 em->len = (copy_size + root->sectorsize - 1) & 5094 em->len = (copy_size + root->sectorsize - 1) &
5081 ~((u64)root->sectorsize - 1); 5095 ~((u64)root->sectorsize - 1);
5082 em->orig_start = EXTENT_MAP_INLINE; 5096 em->orig_start = EXTENT_MAP_INLINE;
5083 if (compressed) 5097 if (compress_type) {
5084 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5098 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5099 em->compress_type = compress_type;
5100 }
5085 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5101 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
5086 if (create == 0 && !PageUptodate(page)) { 5102 if (create == 0 && !PageUptodate(page)) {
5087 if (btrfs_file_extent_compression(leaf, item) == 5103 if (btrfs_file_extent_compression(leaf, item) !=
5088 BTRFS_COMPRESS_ZLIB) { 5104 BTRFS_COMPRESS_NONE) {
5089 ret = uncompress_inline(path, inode, page, 5105 ret = uncompress_inline(path, inode, page,
5090 pg_offset, 5106 pg_offset,
5091 extent_offset, item); 5107 extent_offset, item);
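With lzo in the tree, callers no longer hardcode zlib: the on-disk file extent records a compress_type and btrfs_decompress() dispatches on it, as uncompress_inline() does above. A sketch of that call pattern with the buffer handling simplified; tmp, inline_size and max_size are set up as in the earlier hunk:

        int compress_type = btrfs_file_extent_compression(leaf, item);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                /* inline payload -> private buffer -> page */
                read_extent_buffer(leaf, tmp,
                                   btrfs_file_extent_inline_start(item),
                                   inline_size);
                ret = btrfs_decompress(compress_type, tmp, page,
                                       extent_offset, inline_size,
                                       max_size);
        }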
@@ -5108,8 +5124,12 @@ again:
5108 kunmap(page); 5124 kunmap(page);
5109 free_extent_map(em); 5125 free_extent_map(em);
5110 em = NULL; 5126 em = NULL;
5111 btrfs_release_path(root, path); 5127
5112 trans = btrfs_join_transaction(root, 1); 5128 btrfs_release_path(path);
5129 trans = btrfs_join_transaction(root);
5130
5131 if (IS_ERR(trans))
5132 return ERR_CAST(trans);
5113 goto again; 5133 goto again;
5114 } 5134 }
5115 map = kmap(page); 5135 map = kmap(page);
@@ -5119,7 +5139,7 @@ again:
5119 btrfs_mark_buffer_dirty(leaf); 5139 btrfs_mark_buffer_dirty(leaf);
5120 } 5140 }
5121 set_extent_uptodate(io_tree, em->start, 5141 set_extent_uptodate(io_tree, em->start,
5122 extent_map_end(em) - 1, GFP_NOFS); 5142 extent_map_end(em) - 1, NULL, GFP_NOFS);
5123 goto insert; 5143 goto insert;
5124 } else { 5144 } else {
5125 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5145 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
@@ -5132,7 +5152,7 @@ not_found_em:
5132 em->block_start = EXTENT_MAP_HOLE; 5152 em->block_start = EXTENT_MAP_HOLE;
5133 set_bit(EXTENT_FLAG_VACANCY, &em->flags); 5153 set_bit(EXTENT_FLAG_VACANCY, &em->flags);
5134insert: 5154insert:
5135 btrfs_release_path(root, path); 5155 btrfs_release_path(path);
5136 if (em->start > start || extent_map_end(em) <= start) { 5156 if (em->start > start || extent_map_end(em) <= start) {
5137 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " 5157 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
5138 "[%llu %llu]\n", (unsigned long long)em->start, 5158 "[%llu %llu]\n", (unsigned long long)em->start,
@@ -5186,6 +5206,9 @@ insert:
5186 } 5206 }
5187 write_unlock(&em_tree->lock); 5207 write_unlock(&em_tree->lock);
5188out: 5208out:
5209
5210 trace_btrfs_get_extent(root, em);
5211
5189 if (path) 5212 if (path)
5190 btrfs_free_path(path); 5213 btrfs_free_path(path);
5191 if (trans) { 5214 if (trans) {
@@ -5200,22 +5223,160 @@ out:
5200 return em; 5223 return em;
5201} 5224}
5202 5225
5226struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
5227 size_t pg_offset, u64 start, u64 len,
5228 int create)
5229{
5230 struct extent_map *em;
5231 struct extent_map *hole_em = NULL;
5232 u64 range_start = start;
5233 u64 end;
5234 u64 found;
5235 u64 found_end;
5236 int err = 0;
5237
5238 em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
5239 if (IS_ERR(em))
5240 return em;
5241 if (em) {
5242 /*
5243 * if our em maps to a hole, there might
5244 * actually be delalloc bytes behind it
5245 */
5246 if (em->block_start != EXTENT_MAP_HOLE)
5247 return em;
5248 else
5249 hole_em = em;
5250 }
5251
5252 /* check to see if we've wrapped (len == -1 or similar) */
5253 end = start + len;
5254 if (end < start)
5255 end = (u64)-1;
5256 else
5257 end -= 1;
5258
5259 em = NULL;
5260
 5261 /* ok, we didn't find anything, let's look for delalloc */
5262 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
5263 end, len, EXTENT_DELALLOC, 1);
5264 found_end = range_start + found;
5265 if (found_end < range_start)
5266 found_end = (u64)-1;
5267
5268 /*
5269 * we didn't find anything useful, return
5270 * the original results from get_extent()
5271 */
5272 if (range_start > end || found_end <= start) {
5273 em = hole_em;
5274 hole_em = NULL;
5275 goto out;
5276 }
5277
5278 /* adjust the range_start to make sure it doesn't
5279 * go backwards from the start they passed in
5280 */
 5281 range_start = max(start, range_start);
5282 found = found_end - range_start;
5283
5284 if (found > 0) {
5285 u64 hole_start = start;
5286 u64 hole_len = len;
5287
5288 em = alloc_extent_map();
5289 if (!em) {
5290 err = -ENOMEM;
5291 goto out;
5292 }
5293 /*
5294 * when btrfs_get_extent can't find anything it
5295 * returns one huge hole
5296 *
5297 * make sure what it found really fits our range, and
5298 * adjust to make sure it is based on the start from
5299 * the caller
5300 */
5301 if (hole_em) {
5302 u64 calc_end = extent_map_end(hole_em);
5303
5304 if (calc_end <= start || (hole_em->start > end)) {
5305 free_extent_map(hole_em);
5306 hole_em = NULL;
5307 } else {
5308 hole_start = max(hole_em->start, start);
5309 hole_len = calc_end - hole_start;
5310 }
5311 }
5312 em->bdev = NULL;
5313 if (hole_em && range_start > hole_start) {
5314 /* our hole starts before our delalloc, so we
5315 * have to return just the parts of the hole
5316 * that go until the delalloc starts
5317 */
5318 em->len = min(hole_len,
5319 range_start - hole_start);
5320 em->start = hole_start;
5321 em->orig_start = hole_start;
5322 /*
5323 * don't adjust block start at all,
5324 * it is fixed at EXTENT_MAP_HOLE
5325 */
5326 em->block_start = hole_em->block_start;
5327 em->block_len = hole_len;
5328 } else {
5329 em->start = range_start;
5330 em->len = found;
5331 em->orig_start = range_start;
5332 em->block_start = EXTENT_MAP_DELALLOC;
5333 em->block_len = found;
5334 }
5335 } else if (hole_em) {
5336 return hole_em;
5337 }
5338out:
5339
5340 free_extent_map(hole_em);
5341 if (err) {
5342 free_extent_map(em);
5343 return ERR_PTR(err);
5344 }
5345 return em;
5346}
5347
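
/*
 * Illustrative sketch, not part of the patch: btrfs_get_extent_fiemap above
 * intersects the caller's [start, start+len) window with whatever delalloc
 * range it finds, guarding against u64 wraparound when len is huge
 * (len == -1). Runnable userspace version of just that arithmetic; all
 * names are hypothetical:
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t range_end(uint64_t start, uint64_t len)
{
	uint64_t end = start + len;

	/* a huge len wraps past the top; clamp to the maximum offset */
	return end < start ? (uint64_t)-1 : end - 1;
}

static uint64_t clamp_found_start(uint64_t start, uint64_t found_start)
{
	/* never report delalloc before the offset the caller asked about */
	return found_start > start ? found_start : start;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)range_end(4096, (uint64_t)-1));
	printf("%llu\n", (unsigned long long)clamp_found_start(8192, 4096));
	return 0;
}
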
5203static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5348static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5349 struct extent_map *em,
5204 u64 start, u64 len) 5350 u64 start, u64 len)
5205{ 5351{
5206 struct btrfs_root *root = BTRFS_I(inode)->root; 5352 struct btrfs_root *root = BTRFS_I(inode)->root;
5207 struct btrfs_trans_handle *trans; 5353 struct btrfs_trans_handle *trans;
5208 struct extent_map *em;
5209 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5354 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5210 struct btrfs_key ins; 5355 struct btrfs_key ins;
5211 u64 alloc_hint; 5356 u64 alloc_hint;
5212 int ret; 5357 int ret;
5358 bool insert = false;
5213 5359
5214 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5360 /*
 5361 * Ok, if the extent map we looked up is a hole and is for the exact
 5362 * range we want, there is no reason to allocate a new one. However, if
 5363 * it is not right, then we need to free this one and drop the cache for
5364 * our range.
5365 */
5366 if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5367 em->len != len) {
5368 free_extent_map(em);
5369 em = NULL;
5370 insert = true;
5371 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5372 }
5215 5373
5216 trans = btrfs_join_transaction(root, 0); 5374 trans = btrfs_join_transaction(root);
5217 if (!trans) 5375 if (IS_ERR(trans))
5218 return ERR_PTR(-ENOMEM); 5376 return ERR_CAST(trans);
5377
5378 if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
5379 btrfs_add_inode_defrag(trans, inode);
5219 5380
5220 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5381 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5221 5382
@@ -5227,10 +5388,12 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5227 goto out; 5388 goto out;
5228 } 5389 }
5229 5390
5230 em = alloc_extent_map(GFP_NOFS);
5231 if (!em) { 5391 if (!em) {
5232 em = ERR_PTR(-ENOMEM); 5392 em = alloc_extent_map();
5233 goto out; 5393 if (!em) {
5394 em = ERR_PTR(-ENOMEM);
5395 goto out;
5396 }
5234 } 5397 }
5235 5398
5236 em->start = start; 5399 em->start = start;
@@ -5240,9 +5403,15 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5240 em->block_start = ins.objectid; 5403 em->block_start = ins.objectid;
5241 em->block_len = ins.offset; 5404 em->block_len = ins.offset;
5242 em->bdev = root->fs_info->fs_devices->latest_bdev; 5405 em->bdev = root->fs_info->fs_devices->latest_bdev;
5406
5407 /*
5408 * We need to do this because if we're using the original em we searched
5409 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5410 */
5411 em->flags = 0;
5243 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5412 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5244 5413
5245 while (1) { 5414 while (insert) {
5246 write_lock(&em_tree->lock); 5415 write_lock(&em_tree->lock);
5247 ret = add_extent_mapping(em_tree, em); 5416 ret = add_extent_mapping(em_tree, em);
5248 write_unlock(&em_tree->lock); 5417 write_unlock(&em_tree->lock);
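
/*
 * Illustrative sketch, not part of the patch: btrfs_new_extent_direct now
 * recycles the extent map handed in when it already describes exactly the
 * hole being filled, and only reinserts into the tree when it had to drop
 * the old one and allocate fresh. Generic shape of that decision under
 * invented types and names:
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct mapping { uint64_t start, len; bool is_hole; };

static struct mapping *reuse_or_alloc(struct mapping *m, uint64_t start,
				      uint64_t len, bool *needs_insert)
{
	if (m && m->is_hole && m->start == start && m->len == len) {
		*needs_insert = false;	/* already in the tree, recycle it */
		return m;
	}
	free(m);			/* wrong shape: drop it and start over */
	*needs_insert = true;
	return calloc(1, sizeof(*m));
}

int main(void)
{
	bool insert;
	struct mapping *m = reuse_or_alloc(NULL, 0, 4096, &insert);

	printf("insert=%d\n", insert);
	free(m);
	return 0;
}
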
@@ -5286,7 +5455,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5286 if (!path) 5455 if (!path)
5287 return -ENOMEM; 5456 return -ENOMEM;
5288 5457
5289 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 5458 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
5290 offset, 0); 5459 offset, 0);
5291 if (ret < 0) 5460 if (ret < 0)
5292 goto out; 5461 goto out;
@@ -5303,7 +5472,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5303 ret = 0; 5472 ret = 0;
5304 leaf = path->nodes[0]; 5473 leaf = path->nodes[0];
5305 btrfs_item_key_to_cpu(leaf, &key, slot); 5474 btrfs_item_key_to_cpu(leaf, &key, slot);
5306 if (key.objectid != inode->i_ino || 5475 if (key.objectid != btrfs_ino(inode) ||
5307 key.type != BTRFS_EXTENT_DATA_KEY) { 5476 key.type != BTRFS_EXTENT_DATA_KEY) {
5308 /* not our file or wrong item type, must cow */ 5477 /* not our file or wrong item type, must cow */
5309 goto out; 5478 goto out;
@@ -5337,7 +5506,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5337 * look for other files referencing this extent, if we 5506 * look for other files referencing this extent, if we
5338 * find any we must cow 5507 * find any we must cow
5339 */ 5508 */
5340 if (btrfs_cross_ref_exist(trans, root, inode->i_ino, 5509 if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
5341 key.offset - backref_offset, disk_bytenr)) 5510 key.offset - backref_offset, disk_bytenr))
5342 goto out; 5511 goto out;
5343 5512
@@ -5438,8 +5607,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5438 * to make sure the current transaction stays open 5607 * to make sure the current transaction stays open
5439 * while we look for nocow cross refs 5608 * while we look for nocow cross refs
5440 */ 5609 */
5441 trans = btrfs_join_transaction(root, 0); 5610 trans = btrfs_join_transaction(root);
5442 if (!trans) 5611 if (IS_ERR(trans))
5443 goto must_cow; 5612 goto must_cow;
5444 5613
5445 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5614 if (can_nocow_odirect(trans, inode, start, len) == 1) {
@@ -5460,8 +5629,7 @@ must_cow:
5460 * it above 5629 * it above
5461 */ 5630 */
5462 len = bh_result->b_size; 5631 len = bh_result->b_size;
5463 free_extent_map(em); 5632 em = btrfs_new_extent_direct(inode, em, start, len);
5464 em = btrfs_new_extent_direct(inode, start, len);
5465 if (IS_ERR(em)) 5633 if (IS_ERR(em))
5466 return PTR_ERR(em); 5634 return PTR_ERR(em);
5467 len = min(len, em->len - (start - em->start)); 5635 len = min(len, em->len - (start - em->start));
@@ -5490,13 +5658,21 @@ struct btrfs_dio_private {
5490 u64 bytes; 5658 u64 bytes;
5491 u32 *csums; 5659 u32 *csums;
5492 void *private; 5660 void *private;
5661
5662 /* number of bios pending for this dio */
5663 atomic_t pending_bios;
5664
5665 /* IO errors */
5666 int errors;
5667
5668 struct bio *orig_bio;
5493}; 5669};
5494 5670
5495static void btrfs_endio_direct_read(struct bio *bio, int err) 5671static void btrfs_endio_direct_read(struct bio *bio, int err)
5496{ 5672{
5673 struct btrfs_dio_private *dip = bio->bi_private;
5497 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 5674 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5498 struct bio_vec *bvec = bio->bi_io_vec; 5675 struct bio_vec *bvec = bio->bi_io_vec;
5499 struct btrfs_dio_private *dip = bio->bi_private;
5500 struct inode *inode = dip->inode; 5676 struct inode *inode = dip->inode;
5501 struct btrfs_root *root = BTRFS_I(inode)->root; 5677 struct btrfs_root *root = BTRFS_I(inode)->root;
5502 u64 start; 5678 u64 start;
@@ -5520,9 +5696,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
5520 5696
5521 flush_dcache_page(bvec->bv_page); 5697 flush_dcache_page(bvec->bv_page);
5522 if (csum != *private) { 5698 if (csum != *private) {
5523 printk(KERN_ERR "btrfs csum failed ino %lu off" 5699 printk(KERN_ERR "btrfs csum failed ino %llu off"
5524 " %llu csum %u private %u\n", 5700 " %llu csum %u private %u\n",
5525 inode->i_ino, (unsigned long long)start, 5701 (unsigned long long)btrfs_ino(inode),
5702 (unsigned long long)start,
5526 csum, *private); 5703 csum, *private);
5527 err = -EIO; 5704 err = -EIO;
5528 } 5705 }
@@ -5539,6 +5716,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
5539 5716
5540 kfree(dip->csums); 5717 kfree(dip->csums);
5541 kfree(dip); 5718 kfree(dip);
5719
5720 /* If we had a csum failure make sure to clear the uptodate flag */
5721 if (err)
5722 clear_bit(BIO_UPTODATE, &bio->bi_flags);
5542 dio_end_io(bio, err); 5723 dio_end_io(bio, err);
5543} 5724}
5544 5725
@@ -5550,20 +5731,23 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5550 struct btrfs_trans_handle *trans; 5731 struct btrfs_trans_handle *trans;
5551 struct btrfs_ordered_extent *ordered = NULL; 5732 struct btrfs_ordered_extent *ordered = NULL;
5552 struct extent_state *cached_state = NULL; 5733 struct extent_state *cached_state = NULL;
5734 u64 ordered_offset = dip->logical_offset;
5735 u64 ordered_bytes = dip->bytes;
5553 int ret; 5736 int ret;
5554 5737
5555 if (err) 5738 if (err)
5556 goto out_done; 5739 goto out_done;
5557 5740again:
5558 ret = btrfs_dec_test_ordered_pending(inode, &ordered, 5741 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
5559 dip->logical_offset, dip->bytes); 5742 &ordered_offset,
5743 ordered_bytes);
5560 if (!ret) 5744 if (!ret)
5561 goto out_done; 5745 goto out_test;
5562 5746
5563 BUG_ON(!ordered); 5747 BUG_ON(!ordered);
5564 5748
5565 trans = btrfs_join_transaction(root, 1); 5749 trans = btrfs_join_transaction(root);
5566 if (!trans) { 5750 if (IS_ERR(trans)) {
5567 err = -ENOMEM; 5751 err = -ENOMEM;
5568 goto out; 5752 goto out;
5569 } 5753 }
@@ -5609,8 +5793,10 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5609 } 5793 }
5610 5794
5611 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5795 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5612 btrfs_ordered_update_i_size(inode, 0, ordered); 5796 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5613 btrfs_update_inode(trans, root, inode); 5797 if (!ret)
5798 btrfs_update_inode(trans, root, inode);
5799 ret = 0;
5614out_unlock: 5800out_unlock:
5615 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5801 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5616 ordered->file_offset + ordered->len - 1, 5802 ordered->file_offset + ordered->len - 1,
@@ -5618,13 +5804,29 @@ out_unlock:
5618out: 5804out:
5619 btrfs_delalloc_release_metadata(inode, ordered->len); 5805 btrfs_delalloc_release_metadata(inode, ordered->len);
5620 btrfs_end_transaction(trans, root); 5806 btrfs_end_transaction(trans, root);
5807 ordered_offset = ordered->file_offset + ordered->len;
5621 btrfs_put_ordered_extent(ordered); 5808 btrfs_put_ordered_extent(ordered);
5622 btrfs_put_ordered_extent(ordered); 5809 btrfs_put_ordered_extent(ordered);
5810
5811out_test:
5812 /*
5813 * our bio might span multiple ordered extents. If we haven't
5814 * completed the accounting for the whole dio, go back and try again
5815 */
5816 if (ordered_offset < dip->logical_offset + dip->bytes) {
5817 ordered_bytes = dip->logical_offset + dip->bytes -
5818 ordered_offset;
5819 goto again;
5820 }
5623out_done: 5821out_done:
5624 bio->bi_private = dip->private; 5822 bio->bi_private = dip->private;
5625 5823
5626 kfree(dip->csums); 5824 kfree(dip->csums);
5627 kfree(dip); 5825 kfree(dip);
5826
5827 /* If we had an error make sure to clear the uptodate flag */
5828 if (err)
5829 clear_bit(BIO_UPTODATE, &bio->bi_flags);
5628 dio_end_io(bio, err); 5830 dio_end_io(bio, err);
5629} 5831}
5630 5832
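
/*
 * Illustrative sketch, not part of the patch: the again/out_test loop in
 * btrfs_endio_direct_write above retires ordered extents one at a time
 * until the whole dio range is accounted for. Runnable userspace analogue
 * of that cursor-advance loop; the extent sizes are made up:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t dio_start = 0, dio_bytes = 12288;
	const uint64_t extent_len[] = { 4096, 4096, 4096 };
	uint64_t offset = dio_start;
	int i = 0;

	while (offset < dio_start + dio_bytes) {
		/* complete one ordered extent, then advance the cursor */
		printf("retire extent at %llu len %llu\n",
		       (unsigned long long)offset,
		       (unsigned long long)extent_len[i]);
		offset += extent_len[i++];
	}
	return 0;
}
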
@@ -5639,13 +5841,207 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5639 return 0; 5841 return 0;
5640} 5842}
5641 5843
5844static void btrfs_end_dio_bio(struct bio *bio, int err)
5845{
5846 struct btrfs_dio_private *dip = bio->bi_private;
5847
5848 if (err) {
5849 printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
5850 "sector %#Lx len %u err no %d\n",
5851 (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw,
5852 (unsigned long long)bio->bi_sector, bio->bi_size, err);
5853 dip->errors = 1;
5854
5855 /*
 5856 * before the atomic variable reaches zero, we must make sure
5857 * dip->errors is perceived to be set.
5858 */
5859 smp_mb__before_atomic_dec();
5860 }
5861
5862 /* if there are more bios still pending for this dio, just exit */
5863 if (!atomic_dec_and_test(&dip->pending_bios))
5864 goto out;
5865
5866 if (dip->errors)
5867 bio_io_error(dip->orig_bio);
5868 else {
5869 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
5870 bio_endio(dip->orig_bio, 0);
5871 }
5872out:
5873 bio_put(bio);
5874}
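
/*
 * Illustrative sketch, not part of the patch: the split-dio completion above
 * is the "last one out completes the parent" refcount pattern - every
 * sub-bio holds a reference, and errors are recorded before the decrement
 * so the final decrementer is guaranteed to observe them. Runnable C11
 * version with stdatomic; names are invented:
 */
#include <stdatomic.h>
#include <stdio.h>

struct dio_state {
	atomic_int pending;
	atomic_int errors;
};

static void sub_io_done(struct dio_state *s, int err)
{
	if (err)	/* publish the error before dropping our reference */
		atomic_store(&s->errors, 1);

	/* the seq_cst RMW orders the store above before the final check */
	if (atomic_fetch_sub(&s->pending, 1) == 1)
		printf("dio complete, err=%d\n", atomic_load(&s->errors));
}

int main(void)
{
	struct dio_state s = { .pending = 3, .errors = 0 };

	sub_io_done(&s, 0);
	sub_io_done(&s, -5);
	sub_io_done(&s, 0);	/* last drop reports err=1 */
	return 0;
}
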
5875
5876static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5877 u64 first_sector, gfp_t gfp_flags)
5878{
5879 int nr_vecs = bio_get_nr_vecs(bdev);
5880 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
5881}
5882
5883static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5884 int rw, u64 file_offset, int skip_sum,
5885 u32 *csums, int async_submit)
5886{
5887 int write = rw & REQ_WRITE;
5888 struct btrfs_root *root = BTRFS_I(inode)->root;
5889 int ret;
5890
5891 bio_get(bio);
5892 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5893 if (ret)
5894 goto err;
5895
5896 if (skip_sum)
5897 goto map;
5898
5899 if (write && async_submit) {
5900 ret = btrfs_wq_submit_bio(root->fs_info,
5901 inode, rw, bio, 0, 0,
5902 file_offset,
5903 __btrfs_submit_bio_start_direct_io,
5904 __btrfs_submit_bio_done);
5905 goto err;
5906 } else if (write) {
5907 /*
5908 * If we aren't doing async submit, calculate the csum of the
5909 * bio now.
5910 */
5911 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
5912 if (ret)
5913 goto err;
5914 } else if (!skip_sum) {
5915 ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
5916 file_offset, csums);
5917 if (ret)
5918 goto err;
5919 }
5920
5921map:
5922 ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
5923err:
5924 bio_put(bio);
5925 return ret;
5926}
5927
5928static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5929 int skip_sum)
5930{
5931 struct inode *inode = dip->inode;
5932 struct btrfs_root *root = BTRFS_I(inode)->root;
5933 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5934 struct bio *bio;
5935 struct bio *orig_bio = dip->orig_bio;
5936 struct bio_vec *bvec = orig_bio->bi_io_vec;
5937 u64 start_sector = orig_bio->bi_sector;
5938 u64 file_offset = dip->logical_offset;
5939 u64 submit_len = 0;
5940 u64 map_length;
5941 int nr_pages = 0;
5942 u32 *csums = dip->csums;
5943 int ret = 0;
5944 int async_submit = 0;
5945 int write = rw & REQ_WRITE;
5946
5947 map_length = orig_bio->bi_size;
5948 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5949 &map_length, NULL, 0);
5950 if (ret) {
5951 bio_put(orig_bio);
5952 return -EIO;
5953 }
5954
5955 if (map_length >= orig_bio->bi_size) {
5956 bio = orig_bio;
5957 goto submit;
5958 }
5959
5960 async_submit = 1;
5961 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
5962 if (!bio)
5963 return -ENOMEM;
5964 bio->bi_private = dip;
5965 bio->bi_end_io = btrfs_end_dio_bio;
5966 atomic_inc(&dip->pending_bios);
5967
5968 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
5969 if (unlikely(map_length < submit_len + bvec->bv_len ||
5970 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5971 bvec->bv_offset) < bvec->bv_len)) {
5972 /*
 5973 * inc the count before we submit the bio so
 5974 * we know the end IO handler can't fire before
 5975 * the count is raised. Otherwise, the dip might get freed
 5976 * before we're done setting it up
5977 */
5978 atomic_inc(&dip->pending_bios);
5979 ret = __btrfs_submit_dio_bio(bio, inode, rw,
5980 file_offset, skip_sum,
5981 csums, async_submit);
5982 if (ret) {
5983 bio_put(bio);
5984 atomic_dec(&dip->pending_bios);
5985 goto out_err;
5986 }
5987
 5988 /* Writes use the ordered csums */
5989 if (!write && !skip_sum)
5990 csums = csums + nr_pages;
5991 start_sector += submit_len >> 9;
5992 file_offset += submit_len;
5993
5994 submit_len = 0;
5995 nr_pages = 0;
5996
5997 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
5998 start_sector, GFP_NOFS);
5999 if (!bio)
6000 goto out_err;
6001 bio->bi_private = dip;
6002 bio->bi_end_io = btrfs_end_dio_bio;
6003
6004 map_length = orig_bio->bi_size;
6005 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
6006 &map_length, NULL, 0);
6007 if (ret) {
6008 bio_put(bio);
6009 goto out_err;
6010 }
6011 } else {
6012 submit_len += bvec->bv_len;
 6013 nr_pages++;
6014 bvec++;
6015 }
6016 }
6017
6018submit:
6019 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
6020 csums, async_submit);
6021 if (!ret)
6022 return 0;
6023
6024 bio_put(bio);
6025out_err:
6026 dip->errors = 1;
6027 /*
 6028 * before the atomic variable reaches zero, we must
6029 * make sure dip->errors is perceived to be set.
6030 */
6031 smp_mb__before_atomic_dec();
6032 if (atomic_dec_and_test(&dip->pending_bios))
6033 bio_io_error(dip->orig_bio);
6034
6035 /* bio_end_io() will handle error, so we needn't return it */
6036 return 0;
6037}
6038
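
/*
 * Illustrative sketch, not part of the patch: btrfs_submit_direct_hook above
 * splits the original bio whenever the next page would cross what
 * btrfs_map_block reports as contiguous on disk. Simplified userspace
 * analogue of that chunking loop; stripe_len and the sizes are invented:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t stripe_len = 65536;	/* contiguous device extent */
	const uint64_t page = 4096;
	uint64_t total = 20 * page, submitted = 0, chunk = 0;

	while (submitted + chunk < total) {
		if (chunk + page > stripe_len) {
			printf("submit chunk of %llu\n",
			       (unsigned long long)chunk);
			submitted += chunk;	/* start the next bio */
			chunk = 0;
			continue;
		}
		chunk += page;
	}
	if (chunk)
		printf("submit final chunk of %llu\n",
		       (unsigned long long)chunk);
	return 0;
}
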
5642static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, 6039static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5643 loff_t file_offset) 6040 loff_t file_offset)
5644{ 6041{
5645 struct btrfs_root *root = BTRFS_I(inode)->root; 6042 struct btrfs_root *root = BTRFS_I(inode)->root;
5646 struct btrfs_dio_private *dip; 6043 struct btrfs_dio_private *dip;
5647 struct bio_vec *bvec = bio->bi_io_vec; 6044 struct bio_vec *bvec = bio->bi_io_vec;
5648 u64 start;
5649 int skip_sum; 6045 int skip_sum;
5650 int write = rw & REQ_WRITE; 6046 int write = rw & REQ_WRITE;
5651 int ret = 0; 6047 int ret = 0;
@@ -5659,9 +6055,11 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5659 } 6055 }
5660 dip->csums = NULL; 6056 dip->csums = NULL;
5661 6057
 5662 if (!skip_sum) { 6058 /* Writes use the ordered csum stuff, so we don't need dip->csums */
6059 if (!write && !skip_sum) {
5663 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); 6060 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5664 if (!dip->csums) { 6061 if (!dip->csums) {
6062 kfree(dip);
5665 ret = -ENOMEM; 6063 ret = -ENOMEM;
5666 goto free_ordered; 6064 goto free_ordered;
5667 } 6065 }
@@ -5671,7 +6069,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5671 dip->inode = inode; 6069 dip->inode = inode;
5672 dip->logical_offset = file_offset; 6070 dip->logical_offset = file_offset;
5673 6071
5674 start = dip->logical_offset;
5675 dip->bytes = 0; 6072 dip->bytes = 0;
5676 do { 6073 do {
5677 dip->bytes += bvec->bv_len; 6074 dip->bytes += bvec->bv_len;
@@ -5680,36 +6077,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5680 6077
5681 dip->disk_bytenr = (u64)bio->bi_sector << 9; 6078 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5682 bio->bi_private = dip; 6079 bio->bi_private = dip;
6080 dip->errors = 0;
6081 dip->orig_bio = bio;
6082 atomic_set(&dip->pending_bios, 0);
5683 6083
5684 if (write) 6084 if (write)
5685 bio->bi_end_io = btrfs_endio_direct_write; 6085 bio->bi_end_io = btrfs_endio_direct_write;
5686 else 6086 else
5687 bio->bi_end_io = btrfs_endio_direct_read; 6087 bio->bi_end_io = btrfs_endio_direct_read;
5688 6088
5689 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 6089 ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
5690 if (ret) 6090 if (!ret)
5691 goto out_err;
5692
5693 if (write && !skip_sum) {
5694 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5695 inode, rw, bio, 0, 0,
5696 dip->logical_offset,
5697 __btrfs_submit_bio_start_direct_io,
5698 __btrfs_submit_bio_done);
5699 if (ret)
5700 goto out_err;
5701 return; 6091 return;
5702 } else if (!skip_sum)
5703 btrfs_lookup_bio_sums_dio(root, inode, bio,
5704 dip->logical_offset, dip->csums);
5705
5706 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5707 if (ret)
5708 goto out_err;
5709 return;
5710out_err:
5711 kfree(dip->csums);
5712 kfree(dip);
5713free_ordered: 6092free_ordered:
5714 /* 6093 /*
5715 * If this is a write, we need to clean up the reserved space and kill 6094 * If this is a write, we need to clean up the reserved space and kill
@@ -5717,8 +6096,7 @@ free_ordered:
5717 */ 6096 */
5718 if (write) { 6097 if (write) {
5719 struct btrfs_ordered_extent *ordered; 6098 struct btrfs_ordered_extent *ordered;
5720 ordered = btrfs_lookup_ordered_extent(inode, 6099 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
5721 dip->logical_offset);
5722 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 6100 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5723 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 6101 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5724 btrfs_free_reserved_extent(root, ordered->start, 6102 btrfs_free_reserved_extent(root, ordered->start,
@@ -5734,6 +6112,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
5734 unsigned long nr_segs) 6112 unsigned long nr_segs)
5735{ 6113{
5736 int seg; 6114 int seg;
6115 int i;
5737 size_t size; 6116 size_t size;
5738 unsigned long addr; 6117 unsigned long addr;
5739 unsigned blocksize_mask = root->sectorsize - 1; 6118 unsigned blocksize_mask = root->sectorsize - 1;
@@ -5748,8 +6127,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
5748 addr = (unsigned long)iov[seg].iov_base; 6127 addr = (unsigned long)iov[seg].iov_base;
5749 size = iov[seg].iov_len; 6128 size = iov[seg].iov_len;
5750 end += size; 6129 end += size;
5751 if ((addr & blocksize_mask) || (size & blocksize_mask)) 6130 if ((addr & blocksize_mask) || (size & blocksize_mask))
5752 goto out; 6131 goto out;
6132
6133 /* If this is a write we don't need to check anymore */
6134 if (rw & WRITE)
6135 continue;
6136
6137 /*
6138 * Check to make sure we don't have duplicate iov_base's in this
6139 * iovec, if so return EINVAL, otherwise we'll get csum errors
6140 * when reading back.
6141 */
6142 for (i = seg + 1; i < nr_segs; i++) {
6143 if (iov[seg].iov_base == iov[i].iov_base)
6144 goto out;
6145 }
5753 } 6146 }
5754 retval = 0; 6147 retval = 0;
5755out: 6148out:
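
/*
 * Illustrative sketch, not part of the patch: the O(n^2) scan added above
 * rejects read iovecs that alias the same user buffer, since the dio read
 * path would otherwise race against itself and trip csum verification.
 * Standalone, runnable version of the same check:
 */
#include <stdio.h>
#include <sys/uio.h>

static int has_duplicate_base(const struct iovec *iov, unsigned long nr_segs)
{
	unsigned long seg, i;

	for (seg = 0; seg < nr_segs; seg++)
		for (i = seg + 1; i < nr_segs; i++)
			if (iov[seg].iov_base == iov[i].iov_base)
				return 1;
	return 0;
}

int main(void)
{
	char buf[8192];
	struct iovec iov[2] = {
		{ .iov_base = buf, .iov_len = 4096 },
		{ .iov_base = buf, .iov_len = 4096 },	/* aliases seg 0 */
	};

	printf("duplicate: %d\n", has_duplicate_base(iov, 2));
	return 0;
}
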
@@ -5850,7 +6243,7 @@ out:
5850static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6243static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5851 __u64 start, __u64 len) 6244 __u64 start, __u64 len)
5852{ 6245{
5853 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); 6246 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
5854} 6247}
5855 6248
5856int btrfs_readpage(struct file *file, struct page *page) 6249int btrfs_readpage(struct file *file, struct page *page)
@@ -6100,30 +6493,97 @@ out:
6100 return ret; 6493 return ret;
6101} 6494}
6102 6495
6103static void btrfs_truncate(struct inode *inode) 6496static int btrfs_truncate(struct inode *inode)
6104{ 6497{
6105 struct btrfs_root *root = BTRFS_I(inode)->root; 6498 struct btrfs_root *root = BTRFS_I(inode)->root;
6499 struct btrfs_block_rsv *rsv;
6106 int ret; 6500 int ret;
6501 int err = 0;
6107 struct btrfs_trans_handle *trans; 6502 struct btrfs_trans_handle *trans;
6108 unsigned long nr; 6503 unsigned long nr;
6109 u64 mask = root->sectorsize - 1; 6504 u64 mask = root->sectorsize - 1;
6110 6505
6111 if (!S_ISREG(inode->i_mode)) {
6112 WARN_ON(1);
6113 return;
6114 }
6115
6116 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6506 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6117 if (ret) 6507 if (ret)
6118 return; 6508 return ret;
6119 6509
6120 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6510 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
6121 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6511 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
6122 6512
6123 trans = btrfs_start_transaction(root, 0); 6513 /*
6124 BUG_ON(IS_ERR(trans)); 6514 * Yes ladies and gentelment, this is indeed ugly. The fact is we have
6125 btrfs_set_trans_block_group(trans, inode); 6515 * 3 things going on here
6126 trans->block_rsv = root->orphan_block_rsv; 6516 *
6517 * 1) We need to reserve space for our orphan item and the space to
6518 * delete our orphan item. Lord knows we don't want to have a dangling
6519 * orphan item because we didn't reserve space to remove it.
6520 *
6521 * 2) We need to reserve space to update our inode.
6522 *
6523 * 3) We need to have something to cache all the space that is going to
 6524 * be freed up by the truncate operation, but also have some slack
6525 * space reserved in case it uses space during the truncate (thank you
6526 * very much snapshotting).
6527 *
 6528 * And we need these to all be separate. The fact is we can use a lot of
 6529 * space doing the truncate, and we have no earthly idea how much space
 6530 * we will use, so we need the truncate reservation to be separate so it
6531 * doesn't end up using space reserved for updating the inode or
6532 * removing the orphan item. We also need to be able to stop the
6533 * transaction and start a new one, which means we need to be able to
 6534 * update the inode several times, and we have no way of knowing how
 6535 * many times that will be, so we can't just reserve 1 item for the
 6536 * entirety of the operation, so that has to be done separately as well.
6537 * Then there is the orphan item, which does indeed need to be held on
6538 * to for the whole operation, and we need nobody to touch this reserved
6539 * space except the orphan code.
6540 *
6541 * So that leaves us with
6542 *
6543 * 1) root->orphan_block_rsv - for the orphan deletion.
6544 * 2) rsv - for the truncate reservation, which we will steal from the
6545 * transaction reservation.
 6546 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
6547 * updating the inode.
6548 */
6549 rsv = btrfs_alloc_block_rsv(root);
6550 if (!rsv)
6551 return -ENOMEM;
6552 btrfs_add_durable_block_rsv(root->fs_info, rsv);
6553
6554 trans = btrfs_start_transaction(root, 4);
6555 if (IS_ERR(trans)) {
6556 err = PTR_ERR(trans);
6557 goto out;
6558 }
6559
6560 /*
6561 * Reserve space for the truncate process. Truncate should be adding
6562 * space, but if there are snapshots it may end up using space.
6563 */
6564 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6565 BUG_ON(ret);
6566
6567 ret = btrfs_orphan_add(trans, inode);
6568 if (ret) {
6569 btrfs_end_transaction(trans, root);
6570 goto out;
6571 }
6572
6573 nr = trans->blocks_used;
6574 btrfs_end_transaction(trans, root);
6575 btrfs_btree_balance_dirty(root, nr);
6576
6577 /*
6578 * Ok so we've already migrated our bytes over for the truncate, so here
6579 * just reserve the one slot we need for updating the inode.
6580 */
6581 trans = btrfs_start_transaction(root, 1);
6582 if (IS_ERR(trans)) {
6583 err = PTR_ERR(trans);
6584 goto out;
6585 }
6586 trans->block_rsv = rsv;
6127 6587
6128 /* 6588 /*
6129 * setattr is responsible for setting the ordered_data_close flag, 6589 * setattr is responsible for setting the ordered_data_close flag,
@@ -6147,30 +6607,33 @@ static void btrfs_truncate(struct inode *inode)
6147 6607
6148 while (1) { 6608 while (1) {
6149 if (!trans) { 6609 if (!trans) {
6150 trans = btrfs_start_transaction(root, 0); 6610 trans = btrfs_start_transaction(root, 3);
6151 BUG_ON(IS_ERR(trans)); 6611 if (IS_ERR(trans)) {
6152 btrfs_set_trans_block_group(trans, inode); 6612 err = PTR_ERR(trans);
6153 trans->block_rsv = root->orphan_block_rsv; 6613 goto out;
6154 } 6614 }
6155 6615
6156 ret = btrfs_block_rsv_check(trans, root, 6616 ret = btrfs_truncate_reserve_metadata(trans, root,
6157 root->orphan_block_rsv, 0, 5); 6617 rsv);
6158 if (ret) {
6159 BUG_ON(ret != -EAGAIN);
6160 ret = btrfs_commit_transaction(trans, root);
6161 BUG_ON(ret); 6618 BUG_ON(ret);
6162 trans = NULL; 6619
6163 continue; 6620 trans->block_rsv = rsv;
6164 } 6621 }
6165 6622
6166 ret = btrfs_truncate_inode_items(trans, root, inode, 6623 ret = btrfs_truncate_inode_items(trans, root, inode,
6167 inode->i_size, 6624 inode->i_size,
6168 BTRFS_EXTENT_DATA_KEY); 6625 BTRFS_EXTENT_DATA_KEY);
6169 if (ret != -EAGAIN) 6626 if (ret != -EAGAIN) {
6627 err = ret;
6170 break; 6628 break;
6629 }
6171 6630
6631 trans->block_rsv = &root->fs_info->trans_block_rsv;
6172 ret = btrfs_update_inode(trans, root, inode); 6632 ret = btrfs_update_inode(trans, root, inode);
6173 BUG_ON(ret); 6633 if (ret) {
6634 err = ret;
6635 break;
6636 }
6174 6637
6175 nr = trans->blocks_used; 6638 nr = trans->blocks_used;
6176 btrfs_end_transaction(trans, root); 6639 btrfs_end_transaction(trans, root);
@@ -6179,32 +6642,48 @@ static void btrfs_truncate(struct inode *inode)
6179 } 6642 }
6180 6643
6181 if (ret == 0 && inode->i_nlink > 0) { 6644 if (ret == 0 && inode->i_nlink > 0) {
6645 trans->block_rsv = root->orphan_block_rsv;
6182 ret = btrfs_orphan_del(trans, inode); 6646 ret = btrfs_orphan_del(trans, inode);
6183 BUG_ON(ret); 6647 if (ret)
6648 err = ret;
6649 } else if (ret && inode->i_nlink > 0) {
6650 /*
6651 * Failed to do the truncate, remove us from the in memory
6652 * orphan list.
6653 */
6654 ret = btrfs_orphan_del(NULL, inode);
6184 } 6655 }
6185 6656
6657 trans->block_rsv = &root->fs_info->trans_block_rsv;
6186 ret = btrfs_update_inode(trans, root, inode); 6658 ret = btrfs_update_inode(trans, root, inode);
6187 BUG_ON(ret); 6659 if (ret && !err)
6660 err = ret;
6188 6661
6189 nr = trans->blocks_used; 6662 nr = trans->blocks_used;
6190 ret = btrfs_end_transaction_throttle(trans, root); 6663 ret = btrfs_end_transaction_throttle(trans, root);
6191 BUG_ON(ret);
6192 btrfs_btree_balance_dirty(root, nr); 6664 btrfs_btree_balance_dirty(root, nr);
6665
6666out:
6667 btrfs_free_block_rsv(root, rsv);
6668
6669 if (ret && !err)
6670 err = ret;
6671
6672 return err;
6193} 6673}
6194 6674
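
/*
 * Illustrative sketch, not part of the patch: the truncate loop above is a
 * chunked, restartable operation - each pass runs under a fresh transaction
 * and reservation, drops what it can, and -EAGAIN means "come back with a
 * new transaction". Compilable skeleton of that shape; everything here is
 * a stand-in, not a btrfs API:
 */
#include <stdio.h>

#define EAGAIN_SIM 11

static int chunks_left = 3;

static int truncate_one_chunk(void)
{
	/* drop a bounded amount of items, then ask to be called again */
	return --chunks_left > 0 ? -EAGAIN_SIM : 0;
}

int main(void)
{
	int ret;

	do {
		/* new transaction + fresh metadata reservation per pass */
		ret = truncate_one_chunk();
		printf("pass done, ret=%d\n", ret);
	} while (ret == -EAGAIN_SIM);
	return 0;
}
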
6195/* 6675/*
6196 * create a new subvolume directory/inode (helper for the ioctl). 6676 * create a new subvolume directory/inode (helper for the ioctl).
6197 */ 6677 */
6198int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 6678int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
6199 struct btrfs_root *new_root, 6679 struct btrfs_root *new_root, u64 new_dirid)
6200 u64 new_dirid, u64 alloc_hint)
6201{ 6680{
6202 struct inode *inode; 6681 struct inode *inode;
6203 int err; 6682 int err;
6204 u64 index = 0; 6683 u64 index = 0;
6205 6684
6206 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, 6685 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
6207 new_dirid, alloc_hint, S_IFDIR | 0700, &index); 6686 new_dirid, S_IFDIR | 0700, &index);
6208 if (IS_ERR(inode)) 6687 if (IS_ERR(inode))
6209 return PTR_ERR(inode); 6688 return PTR_ERR(inode);
6210 inode->i_op = &btrfs_dir_inode_operations; 6689 inode->i_op = &btrfs_dir_inode_operations;
@@ -6256,19 +6735,21 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6256 ei->index_cnt = (u64)-1; 6735 ei->index_cnt = (u64)-1;
6257 ei->last_unlink_trans = 0; 6736 ei->last_unlink_trans = 0;
6258 6737
6259 spin_lock_init(&ei->accounting_lock);
6260 atomic_set(&ei->outstanding_extents, 0); 6738 atomic_set(&ei->outstanding_extents, 0);
6261 ei->reserved_extents = 0; 6739 atomic_set(&ei->reserved_extents, 0);
6262 6740
6263 ei->ordered_data_close = 0; 6741 ei->ordered_data_close = 0;
6264 ei->orphan_meta_reserved = 0; 6742 ei->orphan_meta_reserved = 0;
6265 ei->dummy_inode = 0; 6743 ei->dummy_inode = 0;
6266 ei->force_compress = 0; 6744 ei->in_defrag = 0;
6745 ei->force_compress = BTRFS_COMPRESS_NONE;
6746
6747 ei->delayed_node = NULL;
6267 6748
6268 inode = &ei->vfs_inode; 6749 inode = &ei->vfs_inode;
6269 extent_map_tree_init(&ei->extent_tree, GFP_NOFS); 6750 extent_map_tree_init(&ei->extent_tree);
6270 extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS); 6751 extent_io_tree_init(&ei->io_tree, &inode->i_data);
6271 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS); 6752 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
6272 mutex_init(&ei->log_mutex); 6753 mutex_init(&ei->log_mutex);
6273 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6754 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6274 INIT_LIST_HEAD(&ei->i_orphan); 6755 INIT_LIST_HEAD(&ei->i_orphan);
@@ -6279,6 +6760,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6279 return inode; 6760 return inode;
6280} 6761}
6281 6762
6763static void btrfs_i_callback(struct rcu_head *head)
6764{
6765 struct inode *inode = container_of(head, struct inode, i_rcu);
6766 INIT_LIST_HEAD(&inode->i_dentry);
6767 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
6768}
6769
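
/*
 * Illustrative sketch, not part of the patch: freeing the inode through
 * call_rcu() lets lock-free readers that still hold a pointer finish
 * before the memory is recycled. Userspace simulation of the shape of
 * that API - queue a callback, run it after the "grace period"; this is
 * not a real RCU implementation and all names are invented:
 */
#include <stdio.h>
#include <stdlib.h>

struct rcu_head_sim {
	struct rcu_head_sim *next;
	void (*func)(struct rcu_head_sim *head);
};

static struct rcu_head_sim *pending;

static void call_rcu_sim(struct rcu_head_sim *head,
			 void (*func)(struct rcu_head_sim *head))
{
	head->func = func;
	head->next = pending;
	pending = head;	/* deferred, not freed inline */
}

static void grace_period_sim(void)
{
	while (pending) {
		struct rcu_head_sim *head = pending;

		pending = head->next;
		head->func(head);
	}
}

struct fake_inode {
	struct rcu_head_sim rcu;	/* first member, so the cast below works */
};

static void free_fake_inode(struct rcu_head_sim *head)
{
	free((struct fake_inode *)head);
	puts("inode freed after grace period");
}

int main(void)
{
	struct fake_inode *inode = malloc(sizeof(*inode));

	if (!inode)
		return 1;
	call_rcu_sim(&inode->rcu, free_fake_inode);
	grace_period_sim();
	return 0;
}
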
6282void btrfs_destroy_inode(struct inode *inode) 6770void btrfs_destroy_inode(struct inode *inode)
6283{ 6771{
6284 struct btrfs_ordered_extent *ordered; 6772 struct btrfs_ordered_extent *ordered;
@@ -6287,7 +6775,7 @@ void btrfs_destroy_inode(struct inode *inode)
6287 WARN_ON(!list_empty(&inode->i_dentry)); 6775 WARN_ON(!list_empty(&inode->i_dentry));
6288 WARN_ON(inode->i_data.nrpages); 6776 WARN_ON(inode->i_data.nrpages);
6289 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); 6777 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6290 WARN_ON(BTRFS_I(inode)->reserved_extents); 6778 WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents));
6291 6779
6292 /* 6780 /*
6293 * This can happen where we create an inode, but somebody else also 6781 * This can happen where we create an inode, but somebody else also
@@ -6310,8 +6798,8 @@ void btrfs_destroy_inode(struct inode *inode)
6310 6798
6311 spin_lock(&root->orphan_lock); 6799 spin_lock(&root->orphan_lock);
6312 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6800 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
6313 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6801 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
6314 inode->i_ino); 6802 (unsigned long long)btrfs_ino(inode));
6315 list_del_init(&BTRFS_I(inode)->i_orphan); 6803 list_del_init(&BTRFS_I(inode)->i_orphan);
6316 } 6804 }
6317 spin_unlock(&root->orphan_lock); 6805 spin_unlock(&root->orphan_lock);
@@ -6333,14 +6821,16 @@ void btrfs_destroy_inode(struct inode *inode)
6333 inode_tree_del(inode); 6821 inode_tree_del(inode);
6334 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 6822 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
6335free: 6823free:
6336 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6824 btrfs_remove_delayed_node(inode);
6825 call_rcu(&inode->i_rcu, btrfs_i_callback);
6337} 6826}
6338 6827
6339int btrfs_drop_inode(struct inode *inode) 6828int btrfs_drop_inode(struct inode *inode)
6340{ 6829{
6341 struct btrfs_root *root = BTRFS_I(inode)->root; 6830 struct btrfs_root *root = BTRFS_I(inode)->root;
6342 6831
6343 if (btrfs_root_refs(&root->root_item) == 0) 6832 if (btrfs_root_refs(&root->root_item) == 0 &&
6833 !is_free_space_inode(root, inode))
6344 return 1; 6834 return 1;
6345 else 6835 else
6346 return generic_drop_inode(inode); 6836 return generic_drop_inode(inode);
@@ -6363,6 +6853,8 @@ void btrfs_destroy_cachep(void)
6363 kmem_cache_destroy(btrfs_transaction_cachep); 6853 kmem_cache_destroy(btrfs_transaction_cachep);
6364 if (btrfs_path_cachep) 6854 if (btrfs_path_cachep)
6365 kmem_cache_destroy(btrfs_path_cachep); 6855 kmem_cache_destroy(btrfs_path_cachep);
6856 if (btrfs_free_space_cachep)
6857 kmem_cache_destroy(btrfs_free_space_cachep);
6366} 6858}
6367 6859
6368int btrfs_init_cachep(void) 6860int btrfs_init_cachep(void)
@@ -6391,6 +6883,12 @@ int btrfs_init_cachep(void)
6391 if (!btrfs_path_cachep) 6883 if (!btrfs_path_cachep)
6392 goto fail; 6884 goto fail;
6393 6885
6886 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
6887 sizeof(struct btrfs_free_space), 0,
6888 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
6889 if (!btrfs_free_space_cachep)
6890 goto fail;
6891
6394 return 0; 6892 return 0;
6395fail: 6893fail:
6396 btrfs_destroy_cachep(); 6894 btrfs_destroy_cachep();
@@ -6409,6 +6907,26 @@ static int btrfs_getattr(struct vfsmount *mnt,
6409 return 0; 6907 return 0;
6410} 6908}
6411 6909
6910/*
6911 * If a file is moved, it will inherit the cow and compression flags of the new
6912 * directory.
6913 */
6914static void fixup_inode_flags(struct inode *dir, struct inode *inode)
6915{
6916 struct btrfs_inode *b_dir = BTRFS_I(dir);
6917 struct btrfs_inode *b_inode = BTRFS_I(inode);
6918
6919 if (b_dir->flags & BTRFS_INODE_NODATACOW)
6920 b_inode->flags |= BTRFS_INODE_NODATACOW;
6921 else
6922 b_inode->flags &= ~BTRFS_INODE_NODATACOW;
6923
6924 if (b_dir->flags & BTRFS_INODE_COMPRESS)
6925 b_inode->flags |= BTRFS_INODE_COMPRESS;
6926 else
6927 b_inode->flags &= ~BTRFS_INODE_COMPRESS;
6928}
6929
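
/*
 * Illustrative sketch, not part of the patch: the per-flag if/else pairs in
 * fixup_inode_flags above are equivalent to masking the inherited bits out
 * of the child and copying them in from the parent. The flag values below
 * are invented for illustration:
 */
#include <stdio.h>

#define FLAG_NODATACOW	0x1u
#define FLAG_COMPRESS	0x2u
#define INHERIT_MASK	(FLAG_NODATACOW | FLAG_COMPRESS)

static unsigned int inherit_flags(unsigned int dir, unsigned int inode)
{
	return (inode & ~INHERIT_MASK) | (dir & INHERIT_MASK);
}

int main(void)
{
	/* child had compress set; moving under a nodatacow dir swaps it */
	printf("%#x\n", inherit_flags(FLAG_NODATACOW, FLAG_COMPRESS));
	return 0;
}
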
6412static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 6930static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6413 struct inode *new_dir, struct dentry *new_dentry) 6931 struct inode *new_dir, struct dentry *new_dentry)
6414{ 6932{
@@ -6421,16 +6939,17 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6421 u64 index = 0; 6939 u64 index = 0;
6422 u64 root_objectid; 6940 u64 root_objectid;
6423 int ret; 6941 int ret;
6942 u64 old_ino = btrfs_ino(old_inode);
6424 6943
6425 if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 6944 if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
6426 return -EPERM; 6945 return -EPERM;
6427 6946
6428 /* we only allow rename subvolume link between subvolumes */ 6947 /* we only allow rename subvolume link between subvolumes */
6429 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 6948 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
6430 return -EXDEV; 6949 return -EXDEV;
6431 6950
6432 if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 6951 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
6433 (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) 6952 (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
6434 return -ENOTEMPTY; 6953 return -ENOTEMPTY;
6435 6954
6436 if (S_ISDIR(old_inode->i_mode) && new_inode && 6955 if (S_ISDIR(old_inode->i_mode) && new_inode &&
@@ -6446,7 +6965,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6446 filemap_flush(old_inode->i_mapping); 6965 filemap_flush(old_inode->i_mapping);
6447 6966
6448 /* close the racy window with snapshot create/destroy ioctl */ 6967 /* close the racy window with snapshot create/destroy ioctl */
6449 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6968 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
6450 down_read(&root->fs_info->subvol_sem); 6969 down_read(&root->fs_info->subvol_sem);
6451 /* 6970 /*
6452 * We want to reserve the absolute worst case amount of items. So if 6971 * We want to reserve the absolute worst case amount of items. So if
@@ -6457,10 +6976,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6457 * should cover the worst case number of items we'll modify. 6976 * should cover the worst case number of items we'll modify.
6458 */ 6977 */
6459 trans = btrfs_start_transaction(root, 20); 6978 trans = btrfs_start_transaction(root, 20);
6460 if (IS_ERR(trans)) 6979 if (IS_ERR(trans)) {
6461 return PTR_ERR(trans); 6980 ret = PTR_ERR(trans);
6462 6981 goto out_notrans;
6463 btrfs_set_trans_block_group(trans, new_dir); 6982 }
6464 6983
6465 if (dest != root) 6984 if (dest != root)
6466 btrfs_record_root_in_trans(trans, dest); 6985 btrfs_record_root_in_trans(trans, dest);
@@ -6469,15 +6988,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6469 if (ret) 6988 if (ret)
6470 goto out_fail; 6989 goto out_fail;
6471 6990
6472 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 6991 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
6473 /* force full log commit if subvolume involved. */ 6992 /* force full log commit if subvolume involved. */
6474 root->fs_info->last_trans_log_full_commit = trans->transid; 6993 root->fs_info->last_trans_log_full_commit = trans->transid;
6475 } else { 6994 } else {
6476 ret = btrfs_insert_inode_ref(trans, dest, 6995 ret = btrfs_insert_inode_ref(trans, dest,
6477 new_dentry->d_name.name, 6996 new_dentry->d_name.name,
6478 new_dentry->d_name.len, 6997 new_dentry->d_name.len,
6479 old_inode->i_ino, 6998 old_ino,
6480 new_dir->i_ino, index); 6999 btrfs_ino(new_dir), index);
6481 if (ret) 7000 if (ret)
6482 goto out_fail; 7001 goto out_fail;
6483 /* 7002 /*
@@ -6493,10 +7012,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6493 * make sure the inode gets flushed if it is replacing 7012 * make sure the inode gets flushed if it is replacing
6494 * something. 7013 * something.
6495 */ 7014 */
6496 if (new_inode && new_inode->i_size && 7015 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
6497 old_inode && S_ISREG(old_inode->i_mode)) {
6498 btrfs_add_ordered_operation(trans, root, old_inode); 7016 btrfs_add_ordered_operation(trans, root, old_inode);
6499 }
6500 7017
6501 old_dir->i_ctime = old_dir->i_mtime = ctime; 7018 old_dir->i_ctime = old_dir->i_mtime = ctime;
6502 new_dir->i_ctime = new_dir->i_mtime = ctime; 7019 new_dir->i_ctime = new_dir->i_mtime = ctime;
@@ -6505,23 +7022,24 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6505 if (old_dentry->d_parent != new_dentry->d_parent) 7022 if (old_dentry->d_parent != new_dentry->d_parent)
6506 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 7023 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
6507 7024
6508 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 7025 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
6509 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 7026 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
6510 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, 7027 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
6511 old_dentry->d_name.name, 7028 old_dentry->d_name.name,
6512 old_dentry->d_name.len); 7029 old_dentry->d_name.len);
6513 } else { 7030 } else {
6514 btrfs_inc_nlink(old_dentry->d_inode); 7031 ret = __btrfs_unlink_inode(trans, root, old_dir,
6515 ret = btrfs_unlink_inode(trans, root, old_dir, 7032 old_dentry->d_inode,
6516 old_dentry->d_inode, 7033 old_dentry->d_name.name,
6517 old_dentry->d_name.name, 7034 old_dentry->d_name.len);
6518 old_dentry->d_name.len); 7035 if (!ret)
7036 ret = btrfs_update_inode(trans, root, old_inode);
6519 } 7037 }
6520 BUG_ON(ret); 7038 BUG_ON(ret);
6521 7039
6522 if (new_inode) { 7040 if (new_inode) {
6523 new_inode->i_ctime = CURRENT_TIME; 7041 new_inode->i_ctime = CURRENT_TIME;
6524 if (unlikely(new_inode->i_ino == 7042 if (unlikely(btrfs_ino(new_inode) ==
6525 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 7043 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
6526 root_objectid = BTRFS_I(new_inode)->location.objectid; 7044 root_objectid = BTRFS_I(new_inode)->location.objectid;
6527 ret = btrfs_unlink_subvol(trans, dest, new_dir, 7045 ret = btrfs_unlink_subvol(trans, dest, new_dir,
@@ -6542,20 +7060,23 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6542 } 7060 }
6543 } 7061 }
6544 7062
7063 fixup_inode_flags(new_dir, old_inode);
7064
6545 ret = btrfs_add_link(trans, new_dir, old_inode, 7065 ret = btrfs_add_link(trans, new_dir, old_inode,
6546 new_dentry->d_name.name, 7066 new_dentry->d_name.name,
6547 new_dentry->d_name.len, 0, index); 7067 new_dentry->d_name.len, 0, index);
6548 BUG_ON(ret); 7068 BUG_ON(ret);
6549 7069
6550 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 7070 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
6551 btrfs_log_new_name(trans, old_inode, old_dir, 7071 struct dentry *parent = dget_parent(new_dentry);
6552 new_dentry->d_parent); 7072 btrfs_log_new_name(trans, old_inode, old_dir, parent);
7073 dput(parent);
6553 btrfs_end_log_trans(root); 7074 btrfs_end_log_trans(root);
6554 } 7075 }
6555out_fail: 7076out_fail:
6556 btrfs_end_transaction_throttle(trans, root); 7077 btrfs_end_transaction_throttle(trans, root);
6557 7078out_notrans:
6558 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 7079 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
6559 up_read(&root->fs_info->subvol_sem); 7080 up_read(&root->fs_info->subvol_sem);
6560 7081
6561 return ret; 7082 return ret;
@@ -6609,38 +7130,6 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
6609 return 0; 7130 return 0;
6610} 7131}
6611 7132
6612int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6613{
6614 struct btrfs_inode *binode;
6615 struct inode *inode = NULL;
6616
6617 spin_lock(&root->fs_info->delalloc_lock);
6618 while (!list_empty(&root->fs_info->delalloc_inodes)) {
6619 binode = list_entry(root->fs_info->delalloc_inodes.next,
6620 struct btrfs_inode, delalloc_inodes);
6621 inode = igrab(&binode->vfs_inode);
6622 if (inode) {
6623 list_move_tail(&binode->delalloc_inodes,
6624 &root->fs_info->delalloc_inodes);
6625 break;
6626 }
6627
6628 list_del_init(&binode->delalloc_inodes);
6629 cond_resched_lock(&root->fs_info->delalloc_lock);
6630 }
6631 spin_unlock(&root->fs_info->delalloc_lock);
6632
6633 if (inode) {
6634 write_inode_now(inode, 0);
6635 if (delay_iput)
6636 btrfs_add_delayed_iput(inode);
6637 else
6638 iput(inode);
6639 return 1;
6640 }
6641 return 0;
6642}
6643
6644static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 7133static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6645 const char *symname) 7134 const char *symname)
6646{ 7135{
@@ -6664,9 +7153,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6664 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 7153 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
6665 return -ENAMETOOLONG; 7154 return -ENAMETOOLONG;
6666 7155
6667 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
6668 if (err)
6669 return err;
6670 /* 7156 /*
6671 * 2 items for inode item and ref 7157 * 2 items for inode item and ref
6672 * 2 items for dir items 7158 * 2 items for dir items
@@ -6676,25 +7162,25 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6676 if (IS_ERR(trans)) 7162 if (IS_ERR(trans))
6677 return PTR_ERR(trans); 7163 return PTR_ERR(trans);
6678 7164
6679 btrfs_set_trans_block_group(trans, dir); 7165 err = btrfs_find_free_ino(root, &objectid);
7166 if (err)
7167 goto out_unlock;
6680 7168
6681 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 7169 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6682 dentry->d_name.len, 7170 dentry->d_name.len, btrfs_ino(dir), objectid,
6683 dentry->d_parent->d_inode->i_ino, objectid, 7171 S_IFLNK|S_IRWXUGO, &index);
6684 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, 7172 if (IS_ERR(inode)) {
6685 &index); 7173 err = PTR_ERR(inode);
6686 err = PTR_ERR(inode);
6687 if (IS_ERR(inode))
6688 goto out_unlock; 7174 goto out_unlock;
7175 }
6689 7176
6690 err = btrfs_init_inode_security(trans, inode, dir); 7177 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6691 if (err) { 7178 if (err) {
6692 drop_inode = 1; 7179 drop_inode = 1;
6693 goto out_unlock; 7180 goto out_unlock;
6694 } 7181 }
6695 7182
6696 btrfs_set_trans_block_group(trans, inode); 7183 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6697 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
6698 if (err) 7184 if (err)
6699 drop_inode = 1; 7185 drop_inode = 1;
6700 else { 7186 else {
@@ -6704,14 +7190,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6704 inode->i_op = &btrfs_file_inode_operations; 7190 inode->i_op = &btrfs_file_inode_operations;
6705 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 7191 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
6706 } 7192 }
6707 btrfs_update_inode_block_group(trans, inode);
6708 btrfs_update_inode_block_group(trans, dir);
6709 if (drop_inode) 7193 if (drop_inode)
6710 goto out_unlock; 7194 goto out_unlock;
6711 7195
6712 path = btrfs_alloc_path(); 7196 path = btrfs_alloc_path();
6713 BUG_ON(!path); 7197 BUG_ON(!path);
6714 key.objectid = inode->i_ino; 7198 key.objectid = btrfs_ino(inode);
6715 key.offset = 0; 7199 key.offset = 0;
6716 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 7200 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
6717 datasize = btrfs_file_extent_calc_inline_size(name_len); 7201 datasize = btrfs_file_extent_calc_inline_size(name_len);
@@ -6719,6 +7203,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6719 datasize); 7203 datasize);
6720 if (err) { 7204 if (err) {
6721 drop_inode = 1; 7205 drop_inode = 1;
7206 btrfs_free_path(path);
6722 goto out_unlock; 7207 goto out_unlock;
6723 } 7208 }
6724 leaf = path->nodes[0]; 7209 leaf = path->nodes[0];
@@ -6757,27 +7242,34 @@ out_unlock:
6757 return err; 7242 return err;
6758} 7243}
6759 7244
6760int btrfs_prealloc_file_range(struct inode *inode, int mode, 7245static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
6761 u64 start, u64 num_bytes, u64 min_size, 7246 u64 start, u64 num_bytes, u64 min_size,
6762 loff_t actual_len, u64 *alloc_hint) 7247 loff_t actual_len, u64 *alloc_hint,
7248 struct btrfs_trans_handle *trans)
6763{ 7249{
6764 struct btrfs_trans_handle *trans;
6765 struct btrfs_root *root = BTRFS_I(inode)->root; 7250 struct btrfs_root *root = BTRFS_I(inode)->root;
6766 struct btrfs_key ins; 7251 struct btrfs_key ins;
6767 u64 cur_offset = start; 7252 u64 cur_offset = start;
7253 u64 i_size;
6768 int ret = 0; 7254 int ret = 0;
7255 bool own_trans = true;
6769 7256
7257 if (trans)
7258 own_trans = false;
6770 while (num_bytes > 0) { 7259 while (num_bytes > 0) {
6771 trans = btrfs_start_transaction(root, 3); 7260 if (own_trans) {
6772 if (IS_ERR(trans)) { 7261 trans = btrfs_start_transaction(root, 3);
6773 ret = PTR_ERR(trans); 7262 if (IS_ERR(trans)) {
6774 break; 7263 ret = PTR_ERR(trans);
7264 break;
7265 }
6775 } 7266 }
6776 7267
6777 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 7268 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
6778 0, *alloc_hint, (u64)-1, &ins, 1); 7269 0, *alloc_hint, (u64)-1, &ins, 1);
6779 if (ret) { 7270 if (ret) {
6780 btrfs_end_transaction(trans, root); 7271 if (own_trans)
7272 btrfs_end_transaction(trans, root);
6781 break; 7273 break;
6782 } 7274 }
6783 7275
@@ -6800,121 +7292,38 @@ int btrfs_prealloc_file_range(struct inode *inode, int mode,
6800 (actual_len > inode->i_size) && 7292 (actual_len > inode->i_size) &&
6801 (cur_offset > inode->i_size)) { 7293 (cur_offset > inode->i_size)) {
6802 if (cur_offset > actual_len) 7294 if (cur_offset > actual_len)
6803 i_size_write(inode, actual_len); 7295 i_size = actual_len;
6804 else 7296 else
6805 i_size_write(inode, cur_offset); 7297 i_size = cur_offset;
6806 i_size_write(inode, cur_offset); 7298 i_size_write(inode, i_size);
6807 btrfs_ordered_update_i_size(inode, cur_offset, NULL); 7299 btrfs_ordered_update_i_size(inode, i_size, NULL);
6808 } 7300 }
6809 7301
6810 ret = btrfs_update_inode(trans, root, inode); 7302 ret = btrfs_update_inode(trans, root, inode);
6811 BUG_ON(ret); 7303 BUG_ON(ret);
6812 7304
6813 btrfs_end_transaction(trans, root); 7305 if (own_trans)
7306 btrfs_end_transaction(trans, root);
6814 } 7307 }
6815 return ret; 7308 return ret;
6816} 7309}
6817 7310
6818static long btrfs_fallocate(struct inode *inode, int mode, 7311int btrfs_prealloc_file_range(struct inode *inode, int mode,
6819 loff_t offset, loff_t len) 7312 u64 start, u64 num_bytes, u64 min_size,
7313 loff_t actual_len, u64 *alloc_hint)
6820{ 7314{
6821 struct extent_state *cached_state = NULL; 7315 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
6822 u64 cur_offset; 7316 min_size, actual_len, alloc_hint,
6823 u64 last_byte; 7317 NULL);
6824 u64 alloc_start; 7318}
6825 u64 alloc_end;
6826 u64 alloc_hint = 0;
6827 u64 locked_end;
6828 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
6829 struct extent_map *em;
6830 int ret;
6831
6832 alloc_start = offset & ~mask;
6833 alloc_end = (offset + len + mask) & ~mask;
6834
6835 /*
6836 * wait for ordered IO before we have any locks. We'll loop again
6837 * below with the locks held.
6838 */
6839 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
6840
6841 mutex_lock(&inode->i_mutex);
6842 if (alloc_start > inode->i_size) {
6843 ret = btrfs_cont_expand(inode, alloc_start);
6844 if (ret)
6845 goto out;
6846 }
6847
6848 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
6849 if (ret)
6850 goto out;
6851
6852 locked_end = alloc_end - 1;
6853 while (1) {
6854 struct btrfs_ordered_extent *ordered;
6855
6856 /* the extent lock is ordered inside the running
6857 * transaction
6858 */
6859 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
6860 locked_end, 0, &cached_state, GFP_NOFS);
6861 ordered = btrfs_lookup_first_ordered_extent(inode,
6862 alloc_end - 1);
6863 if (ordered &&
6864 ordered->file_offset + ordered->len > alloc_start &&
6865 ordered->file_offset < alloc_end) {
6866 btrfs_put_ordered_extent(ordered);
6867 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
6868 alloc_start, locked_end,
6869 &cached_state, GFP_NOFS);
6870 /*
6871 * we can't wait on the range with the transaction
6872 * running or with the extent lock held
6873 */
6874 btrfs_wait_ordered_range(inode, alloc_start,
6875 alloc_end - alloc_start);
6876 } else {
6877 if (ordered)
6878 btrfs_put_ordered_extent(ordered);
6879 break;
6880 }
6881 }
6882
6883 cur_offset = alloc_start;
6884 while (1) {
6885 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
6886 alloc_end - cur_offset, 0);
6887 BUG_ON(IS_ERR(em) || !em);
6888 last_byte = min(extent_map_end(em), alloc_end);
6889 last_byte = (last_byte + mask) & ~mask;
6890 if (em->block_start == EXTENT_MAP_HOLE ||
6891 (cur_offset >= inode->i_size &&
6892 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
6893 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
6894 last_byte - cur_offset,
6895 1 << inode->i_blkbits,
6896 offset + len,
6897 &alloc_hint);
6898 if (ret < 0) {
6899 free_extent_map(em);
6900 break;
6901 }
6902 }
6903 free_extent_map(em);
6904
6905 cur_offset = last_byte;
6906 if (cur_offset >= alloc_end) {
6907 ret = 0;
6908 break;
6909 }
6910 }
6911 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
6912 &cached_state, GFP_NOFS);
6913 7319
6914 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); 7320int btrfs_prealloc_file_range_trans(struct inode *inode,
6915out: 7321 struct btrfs_trans_handle *trans, int mode,
6916 mutex_unlock(&inode->i_mutex); 7322 u64 start, u64 num_bytes, u64 min_size,
6917 return ret; 7323 loff_t actual_len, u64 *alloc_hint)
7324{
7325 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
7326 min_size, actual_len, alloc_hint, trans);
6918} 7327}
6919 7328
6920static int btrfs_set_page_dirty(struct page *page) 7329static int btrfs_set_page_dirty(struct page *page)
@@ -6922,11 +7331,15 @@ static int btrfs_set_page_dirty(struct page *page)
6922 return __set_page_dirty_nobuffers(page); 7331 return __set_page_dirty_nobuffers(page);
6923} 7332}
6924 7333
6925static int btrfs_permission(struct inode *inode, int mask) 7334static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
6926{ 7335{
7336 struct btrfs_root *root = BTRFS_I(inode)->root;
7337
7338 if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
7339 return -EROFS;
6927 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7340 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
6928 return -EACCES; 7341 return -EACCES;
6929 return generic_permission(inode, mask, btrfs_check_acl); 7342 return generic_permission(inode, mask, flags, btrfs_check_acl);
6930} 7343}
6931 7344
6932static const struct inode_operations btrfs_dir_inode_operations = { 7345static const struct inode_operations btrfs_dir_inode_operations = {
@@ -6995,7 +7408,6 @@ static const struct address_space_operations btrfs_aops = {
6995 .writepage = btrfs_writepage, 7408 .writepage = btrfs_writepage,
6996 .writepages = btrfs_writepages, 7409 .writepages = btrfs_writepages,
6997 .readpages = btrfs_readpages, 7410 .readpages = btrfs_readpages,
6998 .sync_page = block_sync_page,
6999 .direct_IO = btrfs_direct_IO, 7411 .direct_IO = btrfs_direct_IO,
7000 .invalidatepage = btrfs_invalidatepage, 7412 .invalidatepage = btrfs_invalidatepage,
7001 .releasepage = btrfs_releasepage, 7413 .releasepage = btrfs_releasepage,
@@ -7011,7 +7423,6 @@ static const struct address_space_operations btrfs_symlink_aops = {
7011}; 7423};
7012 7424
7013static const struct inode_operations btrfs_file_inode_operations = { 7425static const struct inode_operations btrfs_file_inode_operations = {
7014 .truncate = btrfs_truncate,
7015 .getattr = btrfs_getattr, 7426 .getattr = btrfs_getattr,
7016 .setattr = btrfs_setattr, 7427 .setattr = btrfs_setattr,
7017 .setxattr = btrfs_setxattr, 7428 .setxattr = btrfs_setxattr,
@@ -7019,7 +7430,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
7019 .listxattr = btrfs_listxattr, 7430 .listxattr = btrfs_listxattr,
7020 .removexattr = btrfs_removexattr, 7431 .removexattr = btrfs_removexattr,
7021 .permission = btrfs_permission, 7432 .permission = btrfs_permission,
7022 .fallocate = btrfs_fallocate,
7023 .fiemap = btrfs_fiemap, 7433 .fiemap = btrfs_fiemap,
7024}; 7434};
7025static const struct inode_operations btrfs_special_inode_operations = { 7435static const struct inode_operations btrfs_special_inode_operations = {
@@ -7035,6 +7445,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7035 .readlink = generic_readlink, 7445 .readlink = generic_readlink,
7036 .follow_link = page_follow_link_light, 7446 .follow_link = page_follow_link_light,
7037 .put_link = page_put_link, 7447 .put_link = page_put_link,
7448 .getattr = btrfs_getattr,
7038 .permission = btrfs_permission, 7449 .permission = btrfs_permission,
7039 .setxattr = btrfs_setxattr, 7450 .setxattr = btrfs_setxattr,
7040 .getxattr = btrfs_getxattr, 7451 .getxattr = btrfs_getxattr,
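
With .fallocate dropped from btrfs_file_inode_operations (preallocation moves to the VFS file_operations path in this kernel series), userspace still reaches it through the plain fallocate(2) syscall. A minimal sketch, assuming a btrfs mount and the hypothetical path /mnt/btrfs/testfile:

/* Minimal sketch: btrfs preallocation from userspace via fallocate(2).
 * /mnt/btrfs/testfile is a hypothetical path. FALLOC_FL_KEEP_SIZE
 * preallocates past EOF without moving i_size, the case the
 * FALLOC_FL_KEEP_SIZE tests in __btrfs_prealloc_file_range() handle.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("/mnt/btrfs/testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* preallocate 16MB beyond EOF, leaving i_size untouched */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 * 1024 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}
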
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9254b3d58dbe..a3c4751e07db 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -40,6 +40,7 @@
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/blkdev.h>
43#include "compat.h" 44#include "compat.h"
44#include "ctree.h" 45#include "ctree.h"
45#include "disk-io.h" 46#include "disk-io.h"
@@ -49,6 +50,7 @@
49#include "print-tree.h" 50#include "print-tree.h"
50#include "volumes.h" 51#include "volumes.h"
51#include "locking.h" 52#include "locking.h"
53#include "inode-map.h"
52 54
53/* Mask out flags that are inappropriate for the given type of inode. */ 55/* Mask out flags that are inappropriate for the given type of inode. */
54static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 56static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -80,6 +82,13 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
80 iflags |= FS_NOATIME_FL; 82 iflags |= FS_NOATIME_FL;
81 if (flags & BTRFS_INODE_DIRSYNC) 83 if (flags & BTRFS_INODE_DIRSYNC)
82 iflags |= FS_DIRSYNC_FL; 84 iflags |= FS_DIRSYNC_FL;
85 if (flags & BTRFS_INODE_NODATACOW)
86 iflags |= FS_NOCOW_FL;
87
88 if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
89 iflags |= FS_COMPR_FL;
90 else if (flags & BTRFS_INODE_NOCOMPRESS)
91 iflags |= FS_NOCOMP_FL;
83 92
84 return iflags; 93 return iflags;
85} 94}
@@ -138,6 +147,21 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
138 return 0; 147 return 0;
139} 148}
140 149
150static int check_flags(unsigned int flags)
151{
152 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL |
153 FS_NOATIME_FL | FS_NODUMP_FL |
154 FS_SYNC_FL | FS_DIRSYNC_FL |
155 FS_NOCOMP_FL | FS_COMPR_FL |
156 FS_NOCOW_FL))
157 return -EOPNOTSUPP;
158
159 if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
160 return -EINVAL;
161
162 return 0;
163}
164
141static int btrfs_ioctl_setflags(struct file *file, void __user *arg) 165static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
142{ 166{
143 struct inode *inode = file->f_path.dentry->d_inode; 167 struct inode *inode = file->f_path.dentry->d_inode;
@@ -147,15 +171,17 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
147 unsigned int flags, oldflags; 171 unsigned int flags, oldflags;
148 int ret; 172 int ret;
149 173
174 if (btrfs_root_readonly(root))
175 return -EROFS;
176
150 if (copy_from_user(&flags, arg, sizeof(flags))) 177 if (copy_from_user(&flags, arg, sizeof(flags)))
151 return -EFAULT; 178 return -EFAULT;
152 179
153 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ 180 ret = check_flags(flags);
154 FS_NOATIME_FL | FS_NODUMP_FL | \ 181 if (ret)
155 FS_SYNC_FL | FS_DIRSYNC_FL)) 182 return ret;
156 return -EOPNOTSUPP;
157 183
158 if (!is_owner_or_cap(inode)) 184 if (!inode_owner_or_capable(inode))
159 return -EACCES; 185 return -EACCES;
160 186
161 mutex_lock(&inode->i_mutex); 187 mutex_lock(&inode->i_mutex);
@@ -197,10 +223,28 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
197 ip->flags |= BTRFS_INODE_DIRSYNC; 223 ip->flags |= BTRFS_INODE_DIRSYNC;
198 else 224 else
199 ip->flags &= ~BTRFS_INODE_DIRSYNC; 225 ip->flags &= ~BTRFS_INODE_DIRSYNC;
226 if (flags & FS_NOCOW_FL)
227 ip->flags |= BTRFS_INODE_NODATACOW;
228 else
229 ip->flags &= ~BTRFS_INODE_NODATACOW;
200 230
231 /*
232 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
233 * flag may be changed automatically if compression code won't make
234 * things smaller.
235 */
236 if (flags & FS_NOCOMP_FL) {
237 ip->flags &= ~BTRFS_INODE_COMPRESS;
238 ip->flags |= BTRFS_INODE_NOCOMPRESS;
239 } else if (flags & FS_COMPR_FL) {
240 ip->flags |= BTRFS_INODE_COMPRESS;
241 ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
242 } else {
243 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
244 }
201 245
202 trans = btrfs_join_transaction(root, 1); 246 trans = btrfs_join_transaction(root);
203 BUG_ON(!trans); 247 BUG_ON(IS_ERR(trans));
204 248
205 ret = btrfs_update_inode(trans, root, inode); 249 ret = btrfs_update_inode(trans, root, inode);
206 BUG_ON(ret); 250 BUG_ON(ret);
@@ -210,9 +254,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
210 btrfs_end_transaction(trans, root); 254 btrfs_end_transaction(trans, root);
211 255
212 mnt_drop_write(file->f_path.mnt); 256 mnt_drop_write(file->f_path.mnt);
257
258 ret = 0;
213 out_unlock: 259 out_unlock:
214 mutex_unlock(&inode->i_mutex); 260 mutex_unlock(&inode->i_mutex);
215 return 0; 261 return ret;
216} 262}
217 263
218static int btrfs_ioctl_getversion(struct file *file, int __user *arg) 264static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
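
check_flags() above widens the accepted FS_* mask with the NOCOW and compression bits, so the ordinary chattr-style attribute ioctls now reach them. A minimal sketch, assuming a hypothetical target file; NOCOW only influences extents written after the flag is set, so it is typically applied while the file is still empty:

/* Sketch: toggling NOCOW through the generic FS_IOC_GETFLAGS/SETFLAGS
 * ioctls handled by btrfs_ioctl_setflags() above. Path is hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

#ifndef FS_NOCOW_FL
#define FS_NOCOW_FL 0x00800000	/* matches the flag checked above */
#endif

int main(void)
{
	int flags;
	int fd = open("/mnt/btrfs/nocow.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
		perror("getflags");
		return 1;
	}
	flags |= FS_NOCOW_FL;	/* maps to BTRFS_INODE_NODATACOW above */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
		perror("setflags");
	close(fd);
	return 0;
}
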
@@ -222,9 +268,54 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
222 return put_user(inode->i_generation, arg); 268 return put_user(inode->i_generation, arg);
223} 269}
224 270
271static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
272{
273 struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
274 struct btrfs_fs_info *fs_info = root->fs_info;
275 struct btrfs_device *device;
276 struct request_queue *q;
277 struct fstrim_range range;
278 u64 minlen = ULLONG_MAX;
279 u64 num_devices = 0;
280 int ret;
281
282 if (!capable(CAP_SYS_ADMIN))
283 return -EPERM;
284
285 rcu_read_lock();
286 list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
287 dev_list) {
288 if (!device->bdev)
289 continue;
290 q = bdev_get_queue(device->bdev);
291 if (blk_queue_discard(q)) {
292 num_devices++;
293 minlen = min((u64)q->limits.discard_granularity,
294 minlen);
295 }
296 }
297 rcu_read_unlock();
298 if (!num_devices)
299 return -EOPNOTSUPP;
300
301 if (copy_from_user(&range, arg, sizeof(range)))
302 return -EFAULT;
303
304 range.minlen = max(range.minlen, minlen);
305 ret = btrfs_trim_fs(root, &range);
306 if (ret < 0)
307 return ret;
308
309 if (copy_to_user(arg, &range, sizeof(range)))
310 return -EFAULT;
311
312 return 0;
313}
314
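
btrfs_ioctl_fitrim() clamps the caller's minlen to the coarsest discard granularity among the discard-capable devices, then forwards the range to btrfs_trim_fs(). A sketch of the userspace side, assuming FITRIM and struct fstrim_range from <linux/fs.h> and a hypothetical mount point:

/* Sketch: issuing FITRIM against a btrfs mount, as serviced by
 * btrfs_ioctl_fitrim() above. On return, range.len holds the number
 * of bytes actually trimmed.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>	/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start = 0,
		.len = (__u64)-1,	/* whole filesystem */
		.minlen = 0,		/* kernel raises this to the
					 * device discard granularity */
	};
	int fd = open("/mnt/btrfs", O_RDONLY);

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}
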
225static noinline int create_subvol(struct btrfs_root *root, 315static noinline int create_subvol(struct btrfs_root *root,
226 struct dentry *dentry, 316 struct dentry *dentry,
227 char *name, int namelen) 317 char *name, int namelen,
318 u64 *async_transid)
228{ 319{
229 struct btrfs_trans_handle *trans; 320 struct btrfs_trans_handle *trans;
230 struct btrfs_key key; 321 struct btrfs_key key;
@@ -232,17 +323,22 @@ static noinline int create_subvol(struct btrfs_root *root,
232 struct btrfs_inode_item *inode_item; 323 struct btrfs_inode_item *inode_item;
233 struct extent_buffer *leaf; 324 struct extent_buffer *leaf;
234 struct btrfs_root *new_root; 325 struct btrfs_root *new_root;
235 struct inode *dir = dentry->d_parent->d_inode; 326 struct dentry *parent = dget_parent(dentry);
327 struct inode *dir;
236 int ret; 328 int ret;
237 int err; 329 int err;
238 u64 objectid; 330 u64 objectid;
239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 331 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
240 u64 index = 0; 332 u64 index = 0;
241 333
242 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, 334 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
243 0, &objectid); 335 if (ret) {
244 if (ret) 336 dput(parent);
245 return ret; 337 return ret;
338 }
339
340 dir = parent->d_inode;
341
246 /* 342 /*
247 * 1 - inode item 343 * 1 - inode item
248 * 2 - refs 344 * 2 - refs
@@ -250,8 +346,10 @@ static noinline int create_subvol(struct btrfs_root *root,
250 * 2 - dir items 346 * 2 - dir items
251 */ 347 */
252 trans = btrfs_start_transaction(root, 6); 348 trans = btrfs_start_transaction(root, 6);
253 if (IS_ERR(trans)) 349 if (IS_ERR(trans)) {
350 dput(parent);
254 return PTR_ERR(trans); 351 return PTR_ERR(trans);
352 }
255 353
256 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 354 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
257 0, objectid, NULL, 0, 0, 0); 355 0, objectid, NULL, 0, 0, 0);
@@ -282,6 +380,10 @@ static noinline int create_subvol(struct btrfs_root *root,
282 inode_item->nbytes = cpu_to_le64(root->leafsize); 380 inode_item->nbytes = cpu_to_le64(root->leafsize);
283 inode_item->mode = cpu_to_le32(S_IFDIR | 0755); 381 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
284 382
383 root_item.flags = 0;
384 root_item.byte_limit = 0;
385 inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT);
386
285 btrfs_set_root_bytenr(&root_item, leaf->start); 387 btrfs_set_root_bytenr(&root_item, leaf->start);
286 btrfs_set_root_generation(&root_item, trans->transid); 388 btrfs_set_root_generation(&root_item, trans->transid);
287 btrfs_set_root_level(&root_item, 0); 389 btrfs_set_root_level(&root_item, 0);
@@ -312,8 +414,7 @@ static noinline int create_subvol(struct btrfs_root *root,
312 414
313 btrfs_record_root_in_trans(trans, new_root); 415 btrfs_record_root_in_trans(trans, new_root);
314 416
315 ret = btrfs_create_subvol_root(trans, new_root, new_dirid, 417 ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
316 BTRFS_I(dir)->block_group);
317 /* 418 /*
318 * insert the directory item 419 * insert the directory item
319 */ 420 */
@@ -321,7 +422,7 @@ static noinline int create_subvol(struct btrfs_root *root,
321 BUG_ON(ret); 422 BUG_ON(ret);
322 423
323 ret = btrfs_insert_dir_item(trans, root, 424 ret = btrfs_insert_dir_item(trans, root,
324 name, namelen, dir->i_ino, &key, 425 name, namelen, dir, &key,
325 BTRFS_FT_DIR, index); 426 BTRFS_FT_DIR, index);
326 if (ret) 427 if (ret)
327 goto fail; 428 goto fail;
@@ -332,21 +433,30 @@ static noinline int create_subvol(struct btrfs_root *root,
332 433
333 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 434 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
334 objectid, root->root_key.objectid, 435 objectid, root->root_key.objectid,
335 dir->i_ino, index, name, namelen); 436 btrfs_ino(dir), index, name, namelen);
336 437
337 BUG_ON(ret); 438 BUG_ON(ret);
338 439
339 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 440 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
340fail: 441fail:
341 err = btrfs_commit_transaction(trans, root); 442 dput(parent);
443 if (async_transid) {
444 *async_transid = trans->transid;
445 err = btrfs_commit_transaction_async(trans, root, 1);
446 } else {
447 err = btrfs_commit_transaction(trans, root);
448 }
342 if (err && !ret) 449 if (err && !ret)
343 ret = err; 450 ret = err;
344 return ret; 451 return ret;
345} 452}
346 453
347static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) 454static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
455 char *name, int namelen, u64 *async_transid,
456 bool readonly)
348{ 457{
349 struct inode *inode; 458 struct inode *inode;
459 struct dentry *parent;
350 struct btrfs_pending_snapshot *pending_snapshot; 460 struct btrfs_pending_snapshot *pending_snapshot;
351 struct btrfs_trans_handle *trans; 461 struct btrfs_trans_handle *trans;
352 int ret; 462 int ret;
@@ -361,6 +471,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
361 btrfs_init_block_rsv(&pending_snapshot->block_rsv); 471 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
362 pending_snapshot->dentry = dentry; 472 pending_snapshot->dentry = dentry;
363 pending_snapshot->root = root; 473 pending_snapshot->root = root;
474 pending_snapshot->readonly = readonly;
364 475
365 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 476 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
366 if (IS_ERR(trans)) { 477 if (IS_ERR(trans)) {
@@ -371,18 +482,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
371 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); 482 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
372 BUG_ON(ret); 483 BUG_ON(ret);
373 484
485 spin_lock(&root->fs_info->trans_lock);
374 list_add(&pending_snapshot->list, 486 list_add(&pending_snapshot->list,
375 &trans->transaction->pending_snapshots); 487 &trans->transaction->pending_snapshots);
376 ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); 488 spin_unlock(&root->fs_info->trans_lock);
489 if (async_transid) {
490 *async_transid = trans->transid;
491 ret = btrfs_commit_transaction_async(trans,
492 root->fs_info->extent_root, 1);
493 } else {
494 ret = btrfs_commit_transaction(trans,
495 root->fs_info->extent_root);
496 }
377 BUG_ON(ret); 497 BUG_ON(ret);
378 498
379 ret = pending_snapshot->error; 499 ret = pending_snapshot->error;
380 if (ret) 500 if (ret)
381 goto fail; 501 goto fail;
382 502
383 btrfs_orphan_cleanup(pending_snapshot->snap); 503 ret = btrfs_orphan_cleanup(pending_snapshot->snap);
504 if (ret)
505 goto fail;
384 506
385 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 507 parent = dget_parent(dentry);
508 inode = btrfs_lookup_dentry(parent->d_inode, dentry);
509 dput(parent);
386 if (IS_ERR(inode)) { 510 if (IS_ERR(inode)) {
387 ret = PTR_ERR(inode); 511 ret = PTR_ERR(inode);
388 goto fail; 512 goto fail;
@@ -395,6 +519,76 @@ fail:
395 return ret; 519 return ret;
396} 520}
397 521
522/* copy of check_sticky in fs/namei.c
523 * It's inline, so the penalty for filesystems that don't use the
524 * sticky bit is minimal.
525 */
526static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
527{
528 uid_t fsuid = current_fsuid();
529
530 if (!(dir->i_mode & S_ISVTX))
531 return 0;
532 if (inode->i_uid == fsuid)
533 return 0;
534 if (dir->i_uid == fsuid)
535 return 0;
536 return !capable(CAP_FOWNER);
537}
538
539/* copy of may_delete in fs/namei.c
540 * Check whether we can remove a link victim from directory dir, check
541 * whether the type of victim is right.
542 * 1. We can't do it if dir is read-only (done in permission())
543 * 2. We should have write and exec permissions on dir
544 * 3. We can't remove anything from append-only dir
545 * 4. We can't do anything with immutable dir (done in permission())
546 * 5. If the sticky bit on dir is set we should either
547 * a. be owner of dir, or
548 * b. be owner of victim, or
549 * c. have CAP_FOWNER capability
550 * 6. If the victim is append-only or immutable we can't do anything with
551 * links pointing to it.
552 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
553 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
554 * 9. We can't remove a root or mountpoint.
555 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
556 * nfs_async_unlink().
557 */
558
559static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
560{
561 int error;
562
563 if (!victim->d_inode)
564 return -ENOENT;
565
566 BUG_ON(victim->d_parent->d_inode != dir);
567 audit_inode_child(victim, dir);
568
569 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
570 if (error)
571 return error;
572 if (IS_APPEND(dir))
573 return -EPERM;
574 if (btrfs_check_sticky(dir, victim->d_inode) ||
575 IS_APPEND(victim->d_inode) ||
576 IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
577 return -EPERM;
578 if (isdir) {
579 if (!S_ISDIR(victim->d_inode->i_mode))
580 return -ENOTDIR;
581 if (IS_ROOT(victim))
582 return -EBUSY;
583 } else if (S_ISDIR(victim->d_inode->i_mode))
584 return -EISDIR;
585 if (IS_DEADDIR(dir))
586 return -ENOENT;
587 if (victim->d_flags & DCACHE_NFSFS_RENAMED)
588 return -EBUSY;
589 return 0;
590}
591
398/* copy of may_create in fs/namei.c */ 592/* copy of may_create in fs/namei.c */
399static inline int btrfs_may_create(struct inode *dir, struct dentry *child) 593static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
400{ 594{
@@ -412,7 +606,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
412 */ 606 */
413static noinline int btrfs_mksubvol(struct path *parent, 607static noinline int btrfs_mksubvol(struct path *parent,
414 char *name, int namelen, 608 char *name, int namelen,
415 struct btrfs_root *snap_src) 609 struct btrfs_root *snap_src,
610 u64 *async_transid, bool readonly)
416{ 611{
417 struct inode *dir = parent->dentry->d_inode; 612 struct inode *dir = parent->dentry->d_inode;
418 struct dentry *dentry; 613 struct dentry *dentry;
@@ -443,10 +638,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
443 goto out_up_read; 638 goto out_up_read;
444 639
445 if (snap_src) { 640 if (snap_src) {
446 error = create_snapshot(snap_src, dentry); 641 error = create_snapshot(snap_src, dentry,
642 name, namelen, async_transid, readonly);
447 } else { 643 } else {
448 error = create_subvol(BTRFS_I(dir)->root, dentry, 644 error = create_subvol(BTRFS_I(dir)->root, dentry,
449 name, namelen); 645 name, namelen, async_transid);
450 } 646 }
451 if (!error) 647 if (!error)
452 fsnotify_mkdir(dir, dentry); 648 fsnotify_mkdir(dir, dentry);
@@ -461,6 +657,107 @@ out_unlock:
461 return error; 657 return error;
462} 658}
463 659
660/*
661 * When we're defragging a range, we don't want to kick it off again
662 * if it is really just waiting for delalloc to send it down.
663 * If we find a nice big extent or delalloc range for the bytes in the
664 * file you want to defrag, we return 0 to let you know to skip this
665 * part of the file.
666 */
667static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
668{
669 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
670 struct extent_map *em = NULL;
671 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
672 u64 end;
673
674 read_lock(&em_tree->lock);
675 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
676 read_unlock(&em_tree->lock);
677
678 if (em) {
679 end = extent_map_end(em);
680 free_extent_map(em);
681 if (end - offset > thresh)
682 return 0;
683 }
684 /* if we already have a nice delalloc here, just stop */
685 thresh /= 2;
686 end = count_range_bits(io_tree, &offset, offset + thresh,
687 thresh, EXTENT_DELALLOC, 1);
688 if (end >= thresh)
689 return 0;
690 return 1;
691}
692
693/*
694 * helper function to walk through a file and find extents
695 * newer than a specific transid, and smaller than thresh.
696 *
697 * This is used by the defragging code to find new and small
698 * extents
699 */
700static int find_new_extents(struct btrfs_root *root,
701 struct inode *inode, u64 newer_than,
702 u64 *off, int thresh)
703{
704 struct btrfs_path *path;
705 struct btrfs_key min_key;
706 struct btrfs_key max_key;
707 struct extent_buffer *leaf;
708 struct btrfs_file_extent_item *extent;
709 int type;
710 int ret;
711 u64 ino = btrfs_ino(inode);
712
713 path = btrfs_alloc_path();
714 if (!path)
715 return -ENOMEM;
716
717 min_key.objectid = ino;
718 min_key.type = BTRFS_EXTENT_DATA_KEY;
719 min_key.offset = *off;
720
721 max_key.objectid = ino;
722 max_key.type = (u8)-1;
723 max_key.offset = (u64)-1;
724
725 path->keep_locks = 1;
726
727 while (1) {
728 ret = btrfs_search_forward(root, &min_key, &max_key,
729 path, 0, newer_than);
730 if (ret != 0)
731 goto none;
732 if (min_key.objectid != ino)
733 goto none;
734 if (min_key.type != BTRFS_EXTENT_DATA_KEY)
735 goto none;
736
737 leaf = path->nodes[0];
738 extent = btrfs_item_ptr(leaf, path->slots[0],
739 struct btrfs_file_extent_item);
740
741 type = btrfs_file_extent_type(leaf, extent);
742 if (type == BTRFS_FILE_EXTENT_REG &&
743 btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
744 check_defrag_in_cache(inode, min_key.offset, thresh)) {
745 *off = min_key.offset;
746 btrfs_free_path(path);
747 return 0;
748 }
749
750 if (min_key.offset == (u64)-1)
751 goto none;
752
753 min_key.offset++;
754 btrfs_release_path(path);
755 }
756none:
757 btrfs_free_path(path);
758 return -ENOENT;
759}
760
464static int should_defrag_range(struct inode *inode, u64 start, u64 len, 761static int should_defrag_range(struct inode *inode, u64 start, u64 len,
465 int thresh, u64 *last_len, u64 *skip, 762 int thresh, u64 *last_len, u64 *skip,
466 u64 *defrag_end) 763 u64 *defrag_end)
@@ -470,10 +767,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
470 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 767 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
471 int ret = 1; 768 int ret = 1;
472 769
473
474 if (thresh == 0)
475 thresh = 256 * 1024;
476
477 /* 770 /*
478 * make sure that once we start defragging an extent, we keep on 771
479 * defragging it 772 * defragging it
@@ -532,28 +825,208 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
532 return ret; 825 return ret;
533} 826}
534 827
535static int btrfs_defrag_file(struct file *file, 828/*
536 struct btrfs_ioctl_defrag_range_args *range) 829 * it doesn't do much good to defrag one or two pages
830 * at a time. This pulls in a nice chunk of pages
831 * to COW and defrag.
832 *
833 * It also makes sure the delalloc code has enough
834 * dirty data to avoid making new small extents as part
835 * of the defrag
836 *
837 * It's a good idea to start RA on this range
838 * before calling this.
839 */
840static int cluster_pages_for_defrag(struct inode *inode,
841 struct page **pages,
842 unsigned long start_index,
843 int num_pages)
537{ 844{
538 struct inode *inode = fdentry(file)->d_inode; 845 unsigned long file_end;
539 struct btrfs_root *root = BTRFS_I(inode)->root; 846 u64 isize = i_size_read(inode);
540 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
541 struct btrfs_ordered_extent *ordered;
542 struct page *page;
543 unsigned long last_index;
544 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
545 unsigned long total_read = 0;
546 u64 page_start; 847 u64 page_start;
547 u64 page_end; 848 u64 page_end;
849 int ret;
850 int i;
851 int i_done;
852 struct btrfs_ordered_extent *ordered;
853 struct extent_state *cached_state = NULL;
854
855 if (isize == 0)
856 return 0;
857 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
858
859 ret = btrfs_delalloc_reserve_space(inode,
860 num_pages << PAGE_CACHE_SHIFT);
861 if (ret)
862 return ret;
863again:
864 ret = 0;
865 i_done = 0;
866
867 /* step one, lock all the pages */
868 for (i = 0; i < num_pages; i++) {
869 struct page *page;
870 page = grab_cache_page(inode->i_mapping,
871 start_index + i);
872 if (!page)
873 break;
874
875 if (!PageUptodate(page)) {
876 btrfs_readpage(NULL, page);
877 lock_page(page);
878 if (!PageUptodate(page)) {
879 unlock_page(page);
880 page_cache_release(page);
881 ret = -EIO;
882 break;
883 }
884 }
885 isize = i_size_read(inode);
886 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
887 if (!isize || page->index > file_end ||
888 page->mapping != inode->i_mapping) {
889 /* whoops, we blew past eof, skip this page */
890 unlock_page(page);
891 page_cache_release(page);
892 break;
893 }
894 pages[i] = page;
895 i_done++;
896 }
897 if (!i_done || ret)
898 goto out;
899
900 if (!(inode->i_sb->s_flags & MS_ACTIVE))
901 goto out;
902
903 /*
904 * so now we have a nice long stream of locked
905 * and up to date pages, lets wait on them
906 */
907 for (i = 0; i < i_done; i++)
908 wait_on_page_writeback(pages[i]);
909
910 page_start = page_offset(pages[0]);
911 page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
912
913 lock_extent_bits(&BTRFS_I(inode)->io_tree,
914 page_start, page_end - 1, 0, &cached_state,
915 GFP_NOFS);
916 ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1);
917 if (ordered &&
918 ordered->file_offset + ordered->len > page_start &&
919 ordered->file_offset < page_end) {
920 btrfs_put_ordered_extent(ordered);
921 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
922 page_start, page_end - 1,
923 &cached_state, GFP_NOFS);
924 for (i = 0; i < i_done; i++) {
925 unlock_page(pages[i]);
926 page_cache_release(pages[i]);
927 }
928 btrfs_wait_ordered_range(inode, page_start,
929 page_end - page_start);
930 goto again;
931 }
932 if (ordered)
933 btrfs_put_ordered_extent(ordered);
934
935 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
936 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
937 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
938 GFP_NOFS);
939
940 if (i_done != num_pages) {
941 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
942 btrfs_delalloc_release_space(inode,
943 (num_pages - i_done) << PAGE_CACHE_SHIFT);
944 }
945
946
947 btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
948 &cached_state);
949
950 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
951 page_start, page_end - 1, &cached_state,
952 GFP_NOFS);
953
954 for (i = 0; i < i_done; i++) {
955 clear_page_dirty_for_io(pages[i]);
956 ClearPageChecked(pages[i]);
957 set_page_extent_mapped(pages[i]);
958 set_page_dirty(pages[i]);
959 unlock_page(pages[i]);
960 page_cache_release(pages[i]);
961 }
962 return i_done;
963out:
964 for (i = 0; i < i_done; i++) {
965 unlock_page(pages[i]);
966 page_cache_release(pages[i]);
967 }
968 btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT);
969 return ret;
970
971}
972
973int btrfs_defrag_file(struct inode *inode, struct file *file,
974 struct btrfs_ioctl_defrag_range_args *range,
975 u64 newer_than, unsigned long max_to_defrag)
976{
977 struct btrfs_root *root = BTRFS_I(inode)->root;
978 struct btrfs_super_block *disk_super;
979 struct file_ra_state *ra = NULL;
980 unsigned long last_index;
981 u64 features;
548 u64 last_len = 0; 982 u64 last_len = 0;
549 u64 skip = 0; 983 u64 skip = 0;
550 u64 defrag_end = 0; 984 u64 defrag_end = 0;
985 u64 newer_off = range->start;
986 int newer_left = 0;
551 unsigned long i; 987 unsigned long i;
552 int ret; 988 int ret;
989 int defrag_count = 0;
990 int compress_type = BTRFS_COMPRESS_ZLIB;
991 int extent_thresh = range->extent_thresh;
992 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
993 u64 new_align = ~((u64)128 * 1024 - 1);
994 struct page **pages = NULL;
995
996 if (extent_thresh == 0)
997 extent_thresh = 256 * 1024;
998
999 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
1000 if (range->compress_type > BTRFS_COMPRESS_TYPES)
1001 return -EINVAL;
1002 if (range->compress_type)
1003 compress_type = range->compress_type;
1004 }
553 1005
554 if (inode->i_size == 0) 1006 if (inode->i_size == 0)
555 return 0; 1007 return 0;
556 1008
1009 /*
1010 * if we were not given a file, allocate a readahead
1011 * context
1012 */
1013 if (!file) {
1014 ra = kzalloc(sizeof(*ra), GFP_NOFS);
1015 if (!ra)
1016 return -ENOMEM;
1017 file_ra_state_init(ra, inode->i_mapping);
1018 } else {
1019 ra = &file->f_ra;
1020 }
1021
1022 pages = kmalloc(sizeof(struct page *) * newer_cluster,
1023 GFP_NOFS);
1024 if (!pages) {
1025 ret = -ENOMEM;
1026 goto out_ra;
1027 }
1028
1029 /* find the last page to defrag */
557 if (range->start + range->len > range->start) { 1030 if (range->start + range->len > range->start) {
558 last_index = min_t(u64, inode->i_size - 1, 1031 last_index = min_t(u64, inode->i_size - 1,
559 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1032 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
@@ -561,11 +1034,37 @@ static int btrfs_defrag_file(struct file *file,
561 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1034 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
562 } 1035 }
563 1036
564 i = range->start >> PAGE_CACHE_SHIFT; 1037 if (newer_than) {
565 while (i <= last_index) { 1038 ret = find_new_extents(root, inode, newer_than,
566 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, 1039 &newer_off, 64 * 1024);
1040 if (!ret) {
1041 range->start = newer_off;
1042 /*
1043 * we always align our defrag to help keep
1044 * the extents in the file evenly spaced
1045 */
1046 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1047 newer_left = newer_cluster;
1048 } else
1049 goto out_ra;
1050 } else {
1051 i = range->start >> PAGE_CACHE_SHIFT;
1052 }
1053 if (!max_to_defrag)
1054 max_to_defrag = last_index - 1;
1055
1056 while (i <= last_index && defrag_count < max_to_defrag) {
1057 /*
1058 * make sure we stop running if someone unmounts
1059 * the FS
1060 */
1061 if (!(inode->i_sb->s_flags & MS_ACTIVE))
1062 break;
1063
1064 if (!newer_than &&
1065 !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
567 PAGE_CACHE_SIZE, 1066 PAGE_CACHE_SIZE,
568 range->extent_thresh, 1067 extent_thresh,
569 &last_len, &skip, 1068 &last_len, &skip,
570 &defrag_end)) { 1069 &defrag_end)) {
571 unsigned long next; 1070 unsigned long next;
@@ -577,92 +1076,39 @@ static int btrfs_defrag_file(struct file *file,
577 i = max(i + 1, next); 1076 i = max(i + 1, next);
578 continue; 1077 continue;
579 } 1078 }
580
581 if (total_read % ra_pages == 0) {
582 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
583 min(last_index, i + ra_pages - 1));
584 }
585 total_read++;
586 mutex_lock(&inode->i_mutex);
587 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1079 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
588 BTRFS_I(inode)->force_compress = 1; 1080 BTRFS_I(inode)->force_compress = compress_type;
589 1081
590 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 1082 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
591 if (ret)
592 goto err_unlock;
593again:
594 if (inode->i_size == 0 ||
595 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
596 ret = 0;
597 goto err_reservations;
598 }
599 1083
600 page = grab_cache_page(inode->i_mapping, i); 1084 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
601 if (!page) { 1085 if (ret < 0)
602 ret = -ENOMEM; 1086 goto out_ra;
603 goto err_reservations;
604 }
605
606 if (!PageUptodate(page)) {
607 btrfs_readpage(NULL, page);
608 lock_page(page);
609 if (!PageUptodate(page)) {
610 unlock_page(page);
611 page_cache_release(page);
612 ret = -EIO;
613 goto err_reservations;
614 }
615 }
616
617 if (page->mapping != inode->i_mapping) {
618 unlock_page(page);
619 page_cache_release(page);
620 goto again;
621 }
622
623 wait_on_page_writeback(page);
624 1087
625 if (PageDirty(page)) { 1088 defrag_count += ret;
626 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 1089 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
627 goto loop_unlock; 1090 i += ret;
628 }
629 1091
630 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 1092 if (newer_than) {
631 page_end = page_start + PAGE_CACHE_SIZE - 1; 1093 if (newer_off == (u64)-1)
632 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 1094 break;
633 1095
634 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1096 newer_off = max(newer_off + 1,
635 if (ordered) { 1097 (u64)i << PAGE_CACHE_SHIFT);
636 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 1098
637 unlock_page(page); 1099 ret = find_new_extents(root, inode,
638 page_cache_release(page); 1100 newer_than, &newer_off,
639 btrfs_start_ordered_extent(inode, ordered, 1); 1101 64 * 1024);
640 btrfs_put_ordered_extent(ordered); 1102 if (!ret) {
641 goto again; 1103 range->start = newer_off;
1104 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1105 newer_left = newer_cluster;
1106 } else {
1107 break;
1108 }
1109 } else {
1110 i++;
642 } 1111 }
643 set_page_extent_mapped(page);
644
645 /*
646 * this makes sure page_mkwrite is called on the
647 * page if it is dirtied again later
648 */
649 clear_page_dirty_for_io(page);
650 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
651 page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
652 EXTENT_DO_ACCOUNTING, GFP_NOFS);
653
654 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
655 ClearPageChecked(page);
656 set_page_dirty(page);
657 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
658
659loop_unlock:
660 unlock_page(page);
661 page_cache_release(page);
662 mutex_unlock(&inode->i_mutex);
663
664 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
665 i++;
666 } 1112 }
667 1113
668 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) 1114 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
@@ -683,16 +1129,25 @@ loop_unlock:
683 atomic_dec(&root->fs_info->async_submit_draining); 1129 atomic_dec(&root->fs_info->async_submit_draining);
684 1130
685 mutex_lock(&inode->i_mutex); 1131 mutex_lock(&inode->i_mutex);
686 BTRFS_I(inode)->force_compress = 0; 1132 BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
687 mutex_unlock(&inode->i_mutex); 1133 mutex_unlock(&inode->i_mutex);
688 } 1134 }
689 1135
690 return 0; 1136 disk_super = &root->fs_info->super_copy;
1137 features = btrfs_super_incompat_flags(disk_super);
1138 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1139 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1140 btrfs_set_super_incompat_flags(disk_super, features);
1141 }
691 1142
692err_reservations: 1143 if (!file)
693 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 1144 kfree(ra);
694err_unlock: 1145 return defrag_count;
695 mutex_unlock(&inode->i_mutex); 1146
1147out_ra:
1148 if (!file)
1149 kfree(ra);
1150 kfree(pages);
696 return ret; 1151 return ret;
697} 1152}
698 1153
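
From userspace the reworked defrag path is driven through BTRFS_IOC_DEFRAG_RANGE. A sketch assuming the ioctl number and btrfs_ioctl_defrag_range_args layout from this series' ioctl.h and a hypothetical target file; compress_type selects the LZO path that also sets the incompat feature bit above:

/* Sketch: requesting defrag-with-compression, which ends up in
 * btrfs_defrag_file() above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC		0x94
#define BTRFS_DEFRAG_RANGE_COMPRESS	1
#define BTRFS_COMPRESS_LZO		2

struct btrfs_ioctl_defrag_range_args {
	__u64 start;		/* start of the defrag operation */
	__u64 len;		/* bytes to defrag, (u64)-1 for "to EOF" */
	__u64 flags;
	__u32 extent_thresh;	/* 0 means the 256K default above */
	__u32 compress_type;
	__u32 unused[4];
};
#define BTRFS_IOC_DEFRAG_RANGE \
	_IOW(BTRFS_IOCTL_MAGIC, 16, struct btrfs_ioctl_defrag_range_args)

int main(void)
{
	struct btrfs_ioctl_defrag_range_args args;
	int fd = open("/mnt/btrfs/bigfile", O_RDWR);

	memset(&args, 0, sizeof(args));
	args.len = (__u64)-1;
	args.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
	args.compress_type = BTRFS_COMPRESS_LZO;

	if (fd < 0 || ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args) < 0)
		perror("defrag");
	close(fd);
	return 0;
}
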
@@ -708,7 +1163,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
708 char *sizestr; 1163 char *sizestr;
709 char *devstr = NULL; 1164 char *devstr = NULL;
710 int ret = 0; 1165 int ret = 0;
711 int namelen;
712 int mod = 0; 1166 int mod = 0;
713 1167
714 if (root->fs_info->sb->s_flags & MS_RDONLY) 1168 if (root->fs_info->sb->s_flags & MS_RDONLY)
@@ -722,7 +1176,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
722 return PTR_ERR(vol_args); 1176 return PTR_ERR(vol_args);
723 1177
724 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1178 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
725 namelen = strlen(vol_args->name);
726 1179
727 mutex_lock(&root->fs_info->volume_mutex); 1180 mutex_lock(&root->fs_info->volume_mutex);
728 sizestr = vol_args->name; 1181 sizestr = vol_args->name;
@@ -789,6 +1242,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
789 1242
790 if (new_size > old_size) { 1243 if (new_size > old_size) {
791 trans = btrfs_start_transaction(root, 0); 1244 trans = btrfs_start_transaction(root, 0);
1245 if (IS_ERR(trans)) {
1246 ret = PTR_ERR(trans);
1247 goto out_unlock;
1248 }
792 ret = btrfs_grow_device(trans, device, new_size); 1249 ret = btrfs_grow_device(trans, device, new_size);
793 btrfs_commit_transaction(trans, root); 1250 btrfs_commit_transaction(trans, root);
794 } else { 1251 } else {
@@ -801,11 +1258,14 @@ out_unlock:
801 return ret; 1258 return ret;
802} 1259}
803 1260
804static noinline int btrfs_ioctl_snap_create(struct file *file, 1261static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
805 void __user *arg, int subvol) 1262 char *name,
1263 unsigned long fd,
1264 int subvol,
1265 u64 *transid,
1266 bool readonly)
806{ 1267{
807 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 1268 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
808 struct btrfs_ioctl_vol_args *vol_args;
809 struct file *src_file; 1269 struct file *src_file;
810 int namelen; 1270 int namelen;
811 int ret = 0; 1271 int ret = 0;
@@ -813,23 +1273,18 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
813 if (root->fs_info->sb->s_flags & MS_RDONLY) 1273 if (root->fs_info->sb->s_flags & MS_RDONLY)
814 return -EROFS; 1274 return -EROFS;
815 1275
816 vol_args = memdup_user(arg, sizeof(*vol_args)); 1276 namelen = strlen(name);
817 if (IS_ERR(vol_args)) 1277 if (strchr(name, '/')) {
818 return PTR_ERR(vol_args);
819
820 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
821 namelen = strlen(vol_args->name);
822 if (strchr(vol_args->name, '/')) {
823 ret = -EINVAL; 1278 ret = -EINVAL;
824 goto out; 1279 goto out;
825 } 1280 }
826 1281
827 if (subvol) { 1282 if (subvol) {
828 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, 1283 ret = btrfs_mksubvol(&file->f_path, name, namelen,
829 NULL); 1284 NULL, transid, readonly);
830 } else { 1285 } else {
831 struct inode *src_inode; 1286 struct inode *src_inode;
832 src_file = fget(vol_args->fd); 1287 src_file = fget(fd);
833 if (!src_file) { 1288 if (!src_file) {
834 ret = -EINVAL; 1289 ret = -EINVAL;
835 goto out; 1290 goto out;
@@ -843,15 +1298,155 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
843 fput(src_file); 1298 fput(src_file);
844 goto out; 1299 goto out;
845 } 1300 }
846 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, 1301 ret = btrfs_mksubvol(&file->f_path, name, namelen,
847 BTRFS_I(src_inode)->root); 1302 BTRFS_I(src_inode)->root,
1303 transid, readonly);
848 fput(src_file); 1304 fput(src_file);
849 } 1305 }
850out: 1306out:
1307 return ret;
1308}
1309
1310static noinline int btrfs_ioctl_snap_create(struct file *file,
1311 void __user *arg, int subvol)
1312{
1313 struct btrfs_ioctl_vol_args *vol_args;
1314 int ret;
1315
1316 vol_args = memdup_user(arg, sizeof(*vol_args));
1317 if (IS_ERR(vol_args))
1318 return PTR_ERR(vol_args);
1319 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1320
1321 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1322 vol_args->fd, subvol,
1323 NULL, false);
1324
1325 kfree(vol_args);
1326 return ret;
1327}
1328
1329static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1330 void __user *arg, int subvol)
1331{
1332 struct btrfs_ioctl_vol_args_v2 *vol_args;
1333 int ret;
1334 u64 transid = 0;
1335 u64 *ptr = NULL;
1336 bool readonly = false;
1337
1338 vol_args = memdup_user(arg, sizeof(*vol_args));
1339 if (IS_ERR(vol_args))
1340 return PTR_ERR(vol_args);
1341 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
1342
1343 if (vol_args->flags &
1344 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
1345 ret = -EOPNOTSUPP;
1346 goto out;
1347 }
1348
1349 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
1350 ptr = &transid;
1351 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1352 readonly = true;
1353
1354 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1355 vol_args->fd, subvol,
1356 ptr, readonly);
1357
1358 if (ret == 0 && ptr &&
1359 copy_to_user(arg +
1360 offsetof(struct btrfs_ioctl_vol_args_v2,
1361 transid), ptr, sizeof(*ptr)))
1362 ret = -EFAULT;
1363out:
851 kfree(vol_args); 1364 kfree(vol_args);
852 return ret; 1365 return ret;
853} 1366}
854 1367
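
A sketch of the userspace side of the v2 interface, assuming the vol_args_v2 layout and ioctl number from this series' ioctl.h and hypothetical paths; with BTRFS_SUBVOL_CREATE_ASYNC the call returns before the commit and hands back the transaction id to wait on:

/* Sketch: creating a read-only snapshot asynchronously through
 * BTRFS_IOC_SNAP_CREATE_V2, handled by btrfs_ioctl_snap_create_v2()
 * above. transid comes back filled in when CREATE_ASYNC is set.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC		0x94
#define BTRFS_SUBVOL_NAME_MAX		4039
#define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)
#define BTRFS_SUBVOL_RDONLY		(1ULL << 1)

struct btrfs_ioctl_vol_args_v2 {
	__s64 fd;		/* fd of the source subvolume */
	__u64 transid;		/* out: filled in for async creation */
	__u64 flags;
	__u64 unused[4];
	char name[BTRFS_SUBVOL_NAME_MAX + 1];
};
#define BTRFS_IOC_SNAP_CREATE_V2 \
	_IOW(BTRFS_IOCTL_MAGIC, 23, struct btrfs_ioctl_vol_args_v2)

int main(void)
{
	struct btrfs_ioctl_vol_args_v2 args;
	int src = open("/mnt/btrfs/subvol", O_RDONLY);
	int dst = open("/mnt/btrfs/snapshots", O_RDONLY);

	memset(&args, 0, sizeof(args));
	args.fd = src;
	args.flags = BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY;
	strcpy(args.name, "snap-ro");

	if (src < 0 || dst < 0 ||
	    ioctl(dst, BTRFS_IOC_SNAP_CREATE_V2, &args) < 0) {
		perror("snap create v2");
		return 1;
	}
	printf("snapshot queued in transid %llu\n",
	       (unsigned long long)args.transid);
	return 0;
}
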
1368static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1369 void __user *arg)
1370{
1371 struct inode *inode = fdentry(file)->d_inode;
1372 struct btrfs_root *root = BTRFS_I(inode)->root;
1373 int ret = 0;
1374 u64 flags = 0;
1375
1376 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
1377 return -EINVAL;
1378
1379 down_read(&root->fs_info->subvol_sem);
1380 if (btrfs_root_readonly(root))
1381 flags |= BTRFS_SUBVOL_RDONLY;
1382 up_read(&root->fs_info->subvol_sem);
1383
1384 if (copy_to_user(arg, &flags, sizeof(flags)))
1385 ret = -EFAULT;
1386
1387 return ret;
1388}
1389
1390static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1391 void __user *arg)
1392{
1393 struct inode *inode = fdentry(file)->d_inode;
1394 struct btrfs_root *root = BTRFS_I(inode)->root;
1395 struct btrfs_trans_handle *trans;
1396 u64 root_flags;
1397 u64 flags;
1398 int ret = 0;
1399
1400 if (root->fs_info->sb->s_flags & MS_RDONLY)
1401 return -EROFS;
1402
1403 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
1404 return -EINVAL;
1405
1406 if (copy_from_user(&flags, arg, sizeof(flags)))
1407 return -EFAULT;
1408
1409 if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
1410 return -EINVAL;
1411
1412 if (flags & ~BTRFS_SUBVOL_RDONLY)
1413 return -EOPNOTSUPP;
1414
1415 if (!inode_owner_or_capable(inode))
1416 return -EACCES;
1417
1418 down_write(&root->fs_info->subvol_sem);
1419
1420 /* nothing to do */
1421 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1422 goto out;
1423
1424 root_flags = btrfs_root_flags(&root->root_item);
1425 if (flags & BTRFS_SUBVOL_RDONLY)
1426 btrfs_set_root_flags(&root->root_item,
1427 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1428 else
1429 btrfs_set_root_flags(&root->root_item,
1430 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1431
1432 trans = btrfs_start_transaction(root, 1);
1433 if (IS_ERR(trans)) {
1434 ret = PTR_ERR(trans);
1435 goto out_reset;
1436 }
1437
1438 ret = btrfs_update_root(trans, root->fs_info->tree_root,
1439 &root->root_key, &root->root_item);
1440
1441 btrfs_commit_transaction(trans, root);
1442out_reset:
1443 if (ret)
1444 btrfs_set_root_flags(&root->root_item, root_flags);
1445out:
1446 up_write(&root->fs_info->subvol_sem);
1447 return ret;
1448}
1449
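
A sketch of flipping the read-only bit from userspace, assuming the ioctl numbers from this series' ioctl.h and a hypothetical subvolume path; the fd must reference the subvolume root itself, per the BTRFS_FIRST_FREE_OBJECTID check above:

/* Sketch: toggling the subvolume read-only bit through
 * BTRFS_IOC_SUBVOL_GETFLAGS/SETFLAGS, handled above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC	0x94
#define BTRFS_SUBVOL_RDONLY	(1ULL << 1)
#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)

int main(void)
{
	__u64 flags;
	int fd = open("/mnt/btrfs/subvol", O_RDONLY);

	if (fd < 0 || ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags) < 0) {
		perror("getflags");
		return 1;
	}
	flags |= BTRFS_SUBVOL_RDONLY;
	if (ioctl(fd, BTRFS_IOC_SUBVOL_SETFLAGS, &flags) < 0)
		perror("setflags");
	close(fd);
	return 0;
}
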
855/* 1450/*
856 * helper to check if the subvolume references other subvolumes 1451 * helper to check if the subvolume references other subvolumes
857 */ 1452 */
@@ -928,7 +1523,6 @@ static noinline int copy_to_sk(struct btrfs_root *root,
928 int nritems; 1523 int nritems;
929 int i; 1524 int i;
930 int slot; 1525 int slot;
931 int found = 0;
932 int ret = 0; 1526 int ret = 0;
933 1527
934 leaf = path->nodes[0]; 1528 leaf = path->nodes[0];
@@ -975,7 +1569,7 @@ static noinline int copy_to_sk(struct btrfs_root *root,
975 item_off, item_len); 1569 item_off, item_len);
976 *sk_offset += item_len; 1570 *sk_offset += item_len;
977 } 1571 }
978 found++; 1572 (*num_found)++;
979 1573
980 if (*num_found >= sk->nr_items) 1574 if (*num_found >= sk->nr_items)
981 break; 1575 break;
@@ -994,7 +1588,6 @@ advance_key:
994 } else 1588 } else
995 ret = 1; 1589 ret = 1;
996overflow: 1590overflow:
997 *num_found += found;
998 return ret; 1591 return ret;
999} 1592}
1000 1593
@@ -1051,7 +1644,7 @@ static noinline int search_ioctl(struct inode *inode,
1051 } 1644 }
1052 ret = copy_to_sk(root, path, &key, sk, args->buf, 1645 ret = copy_to_sk(root, path, &key, sk, args->buf,
1053 &sk_offset, &num_found); 1646 &sk_offset, &num_found);
1054 btrfs_release_path(root, path); 1647 btrfs_release_path(path);
1055 if (ret || num_found >= sk->nr_items) 1648 if (ret || num_found >= sk->nr_items)
1056 break; 1649 break;
1057 1650
@@ -1073,14 +1666,10 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
1073 if (!capable(CAP_SYS_ADMIN)) 1666 if (!capable(CAP_SYS_ADMIN))
1074 return -EPERM; 1667 return -EPERM;
1075 1668
1076 args = kmalloc(sizeof(*args), GFP_KERNEL); 1669 args = memdup_user(argp, sizeof(*args));
1077 if (!args) 1670 if (IS_ERR(args))
1078 return -ENOMEM; 1671 return PTR_ERR(args);
1079 1672
1080 if (copy_from_user(args, argp, sizeof(*args))) {
1081 kfree(args);
1082 return -EFAULT;
1083 }
1084 inode = fdentry(file)->d_inode; 1673 inode = fdentry(file)->d_inode;
1085 ret = search_ioctl(inode, args); 1674 ret = search_ioctl(inode, args);
1086 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 1675 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
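
A sketch of the userspace side of the tree-search interface, assuming the structures and ioctl number from this series' ioctl.h and a hypothetical mount point; on return the kernel rewrites key.nr_items with the number of items copied into buf, which is what the copy_to_sk() counting fix above feeds. Requires CAP_SYS_ADMIN:

/* Sketch: walking the root tree with BTRFS_IOC_TREE_SEARCH, served by
 * search_ioctl()/copy_to_sk() above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC 0x94

struct btrfs_ioctl_search_key {
	__u64 tree_id;
	__u64 min_objectid, max_objectid;
	__u64 min_offset, max_offset;
	__u64 min_transid, max_transid;
	__u32 min_type, max_type;
	__u32 nr_items;		/* in: max wanted, out: number found */
	__u32 unused;
	__u64 unused1, unused2, unused3, unused4;
};

struct btrfs_ioctl_search_args {
	struct btrfs_ioctl_search_key key;
	char buf[4096 - sizeof(struct btrfs_ioctl_search_key)];
};
#define BTRFS_IOC_TREE_SEARCH \
	_IOWR(BTRFS_IOCTL_MAGIC, 17, struct btrfs_ioctl_search_args)

int main(void)
{
	struct btrfs_ioctl_search_args args;
	int fd = open("/mnt/btrfs", O_RDONLY);

	memset(&args, 0, sizeof(args));
	args.key.tree_id = 1;			/* root tree */
	args.key.max_objectid = (__u64)-1;	/* match everything */
	args.key.max_offset = (__u64)-1;
	args.key.max_transid = (__u64)-1;
	args.key.max_type = (__u32)-1;
	args.key.nr_items = 64;

	if (fd < 0 || ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) {
		perror("tree search");
		return 1;
	}
	printf("found %u items\n", args.key.nr_items);
	close(fd);
	return 0;
}
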
@@ -1162,7 +1751,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1162 if (key.offset == BTRFS_FIRST_FREE_OBJECTID) 1751 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1163 break; 1752 break;
1164 1753
1165 btrfs_release_path(root, path); 1754 btrfs_release_path(path);
1166 key.objectid = key.offset; 1755 key.objectid = key.offset;
1167 key.offset = (u64)-1; 1756 key.offset = (u64)-1;
1168 dirid = key.objectid; 1757 dirid = key.objectid;
@@ -1188,14 +1777,10 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1188 if (!capable(CAP_SYS_ADMIN)) 1777 if (!capable(CAP_SYS_ADMIN))
1189 return -EPERM; 1778 return -EPERM;
1190 1779
1191 args = kmalloc(sizeof(*args), GFP_KERNEL); 1780 args = memdup_user(argp, sizeof(*args));
1192 if (!args) 1781 if (IS_ERR(args))
1193 return -ENOMEM; 1782 return PTR_ERR(args);
1194 1783
1195 if (copy_from_user(args, argp, sizeof(*args))) {
1196 kfree(args);
1197 return -EFAULT;
1198 }
1199 inode = fdentry(file)->d_inode; 1784 inode = fdentry(file)->d_inode;
1200 1785
1201 if (args->treeid == 0) 1786 if (args->treeid == 0)
@@ -1227,9 +1812,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1227 int ret; 1812 int ret;
1228 int err = 0; 1813 int err = 0;
1229 1814
1230 if (!capable(CAP_SYS_ADMIN))
1231 return -EPERM;
1232
1233 vol_args = memdup_user(arg, sizeof(*vol_args)); 1815 vol_args = memdup_user(arg, sizeof(*vol_args));
1234 if (IS_ERR(vol_args)) 1816 if (IS_ERR(vol_args))
1235 return PTR_ERR(vol_args); 1817 return PTR_ERR(vol_args);
@@ -1259,12 +1841,50 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1259 } 1841 }
1260 1842
1261 inode = dentry->d_inode; 1843 inode = dentry->d_inode;
1262 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 1844 dest = BTRFS_I(inode)->root;
1845 if (!capable(CAP_SYS_ADMIN)) {
1846 /*
1847 * Regular user. Only allow this with a special mount
1848 * option, when the user has write+exec access to the
1849 * subvol root, and when rmdir(2) would have been
1850 * allowed.
1851 *
1852 * Note that this is _not_ a check that the subvol is
1853 * empty or doesn't contain data that we wouldn't
1854 * otherwise be able to delete.
1855 *
1856 * Users who want to delete empty subvols should try
1857 * rmdir(2).
1858 */
1859 err = -EPERM;
1860 if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
1861 goto out_dput;
1862
1863 /*
1864 * Do not allow deletion if the parent dir is the same
1865 * as the dir to be deleted. That means the ioctl
1866 * must be called on the dentry referencing the root
1867 * of the subvol, not a random directory contained
1868 * within it.
1869 */
1263 err = -EINVAL; 1870 err = -EINVAL;
1264 goto out_dput; 1871 if (root == dest)
1872 goto out_dput;
1873
1874 err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
1875 if (err)
1876 goto out_dput;
1877
1878 /* check if subvolume may be deleted by a non-root user */
1879 err = btrfs_may_delete(dir, dentry, 1);
1880 if (err)
1881 goto out_dput;
1265 } 1882 }
1266 1883
1267 dest = BTRFS_I(inode)->root; 1884 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
1885 err = -EINVAL;
1886 goto out_dput;
1887 }
1268 1888
1269 mutex_lock(&inode->i_mutex); 1889 mutex_lock(&inode->i_mutex);
1270 err = d_invalidate(dentry); 1890 err = d_invalidate(dentry);
@@ -1304,7 +1924,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1304 BUG_ON(ret); 1924 BUG_ON(ret);
1305 } 1925 }
1306 1926
1307 ret = btrfs_commit_transaction(trans, root); 1927 ret = btrfs_end_transaction(trans, root);
1308 BUG_ON(ret); 1928 BUG_ON(ret);
1309 inode->i_flags |= S_DEAD; 1929 inode->i_flags |= S_DEAD;
1310out_up_write: 1930out_up_write:
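
A sketch of the unprivileged deletion path, assuming a filesystem mounted with -o user_subvol_rm_allowed, the vol_args layout from this series' ioctl.h, and hypothetical paths; the caller only needs write+exec on the parent plus the rmdir-style checks in btrfs_may_delete():

/* Sketch: deleting a subvolume as a non-root user through
 * BTRFS_IOC_SNAP_DESTROY, relying on the USER_SUBVOL_RM_ALLOWED
 * branch added above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC	0x94
#define BTRFS_PATH_NAME_MAX	4087

struct btrfs_ioctl_vol_args {
	__s64 fd;
	char name[BTRFS_PATH_NAME_MAX + 1];
};
#define BTRFS_IOC_SNAP_DESTROY \
	_IOW(BTRFS_IOCTL_MAGIC, 15, struct btrfs_ioctl_vol_args)

int main(void)
{
	struct btrfs_ioctl_vol_args args;
	/* fd of the directory containing the subvolume to delete */
	int dir = open("/mnt/btrfs/snapshots", O_RDONLY);

	memset(&args, 0, sizeof(args));
	strcpy(args.name, "snap-old");

	if (dir < 0 || ioctl(dir, BTRFS_IOC_SNAP_DESTROY, &args) < 0)
		perror("snap destroy");
	close(dir);
	return 0;
}
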
@@ -1333,6 +1953,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1333 struct btrfs_ioctl_defrag_range_args *range; 1953 struct btrfs_ioctl_defrag_range_args *range;
1334 int ret; 1954 int ret;
1335 1955
1956 if (btrfs_root_readonly(root))
1957 return -EROFS;
1958
1336 ret = mnt_want_write(file->f_path.mnt); 1959 ret = mnt_want_write(file->f_path.mnt);
1337 if (ret) 1960 if (ret)
1338 return ret; 1961 return ret;
@@ -1376,7 +1999,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1376 /* the rest are all set to zero by kzalloc */ 1999 /* the rest are all set to zero by kzalloc */
1377 range->len = (u64)-1; 2000 range->len = (u64)-1;
1378 } 2001 }
1379 ret = btrfs_defrag_file(file, range); 2002 ret = btrfs_defrag_file(fdentry(file)->d_inode, file,
2003 range, 0, 0);
2004 if (ret > 0)
2005 ret = 0;
1380 kfree(range); 2006 kfree(range);
1381 break; 2007 break;
1382 default: 2008 default:
@@ -1428,6 +2054,80 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
1428 return ret; 2054 return ret;
1429} 2055}
1430 2056
2057static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
2058{
2059 struct btrfs_ioctl_fs_info_args *fi_args;
2060 struct btrfs_device *device;
2061 struct btrfs_device *next;
2062 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2063 int ret = 0;
2064
2065 if (!capable(CAP_SYS_ADMIN))
2066 return -EPERM;
2067
2068 fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
2069 if (!fi_args)
2070 return -ENOMEM;
2071
2072 fi_args->num_devices = fs_devices->num_devices;
2073 memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
2074
2075 mutex_lock(&fs_devices->device_list_mutex);
2076 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
2077 if (device->devid > fi_args->max_id)
2078 fi_args->max_id = device->devid;
2079 }
2080 mutex_unlock(&fs_devices->device_list_mutex);
2081
2082 if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
2083 ret = -EFAULT;
2084
2085 kfree(fi_args);
2086 return ret;
2087}
2088
2089static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2090{
2091 struct btrfs_ioctl_dev_info_args *di_args;
2092 struct btrfs_device *dev;
2093 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2094 int ret = 0;
2095 char *s_uuid = NULL;
2096 char empty_uuid[BTRFS_UUID_SIZE] = {0};
2097
2098 if (!capable(CAP_SYS_ADMIN))
2099 return -EPERM;
2100
2101 di_args = memdup_user(arg, sizeof(*di_args));
2102 if (IS_ERR(di_args))
2103 return PTR_ERR(di_args);
2104
2105 if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0)
2106 s_uuid = di_args->uuid;
2107
2108 mutex_lock(&fs_devices->device_list_mutex);
2109 dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL);
2110 mutex_unlock(&fs_devices->device_list_mutex);
2111
2112 if (!dev) {
2113 ret = -ENODEV;
2114 goto out;
2115 }
2116
2117 di_args->devid = dev->devid;
2118 di_args->bytes_used = dev->bytes_used;
2119 di_args->total_bytes = dev->total_bytes;
2120 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
2121 strncpy(di_args->path, dev->name, sizeof(di_args->path));
2122
2123out:
2124 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
2125 ret = -EFAULT;
2126
2127 kfree(di_args);
2128 return ret;
2129}
2130
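
A sketch chaining the two new queries, assuming the argument layouts and ioctl numbers from this series' ioctl.h and a hypothetical mount point; devids may be sparse, so -ENODEV for an individual id is skipped rather than treated as fatal:

/* Sketch: BTRFS_IOC_FS_INFO, then BTRFS_IOC_DEV_INFO for each possible
 * devid up to max_id, mirroring how the handlers above expect to be
 * driven. A zeroed uuid means "match by devid only".
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC 0x94

struct btrfs_ioctl_fs_info_args {
	__u64 max_id;		/* out */
	__u64 num_devices;	/* out */
	__u8 fsid[16];		/* out */
	__u64 reserved[124];	/* pad to 1k */
};
struct btrfs_ioctl_dev_info_args {
	__u64 devid;		/* in/out */
	__u8 uuid[16];		/* in/out */
	__u64 bytes_used;	/* out */
	__u64 total_bytes;	/* out */
	__u64 unused[379];	/* pad to 4k */
	__u8 path[1024];	/* out */
};
#define BTRFS_IOC_FS_INFO \
	_IOR(BTRFS_IOCTL_MAGIC, 31, struct btrfs_ioctl_fs_info_args)
#define BTRFS_IOC_DEV_INFO \
	_IOWR(BTRFS_IOCTL_MAGIC, 30, struct btrfs_ioctl_dev_info_args)

int main(void)
{
	struct btrfs_ioctl_fs_info_args fi;
	struct btrfs_ioctl_dev_info_args di;
	__u64 devid;
	int fd = open("/mnt/btrfs", O_RDONLY);

	if (fd < 0 || ioctl(fd, BTRFS_IOC_FS_INFO, &fi) < 0) {
		perror("fs info");
		return 1;
	}
	for (devid = 1; devid <= fi.max_id; devid++) {
		memset(&di, 0, sizeof(di));	/* zero uuid: match any */
		di.devid = devid;
		if (ioctl(fd, BTRFS_IOC_DEV_INFO, &di) < 0)
			continue;		/* devid hole: -ENODEV */
		printf("devid %llu: %s, %llu/%llu bytes used\n",
		       (unsigned long long)di.devid, (char *)di.path,
		       (unsigned long long)di.bytes_used,
		       (unsigned long long)di.total_bytes);
	}
	close(fd);
	return 0;
}
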
1431static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, 2131static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1432 u64 off, u64 olen, u64 destoff) 2132 u64 off, u64 olen, u64 destoff)
1433{ 2133{
@@ -1461,6 +2161,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1461 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) 2161 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1462 return -EINVAL; 2162 return -EINVAL;
1463 2163
2164 if (btrfs_root_readonly(root))
2165 return -EROFS;
2166
1464 ret = mnt_want_write(file->f_path.mnt); 2167 ret = mnt_want_write(file->f_path.mnt);
1465 if (ret) 2168 if (ret)
1466 return ret; 2169 return ret;
@@ -1502,11 +2205,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1502 path->reada = 2; 2205 path->reada = 2;
1503 2206
1504 if (inode < src) { 2207 if (inode < src) {
1505 mutex_lock(&inode->i_mutex); 2208 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
1506 mutex_lock(&src->i_mutex); 2209 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
1507 } else { 2210 } else {
1508 mutex_lock(&src->i_mutex); 2211 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
1509 mutex_lock(&inode->i_mutex); 2212 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1510 } 2213 }
1511 2214
1512 /* determine range to clone */ 2215 /* determine range to clone */
@@ -1517,12 +2220,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1517 olen = len = src->i_size - off; 2220 olen = len = src->i_size - off;
1518 /* if we extend to eof, continue to block boundary */ 2221 /* if we extend to eof, continue to block boundary */
1519 if (off + len == src->i_size) 2222 if (off + len == src->i_size)
1520 len = ((src->i_size + bs-1) & ~(bs-1)) 2223 len = ALIGN(src->i_size, bs) - off;
1521 - off;
1522 2224
1523 /* verify the end result is block aligned */ 2225 /* verify the end result is block aligned */
1524 if ((off & (bs-1)) || 2226 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
1525 ((off + len) & (bs-1))) 2227 !IS_ALIGNED(destoff, bs))
1526 goto out_unlock; 2228 goto out_unlock;
1527 2229
1528 /* do any pending delalloc/csum calc on src, one way or 2230 /* do any pending delalloc/csum calc on src, one way or
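
A sketch of a reflink-style clone against the checks above, assuming the clone_range layout and ioctl number from this series' ioctl.h and hypothetical paths; both offsets must be block-aligned, while a zero length means clone through EOF, whose unaligned tail is rounded up to the block boundary by the ALIGN() logic above:

/* Sketch: reflinking a block-aligned range between two files with
 * BTRFS_IOC_CLONE_RANGE, the path being patched above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC 0x94

struct btrfs_ioctl_clone_range_args {
	__s64 src_fd;
	__u64 src_offset;
	__u64 src_length;
	__u64 dest_offset;
};
#define BTRFS_IOC_CLONE_RANGE \
	_IOW(BTRFS_IOCTL_MAGIC, 13, struct btrfs_ioctl_clone_range_args)

int main(void)
{
	int src = open("/mnt/btrfs/src.dat", O_RDONLY);
	int dst = open("/mnt/btrfs/dst.dat", O_RDWR | O_CREAT, 0644);
	struct btrfs_ioctl_clone_range_args args = {
		.src_fd = src,
		.src_offset = 0,	/* must be block aligned */
		.src_length = 0,	/* 0: clone through EOF */
		.dest_offset = 0,	/* must be block aligned */
	};

	if (src < 0 || dst < 0 ||
	    ioctl(dst, BTRFS_IOC_CLONE_RANGE, &args) < 0)
		perror("clone range");
	return 0;
}
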
@@ -1530,17 +2232,19 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1530 while (1) { 2232 while (1) {
1531 struct btrfs_ordered_extent *ordered; 2233 struct btrfs_ordered_extent *ordered;
1532 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 2234 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1533 ordered = btrfs_lookup_first_ordered_extent(inode, off+len); 2235 ordered = btrfs_lookup_first_ordered_extent(src, off+len);
1534 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) 2236 if (!ordered &&
2237 !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
2238 EXTENT_DELALLOC, 0, NULL))
1535 break; 2239 break;
1536 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 2240 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1537 if (ordered) 2241 if (ordered)
1538 btrfs_put_ordered_extent(ordered); 2242 btrfs_put_ordered_extent(ordered);
1539 btrfs_wait_ordered_range(src, off, off+len); 2243 btrfs_wait_ordered_range(src, off, len);
1540 } 2244 }
1541 2245
1542 /* clone data */ 2246 /* clone data */
1543 key.objectid = src->i_ino; 2247 key.objectid = btrfs_ino(src);
1544 key.type = BTRFS_EXTENT_DATA_KEY; 2248 key.type = BTRFS_EXTENT_DATA_KEY;
1545 key.offset = 0; 2249 key.offset = 0;
1546 2250
@@ -1567,7 +2271,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1567 2271
1568 btrfs_item_key_to_cpu(leaf, &key, slot); 2272 btrfs_item_key_to_cpu(leaf, &key, slot);
1569 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || 2273 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
1570 key.objectid != src->i_ino) 2274 key.objectid != btrfs_ino(src))
1571 break; 2275 break;
1572 2276
1573 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { 2277 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
@@ -1603,15 +2307,18 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1603 datal = btrfs_file_extent_ram_bytes(leaf, 2307 datal = btrfs_file_extent_ram_bytes(leaf,
1604 extent); 2308 extent);
1605 } 2309 }
1606 btrfs_release_path(root, path); 2310 btrfs_release_path(path);
1607 2311
1608 if (key.offset + datal < off || 2312 if (key.offset + datal <= off ||
1609 key.offset >= off+len) 2313 key.offset >= off+len)
1610 goto next; 2314 goto next;
1611 2315
1612 memcpy(&new_key, &key, sizeof(new_key)); 2316 memcpy(&new_key, &key, sizeof(new_key));
1613 new_key.objectid = inode->i_ino; 2317 new_key.objectid = btrfs_ino(inode);
1614 new_key.offset = key.offset + destoff - off; 2318 if (off <= key.offset)
2319 new_key.offset = key.offset + destoff - off;
2320 else
2321 new_key.offset = destoff;
1615 2322
1616 trans = btrfs_start_transaction(root, 1); 2323 trans = btrfs_start_transaction(root, 1);
1617 if (IS_ERR(trans)) { 2324 if (IS_ERR(trans)) {
@@ -1661,7 +2368,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1661 ret = btrfs_inc_extent_ref(trans, root, 2368 ret = btrfs_inc_extent_ref(trans, root,
1662 disko, diskl, 0, 2369 disko, diskl, 0,
1663 root->root_key.objectid, 2370 root->root_key.objectid,
1664 inode->i_ino, 2371 btrfs_ino(inode),
1665 new_key.offset - datao); 2372 new_key.offset - datao);
1666 BUG_ON(ret); 2373 BUG_ON(ret);
1667 } 2374 }
@@ -1710,7 +2417,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1710 } 2417 }
1711 2418
1712 btrfs_mark_buffer_dirty(leaf); 2419 btrfs_mark_buffer_dirty(leaf);
1713 btrfs_release_path(root, path); 2420 btrfs_release_path(path);
1714 2421
1715 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2422 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1716 2423
@@ -1720,8 +2427,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1720 * but shouldn't round up the file size 2427 * but shouldn't round up the file size
1721 */ 2428 */
1722 endoff = new_key.offset + datal; 2429 endoff = new_key.offset + datal;
1723 if (endoff > off+olen) 2430 if (endoff > destoff+olen)
1724 endoff = off+olen; 2431 endoff = destoff+olen;
1725 if (endoff > inode->i_size) 2432 if (endoff > inode->i_size)
1726 btrfs_i_size_write(inode, endoff); 2433 btrfs_i_size_write(inode, endoff);
1727 2434
@@ -1731,12 +2438,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1731 btrfs_end_transaction(trans, root); 2438 btrfs_end_transaction(trans, root);
1732 } 2439 }
1733next: 2440next:
1734 btrfs_release_path(root, path); 2441 btrfs_release_path(path);
1735 key.offset++; 2442 key.offset++;
1736 } 2443 }
1737 ret = 0; 2444 ret = 0;
1738out: 2445out:
1739 btrfs_release_path(root, path); 2446 btrfs_release_path(path);
1740 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 2447 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1741out_unlock: 2448out_unlock:
1742 mutex_unlock(&src->i_mutex); 2449 mutex_unlock(&src->i_mutex);
@@ -1781,26 +2488,26 @@ static long btrfs_ioctl_trans_start(struct file *file)
1781 if (file->private_data) 2488 if (file->private_data)
1782 goto out; 2489 goto out;
1783 2490
2491 ret = -EROFS;
2492 if (btrfs_root_readonly(root))
2493 goto out;
2494
1784 ret = mnt_want_write(file->f_path.mnt); 2495 ret = mnt_want_write(file->f_path.mnt);
1785 if (ret) 2496 if (ret)
1786 goto out; 2497 goto out;
1787 2498
1788 mutex_lock(&root->fs_info->trans_mutex); 2499 atomic_inc(&root->fs_info->open_ioctl_trans);
1789 root->fs_info->open_ioctl_trans++;
1790 mutex_unlock(&root->fs_info->trans_mutex);
1791 2500
1792 ret = -ENOMEM; 2501 ret = -ENOMEM;
1793 trans = btrfs_start_ioctl_transaction(root, 0); 2502 trans = btrfs_start_ioctl_transaction(root);
1794 if (!trans) 2503 if (IS_ERR(trans))
1795 goto out_drop; 2504 goto out_drop;
1796 2505
1797 file->private_data = trans; 2506 file->private_data = trans;
1798 return 0; 2507 return 0;
1799 2508
1800out_drop: 2509out_drop:
1801 mutex_lock(&root->fs_info->trans_mutex); 2510 atomic_dec(&root->fs_info->open_ioctl_trans);
1802 root->fs_info->open_ioctl_trans--;
1803 mutex_unlock(&root->fs_info->trans_mutex);
1804 mnt_drop_write(file->f_path.mnt); 2511 mnt_drop_write(file->f_path.mnt);
1805out: 2512out:
1806 return ret; 2513 return ret;
@@ -1847,9 +2554,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1847 path->leave_spinning = 1; 2554 path->leave_spinning = 1;
1848 2555
1849 trans = btrfs_start_transaction(root, 1); 2556 trans = btrfs_start_transaction(root, 1);
1850 if (!trans) { 2557 if (IS_ERR(trans)) {
1851 btrfs_free_path(path); 2558 btrfs_free_path(path);
1852 return -ENOMEM; 2559 return PTR_ERR(trans);
1853 } 2560 }
1854 2561
1855 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2562 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
@@ -1879,35 +2586,80 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1879 return 0; 2586 return 0;
1880} 2587}
1881 2588
2589static void get_block_group_info(struct list_head *groups_list,
2590 struct btrfs_ioctl_space_info *space)
2591{
2592 struct btrfs_block_group_cache *block_group;
2593
2594 space->total_bytes = 0;
2595 space->used_bytes = 0;
2596 space->flags = 0;
2597 list_for_each_entry(block_group, groups_list, list) {
2598 space->flags = block_group->flags;
2599 space->total_bytes += block_group->key.offset;
2600 space->used_bytes +=
2601 btrfs_block_group_used(&block_group->item);
2602 }
2603}
2604
1882long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) 2605long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1883{ 2606{
1884 struct btrfs_ioctl_space_args space_args; 2607 struct btrfs_ioctl_space_args space_args;
1885 struct btrfs_ioctl_space_info space; 2608 struct btrfs_ioctl_space_info space;
1886 struct btrfs_ioctl_space_info *dest; 2609 struct btrfs_ioctl_space_info *dest;
1887 struct btrfs_ioctl_space_info *dest_orig; 2610 struct btrfs_ioctl_space_info *dest_orig;
1888 struct btrfs_ioctl_space_info *user_dest; 2611 struct btrfs_ioctl_space_info __user *user_dest;
1889 struct btrfs_space_info *info; 2612 struct btrfs_space_info *info;
2613 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2614 BTRFS_BLOCK_GROUP_SYSTEM,
2615 BTRFS_BLOCK_GROUP_METADATA,
2616 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
2617 int num_types = 4;
1890 int alloc_size; 2618 int alloc_size;
1891 int ret = 0; 2619 int ret = 0;
1892 int slot_count = 0; 2620 u64 slot_count = 0;
2621 int i, c;
1893 2622
1894 if (copy_from_user(&space_args, 2623 if (copy_from_user(&space_args,
1895 (struct btrfs_ioctl_space_args __user *)arg, 2624 (struct btrfs_ioctl_space_args __user *)arg,
1896 sizeof(space_args))) 2625 sizeof(space_args)))
1897 return -EFAULT; 2626 return -EFAULT;
1898 2627
1899 /* first we count slots */ 2628 for (i = 0; i < num_types; i++) {
1900 rcu_read_lock(); 2629 struct btrfs_space_info *tmp;
1901 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) 2630
1902 slot_count++; 2631 info = NULL;
1903 rcu_read_unlock(); 2632 rcu_read_lock();
2633 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
2634 list) {
2635 if (tmp->flags == types[i]) {
2636 info = tmp;
2637 break;
2638 }
2639 }
2640 rcu_read_unlock();
2641
2642 if (!info)
2643 continue;
2644
2645 down_read(&info->groups_sem);
2646 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2647 if (!list_empty(&info->block_groups[c]))
2648 slot_count++;
2649 }
2650 up_read(&info->groups_sem);
2651 }
1904 2652
1905 /* space_slots == 0 means they are asking for a count */ 2653 /* space_slots == 0 means they are asking for a count */
1906 if (space_args.space_slots == 0) { 2654 if (space_args.space_slots == 0) {
1907 space_args.total_spaces = slot_count; 2655 space_args.total_spaces = slot_count;
1908 goto out; 2656 goto out;
1909 } 2657 }
2658
2659 slot_count = min_t(u64, space_args.space_slots, slot_count);
2660
1910 alloc_size = sizeof(*dest) * slot_count; 2661 alloc_size = sizeof(*dest) * slot_count;
2662
1911 /* we generally have at most 6 or so space infos, one for each raid 2663 /* we generally have at most 6 or so space infos, one for each raid
1912 * level. So, a whole page should be more than enough for everyone 2664 * level. So, a whole page should be more than enough for everyone
1913 */ 2665 */
@@ -1921,27 +2673,40 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1921 dest_orig = dest; 2673 dest_orig = dest;
1922 2674
1923 /* now we have a buffer to copy into */ 2675 /* now we have a buffer to copy into */
1924 rcu_read_lock(); 2676 for (i = 0; i < num_types; i++) {
1925 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) { 2677 struct btrfs_space_info *tmp;
1926 /* make sure we don't copy more than we allocated
1927 * in our buffer
1928 */
1929 if (slot_count == 0)
1930 break;
1931 slot_count--;
1932 2678
1933 /* make sure userland has enough room in their buffer */ 2679 if (!slot_count)
1934 if (space_args.total_spaces >= space_args.space_slots)
1935 break; 2680 break;
1936 2681
1937 space.flags = info->flags; 2682 info = NULL;
1938 space.total_bytes = info->total_bytes; 2683 rcu_read_lock();
1939 space.used_bytes = info->bytes_used; 2684 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
1940 memcpy(dest, &space, sizeof(space)); 2685 list) {
1941 dest++; 2686 if (tmp->flags == types[i]) {
1942 space_args.total_spaces++; 2687 info = tmp;
2688 break;
2689 }
2690 }
2691 rcu_read_unlock();
2692
2693 if (!info)
2694 continue;
2695 down_read(&info->groups_sem);
2696 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2697 if (!list_empty(&info->block_groups[c])) {
2698 get_block_group_info(&info->block_groups[c],
2699 &space);
2700 memcpy(dest, &space, sizeof(space));
2701 dest++;
2702 space_args.total_spaces++;
2703 slot_count--;
2704 }
2705 if (!slot_count)
2706 break;
2707 }
2708 up_read(&info->groups_sem);
1943 } 2709 }
1944 rcu_read_unlock();
1945 2710
1946 user_dest = (struct btrfs_ioctl_space_info *) 2711 user_dest = (struct btrfs_ioctl_space_info *)
1947 (arg + sizeof(struct btrfs_ioctl_space_args)); 2712 (arg + sizeof(struct btrfs_ioctl_space_args));
@@ -1976,14 +2741,101 @@ long btrfs_ioctl_trans_end(struct file *file)
1976 2741
1977 btrfs_end_transaction(trans, root); 2742 btrfs_end_transaction(trans, root);
1978 2743
1979 mutex_lock(&root->fs_info->trans_mutex); 2744 atomic_dec(&root->fs_info->open_ioctl_trans);
1980 root->fs_info->open_ioctl_trans--;
1981 mutex_unlock(&root->fs_info->trans_mutex);
1982 2745
1983 mnt_drop_write(file->f_path.mnt); 2746 mnt_drop_write(file->f_path.mnt);
1984 return 0; 2747 return 0;
1985} 2748}
1986 2749
2750static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
2751{
2752 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2753 struct btrfs_trans_handle *trans;
2754 u64 transid;
2755 int ret;
2756
2757 trans = btrfs_start_transaction(root, 0);
2758 if (IS_ERR(trans))
2759 return PTR_ERR(trans);
2760 transid = trans->transid;
2761 ret = btrfs_commit_transaction_async(trans, root, 0);
2762 if (ret) {
2763 btrfs_end_transaction(trans, root);
2764 return ret;
2765 }
2766
2767 if (argp)
2768 if (copy_to_user(argp, &transid, sizeof(transid)))
2769 return -EFAULT;
2770 return 0;
2771}
2772
2773static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
2774{
2775 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2776 u64 transid;
2777
2778 if (argp) {
2779 if (copy_from_user(&transid, argp, sizeof(transid)))
2780 return -EFAULT;
2781 } else {
2782 transid = 0; /* current trans */
2783 }
2784 return btrfs_wait_for_commit(root, transid);
2785}
2786
2787static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
2788{
2789 int ret;
2790 struct btrfs_ioctl_scrub_args *sa;
2791
2792 if (!capable(CAP_SYS_ADMIN))
2793 return -EPERM;
2794
2795 sa = memdup_user(arg, sizeof(*sa));
2796 if (IS_ERR(sa))
2797 return PTR_ERR(sa);
2798
2799 ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end,
2800 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY);
2801
2802 if (copy_to_user(arg, sa, sizeof(*sa)))
2803 ret = -EFAULT;
2804
2805 kfree(sa);
2806 return ret;
2807}
2808
2809static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
2810{
2811 if (!capable(CAP_SYS_ADMIN))
2812 return -EPERM;
2813
2814 return btrfs_scrub_cancel(root);
2815}
2816
2817static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
2818 void __user *arg)
2819{
2820 struct btrfs_ioctl_scrub_args *sa;
2821 int ret;
2822
2823 if (!capable(CAP_SYS_ADMIN))
2824 return -EPERM;
2825
2826 sa = memdup_user(arg, sizeof(*sa));
2827 if (IS_ERR(sa))
2828 return PTR_ERR(sa);
2829
2830 ret = btrfs_scrub_progress(root, sa->devid, &sa->progress);
2831
2832 if (copy_to_user(arg, sa, sizeof(*sa)))
2833 ret = -EFAULT;
2834
2835 kfree(sa);
2836 return ret;
2837}
2838
1987long btrfs_ioctl(struct file *file, unsigned int 2839long btrfs_ioctl(struct file *file, unsigned int
1988 cmd, unsigned long arg) 2840 cmd, unsigned long arg)
1989{ 2841{
@@ -1997,12 +2849,20 @@ long btrfs_ioctl(struct file *file, unsigned int
1997 return btrfs_ioctl_setflags(file, argp); 2849 return btrfs_ioctl_setflags(file, argp);
1998 case FS_IOC_GETVERSION: 2850 case FS_IOC_GETVERSION:
1999 return btrfs_ioctl_getversion(file, argp); 2851 return btrfs_ioctl_getversion(file, argp);
2852 case FITRIM:
2853 return btrfs_ioctl_fitrim(file, argp);
2000 case BTRFS_IOC_SNAP_CREATE: 2854 case BTRFS_IOC_SNAP_CREATE:
2001 return btrfs_ioctl_snap_create(file, argp, 0); 2855 return btrfs_ioctl_snap_create(file, argp, 0);
2856 case BTRFS_IOC_SNAP_CREATE_V2:
2857 return btrfs_ioctl_snap_create_v2(file, argp, 0);
2002 case BTRFS_IOC_SUBVOL_CREATE: 2858 case BTRFS_IOC_SUBVOL_CREATE:
2003 return btrfs_ioctl_snap_create(file, argp, 1); 2859 return btrfs_ioctl_snap_create(file, argp, 1);
2004 case BTRFS_IOC_SNAP_DESTROY: 2860 case BTRFS_IOC_SNAP_DESTROY:
2005 return btrfs_ioctl_snap_destroy(file, argp); 2861 return btrfs_ioctl_snap_destroy(file, argp);
2862 case BTRFS_IOC_SUBVOL_GETFLAGS:
2863 return btrfs_ioctl_subvol_getflags(file, argp);
2864 case BTRFS_IOC_SUBVOL_SETFLAGS:
2865 return btrfs_ioctl_subvol_setflags(file, argp);
2006 case BTRFS_IOC_DEFAULT_SUBVOL: 2866 case BTRFS_IOC_DEFAULT_SUBVOL:
2007 return btrfs_ioctl_default_subvol(file, argp); 2867 return btrfs_ioctl_default_subvol(file, argp);
2008 case BTRFS_IOC_DEFRAG: 2868 case BTRFS_IOC_DEFRAG:
@@ -2015,6 +2875,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2015 return btrfs_ioctl_add_dev(root, argp); 2875 return btrfs_ioctl_add_dev(root, argp);
2016 case BTRFS_IOC_RM_DEV: 2876 case BTRFS_IOC_RM_DEV:
2017 return btrfs_ioctl_rm_dev(root, argp); 2877 return btrfs_ioctl_rm_dev(root, argp);
2878 case BTRFS_IOC_FS_INFO:
2879 return btrfs_ioctl_fs_info(root, argp);
2880 case BTRFS_IOC_DEV_INFO:
2881 return btrfs_ioctl_dev_info(root, argp);
2018 case BTRFS_IOC_BALANCE: 2882 case BTRFS_IOC_BALANCE:
2019 return btrfs_balance(root->fs_info->dev_root); 2883 return btrfs_balance(root->fs_info->dev_root);
2020 case BTRFS_IOC_CLONE: 2884 case BTRFS_IOC_CLONE:
@@ -2034,6 +2898,16 @@ long btrfs_ioctl(struct file *file, unsigned int
2034 case BTRFS_IOC_SYNC: 2898 case BTRFS_IOC_SYNC:
2035 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2899 btrfs_sync_fs(file->f_dentry->d_sb, 1);
2036 return 0; 2900 return 0;
2901 case BTRFS_IOC_START_SYNC:
2902 return btrfs_ioctl_start_sync(file, argp);
2903 case BTRFS_IOC_WAIT_SYNC:
2904 return btrfs_ioctl_wait_sync(file, argp);
2905 case BTRFS_IOC_SCRUB:
2906 return btrfs_ioctl_scrub(root, argp);
2907 case BTRFS_IOC_SCRUB_CANCEL:
2908 return btrfs_ioctl_scrub_cancel(root, argp);
2909 case BTRFS_IOC_SCRUB_PROGRESS:
2910 return btrfs_ioctl_scrub_progress(root, argp);
2037 } 2911 }
2038 2912
2039 return -ENOTTY; 2913 return -ENOTTY;
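
To show how the new START_SYNC/WAIT_SYNC pair added above is meant to be driven from userspace, here is a minimal sketch (illustrative only, not part of the patch): it assumes the ioctl definitions from the ioctl.h hunk below are installed, and the mount-point path is hypothetical. BTRFS_IOC_START_SYNC kicks off an async commit and reports its transid; BTRFS_IOC_WAIT_SYNC then blocks until exactly that transid is safely on disk.

/* hypothetical userspace helper, error handling trimmed */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include "ioctl.h"              /* btrfs ioctl ABI, as defined below */

int btrfs_async_sync_wait(const char *mnt)
{
        __u64 transid;
        int ret, fd = open(mnt, O_RDONLY);

        if (fd < 0)
                return -1;
        /* start the commit; returns as soon as it is queued */
        ret = ioctl(fd, BTRFS_IOC_START_SYNC, &transid);
        if (!ret)
                /* wait for the specific transaction we started */
                ret = ioctl(fd, BTRFS_IOC_WAIT_SYNC, &transid);
        close(fd);
        return ret;
}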
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 424694aa517f..ad1ea789fcb4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,14 +22,93 @@
22 22
23#define BTRFS_IOCTL_MAGIC 0x94 23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255 24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 4087
26 25
27/* this should be 4k */ 26/* this should be 4k */
27#define BTRFS_PATH_NAME_MAX 4087
28struct btrfs_ioctl_vol_args { 28struct btrfs_ioctl_vol_args {
29 __s64 fd; 29 __s64 fd;
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_FSID_SIZE 16
36#define BTRFS_UUID_SIZE 16
37
38#define BTRFS_SUBVOL_NAME_MAX 4039
39struct btrfs_ioctl_vol_args_v2 {
40 __s64 fd;
41 __u64 transid;
42 __u64 flags;
43 __u64 unused[4];
44 char name[BTRFS_SUBVOL_NAME_MAX + 1];
45};
46
47/*
48 * structure to report errors and progress to userspace, either as a
49 * result of a finished scrub, a canceled scrub or a progress inquiry
50 */
51struct btrfs_scrub_progress {
52 __u64 data_extents_scrubbed; /* # of data extents scrubbed */
53 __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */
54 __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */
55 __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */
56 __u64 read_errors; /* # of read errors encountered (EIO) */
57 __u64 csum_errors; /* # of failed csum checks */
58 __u64 verify_errors; /* # of occurrences where the metadata
59 * of a tree block did not match the
60 * expected values, like generation or
61 * logical */
62 __u64 no_csum; /* # of 4k data blocks for which no csum
63 * is present, probably the result of
64 * data written with nodatasum */
65 __u64 csum_discards; /* # of csums for which no data was found
66 * in the extent tree. */
67 __u64 super_errors; /* # of bad super blocks encountered */
68 __u64 malloc_errors; /* # of internal kmalloc errors. These
69 * will likely cause an incomplete
70 * scrub */
71 __u64 uncorrectable_errors; /* # of errors where either no intact
72 * copy was found or the writeback
73 * failed */
74 __u64 corrected_errors; /* # of errors corrected */
75 __u64 last_physical; /* last physical address scrubbed. In
76 * case a scrub was aborted, this can
77 * be used to restart the scrub */
78 __u64 unverified_errors; /* # of occurrences where a read for a
79 * full (64k) bio failed, but the re-
80 * check succeeded for each 4k piece.
81 * Intermittent error. */
82};
83
84#define BTRFS_SCRUB_READONLY 1
85struct btrfs_ioctl_scrub_args {
86 __u64 devid; /* in */
87 __u64 start; /* in */
88 __u64 end; /* in */
89 __u64 flags; /* in */
90 struct btrfs_scrub_progress progress; /* out */
91 /* pad to 1k */
92 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
93};
94
95#define BTRFS_DEVICE_PATH_NAME_MAX 1024
96struct btrfs_ioctl_dev_info_args {
97 __u64 devid; /* in/out */
98 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
99 __u64 bytes_used; /* out */
100 __u64 total_bytes; /* out */
101 __u64 unused[379]; /* pad to 4k */
102 __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */
103};
104
105struct btrfs_ioctl_fs_info_args {
106 __u64 max_id; /* out */
107 __u64 num_devices; /* out */
108 __u8 fsid[BTRFS_FSID_SIZE]; /* out */
109 __u64 reserved[124]; /* pad to 1k */
110};
111
33#define BTRFS_INO_LOOKUP_PATH_MAX 4080 112#define BTRFS_INO_LOOKUP_PATH_MAX 4080
34struct btrfs_ioctl_ino_lookup_args { 113struct btrfs_ioctl_ino_lookup_args {
35 __u64 treeid; 114 __u64 treeid;
@@ -102,30 +181,6 @@ struct btrfs_ioctl_clone_range_args {
102#define BTRFS_DEFRAG_RANGE_COMPRESS 1 181#define BTRFS_DEFRAG_RANGE_COMPRESS 1
103#define BTRFS_DEFRAG_RANGE_START_IO 2 182#define BTRFS_DEFRAG_RANGE_START_IO 2
104 183
105struct btrfs_ioctl_defrag_range_args {
106 /* start of the defrag operation */
107 __u64 start;
108
109 /* number of bytes to defrag, use (u64)-1 to say all */
110 __u64 len;
111
112 /*
113 * flags for the operation, which can include turning
114 * on compression for this one defrag
115 */
116 __u64 flags;
117
118 /*
119 * any extent bigger than this will be considered
120 * already defragged. Use 0 to take the kernel default
121 * Use 1 to say every single extent must be rewritten
122 */
123 __u32 extent_thresh;
124
125 /* spare for later */
126 __u32 unused[5];
127};
128
129struct btrfs_ioctl_space_info { 184struct btrfs_ioctl_space_info {
130 __u64 flags; 185 __u64 flags;
131 __u64 total_bytes; 186 __u64 total_bytes;
@@ -178,4 +233,19 @@ struct btrfs_ioctl_space_args {
178#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) 233#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
179#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ 234#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
180 struct btrfs_ioctl_space_args) 235 struct btrfs_ioctl_space_args)
236#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
237#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
238#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
239 struct btrfs_ioctl_vol_args_v2)
240#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
241#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
242#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
243 struct btrfs_ioctl_scrub_args)
244#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28)
245#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \
246 struct btrfs_ioctl_scrub_args)
247#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \
248 struct btrfs_ioctl_dev_info_args)
249#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
250 struct btrfs_ioctl_fs_info_args)
181#endif 251#endif
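
As a usage sketch for the scrub ABI defined in this header (an illustration, not part of the patch): a read-only scrub of a single device via BTRFS_IOC_SCRUB. The ioctl blocks until the scrub finishes or is canceled, after which the embedded btrfs_scrub_progress holds the final counters. The devid value and the open descriptor on the mount point are illustrative; CAP_SYS_ADMIN is required.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include "ioctl.h"              /* definitions from this header */

int scrub_one_device(int mnt_fd, __u64 devid)
{
        struct btrfs_ioctl_scrub_args sa;

        memset(&sa, 0, sizeof(sa));
        sa.devid = devid;
        sa.start = 0;
        sa.end   = (__u64)-1;                   /* whole device */
        sa.flags = BTRFS_SCRUB_READONLY;        /* verify, never rewrite */

        if (ioctl(mnt_fd, BTRFS_IOC_SCRUB, &sa) < 0)
                return -1;

        printf("csum errors %llu, corrected %llu, uncorrectable %llu\n",
               (unsigned long long)sa.progress.csum_errors,
               (unsigned long long)sa.progress.corrected_errors,
               (unsigned long long)sa.progress.uncorrectable_errors);
        return 0;
}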
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 6151f2ea38bb..66fa43dc3f0f 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -185,31 +185,6 @@ sleep:
185 return 0; 185 return 0;
186} 186}
187 187
188/*
189 * Very quick trylock, this does not spin or schedule. It returns
190 * 1 with the spinlock held if it was able to take the lock, or it
191 * returns zero if it was unable to take the lock.
192 *
193 * After this call, scheduling is not safe without first calling
194 * btrfs_set_lock_blocking()
195 */
196int btrfs_try_tree_lock(struct extent_buffer *eb)
197{
198 if (spin_trylock(&eb->lock)) {
199 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
200 /*
201 * we've got the spinlock, but the real owner is
202 * blocking. Drop the spinlock and return failure
203 */
204 spin_unlock(&eb->lock);
205 return 0;
206 }
207 return 1;
208 }
209 /* someone else has the spinlock giveup */
210 return 0;
211}
212
213int btrfs_tree_unlock(struct extent_buffer *eb) 188int btrfs_tree_unlock(struct extent_buffer *eb)
214{ 189{
215 /* 190 /*
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 6c4ce457168c..5c33a560a2f1 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,8 +21,6 @@
21 21
22int btrfs_tree_lock(struct extent_buffer *eb); 22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 23int btrfs_tree_unlock(struct extent_buffer *eb);
24
25int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_try_spin_lock(struct extent_buffer *eb); 24int btrfs_try_spin_lock(struct extent_buffer *eb);
27 25
28void btrfs_set_lock_blocking(struct extent_buffer *eb); 26void btrfs_set_lock_blocking(struct extent_buffer *eb);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000000..a178f5ebea78
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,427 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/slab.h>
21#include <linux/vmalloc.h>
22#include <linux/init.h>
23#include <linux/err.h>
24#include <linux/sched.h>
25#include <linux/pagemap.h>
26#include <linux/bio.h>
27#include <linux/lzo.h>
28#include "compression.h"
29
30#define LZO_LEN 4
31
32struct workspace {
33 void *mem;
34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where compressed data goes */
36 struct list_head list;
37};
38
39static void lzo_free_workspace(struct list_head *ws)
40{
41 struct workspace *workspace = list_entry(ws, struct workspace, list);
42
43 vfree(workspace->buf);
44 vfree(workspace->cbuf);
45 vfree(workspace->mem);
46 kfree(workspace);
47}
48
49static struct list_head *lzo_alloc_workspace(void)
50{
51 struct workspace *workspace;
52
53 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
54 if (!workspace)
55 return ERR_PTR(-ENOMEM);
56
57 workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
58 workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
59 workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
60 if (!workspace->mem || !workspace->buf || !workspace->cbuf)
61 goto fail;
62
63 INIT_LIST_HEAD(&workspace->list);
64
65 return &workspace->list;
66fail:
67 lzo_free_workspace(&workspace->list);
68 return ERR_PTR(-ENOMEM);
69}
70
71static inline void write_compress_length(char *buf, size_t len)
72{
73 __le32 dlen;
74
75 dlen = cpu_to_le32(len);
76 memcpy(buf, &dlen, LZO_LEN);
77}
78
79static inline size_t read_compress_length(char *buf)
80{
81 __le32 dlen;
82
83 memcpy(&dlen, buf, LZO_LEN);
84 return le32_to_cpu(dlen);
85}
86
87static int lzo_compress_pages(struct list_head *ws,
88 struct address_space *mapping,
89 u64 start, unsigned long len,
90 struct page **pages,
91 unsigned long nr_dest_pages,
92 unsigned long *out_pages,
93 unsigned long *total_in,
94 unsigned long *total_out,
95 unsigned long max_out)
96{
97 struct workspace *workspace = list_entry(ws, struct workspace, list);
98 int ret = 0;
99 char *data_in;
100 char *cpage_out;
101 int nr_pages = 0;
102 struct page *in_page = NULL;
103 struct page *out_page = NULL;
104 unsigned long bytes_left;
105
106 size_t in_len;
107 size_t out_len;
108 char *buf;
109 unsigned long tot_in = 0;
110 unsigned long tot_out = 0;
111 unsigned long pg_bytes_left;
112 unsigned long out_offset;
113 unsigned long bytes;
114
115 *out_pages = 0;
116 *total_out = 0;
117 *total_in = 0;
118
119 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
120 data_in = kmap(in_page);
121
122 /*
123 * store the size of all chunks of compressed data in
124 * the first 4 bytes
125 */
126 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
127 if (out_page == NULL) {
128 ret = -ENOMEM;
129 goto out;
130 }
131 cpage_out = kmap(out_page);
132 out_offset = LZO_LEN;
133 tot_out = LZO_LEN;
134 pages[0] = out_page;
135 nr_pages = 1;
136 pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
137
138 /* compress at most one page of data each time */
139 in_len = min(len, PAGE_CACHE_SIZE);
140 while (tot_in < len) {
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) {
144 printk(KERN_DEBUG "btrfs lzo compress in loop returned %d\n",
145 ret);
146 ret = -1;
147 goto out;
148 }
149
150 /* store the size of this chunk of compressed data */
151 write_compress_length(cpage_out + out_offset, out_len);
152 tot_out += LZO_LEN;
153 out_offset += LZO_LEN;
154 pg_bytes_left -= LZO_LEN;
155
156 tot_in += in_len;
157 tot_out += out_len;
158
159 /* copy bytes from the working buffer into the pages */
160 buf = workspace->cbuf;
161 while (out_len) {
162 bytes = min_t(unsigned long, pg_bytes_left, out_len);
163
164 memcpy(cpage_out + out_offset, buf, bytes);
165
166 out_len -= bytes;
167 pg_bytes_left -= bytes;
168 buf += bytes;
169 out_offset += bytes;
170
171 /*
172 * we need another page for writing out.
173 *
174 * Note if there's less than 4 bytes left, we just
175 * skip to a new page.
176 */
177 if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
178 pg_bytes_left == 0) {
179 if (pg_bytes_left) {
180 memset(cpage_out + out_offset, 0,
181 pg_bytes_left);
182 tot_out += pg_bytes_left;
183 }
184
185 /* we're done, don't allocate new page */
186 if (out_len == 0 && tot_in >= len)
187 break;
188
189 kunmap(out_page);
190 if (nr_pages == nr_dest_pages) {
191 out_page = NULL;
192 ret = -1;
193 goto out;
194 }
195
196 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
197 if (out_page == NULL) {
198 ret = -ENOMEM;
199 goto out;
200 }
201 cpage_out = kmap(out_page);
202 pages[nr_pages++] = out_page;
203
204 pg_bytes_left = PAGE_CACHE_SIZE;
205 out_offset = 0;
206 }
207 }
208
209 /* we're making it bigger, give up */
210 if (tot_in > 8192 && tot_in < tot_out)
211 goto out;
212
213 /* we're all done */
214 if (tot_in >= len)
215 break;
216
217 if (tot_out > max_out)
218 break;
219
220 bytes_left = len - tot_in;
221 kunmap(in_page);
222 page_cache_release(in_page);
223
224 start += PAGE_CACHE_SIZE;
225 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
226 data_in = kmap(in_page);
227 in_len = min(bytes_left, PAGE_CACHE_SIZE);
228 }
229
230 if (tot_out > tot_in)
231 goto out;
232
233 /* store the size of all chunks of compressed data */
234 cpage_out = kmap(pages[0]);
235 write_compress_length(cpage_out, tot_out);
236
237 kunmap(pages[0]);
238
239 ret = 0;
240 *total_out = tot_out;
241 *total_in = tot_in;
242out:
243 *out_pages = nr_pages;
244 if (out_page)
245 kunmap(out_page);
246
247 if (in_page) {
248 kunmap(in_page);
249 page_cache_release(in_page);
250 }
251
252 return ret;
253}
254
255static int lzo_decompress_biovec(struct list_head *ws,
256 struct page **pages_in,
257 u64 disk_start,
258 struct bio_vec *bvec,
259 int vcnt,
260 size_t srclen)
261{
262 struct workspace *workspace = list_entry(ws, struct workspace, list);
263 int ret = 0, ret2;
264 char *data_in;
265 unsigned long page_in_index = 0;
266 unsigned long page_out_index = 0;
267 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
268 PAGE_CACHE_SIZE;
269 unsigned long buf_start;
270 unsigned long buf_offset = 0;
271 unsigned long bytes;
272 unsigned long working_bytes;
273 unsigned long pg_offset;
274
275 size_t in_len;
276 size_t out_len;
277 unsigned long in_offset;
278 unsigned long in_page_bytes_left;
279 unsigned long tot_in;
280 unsigned long tot_out;
281 unsigned long tot_len;
282 char *buf;
283 bool may_late_unmap, need_unmap;
284
285 data_in = kmap(pages_in[0]);
286 tot_len = read_compress_length(data_in);
287
288 tot_in = LZO_LEN;
289 in_offset = LZO_LEN;
290 tot_len = min_t(size_t, srclen, tot_len);
291 in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
292
293 tot_out = 0;
294 pg_offset = 0;
295
296 while (tot_in < tot_len) {
297 in_len = read_compress_length(data_in + in_offset);
298 in_page_bytes_left -= LZO_LEN;
299 in_offset += LZO_LEN;
300 tot_in += LZO_LEN;
301
302 tot_in += in_len;
303 working_bytes = in_len;
304 may_late_unmap = need_unmap = false;
305
306 /* fast path: avoid using the working buffer */
307 if (in_page_bytes_left >= in_len) {
308 buf = data_in + in_offset;
309 bytes = in_len;
310 may_late_unmap = true;
311 goto cont;
312 }
313
314 /* copy bytes from the pages into the working buffer */
315 buf = workspace->cbuf;
316 buf_offset = 0;
317 while (working_bytes) {
318 bytes = min(working_bytes, in_page_bytes_left);
319
320 memcpy(buf + buf_offset, data_in + in_offset, bytes);
321 buf_offset += bytes;
322cont:
323 working_bytes -= bytes;
324 in_page_bytes_left -= bytes;
325 in_offset += bytes;
326
327 /* check if we need to pick another page */
328 if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
329 || in_page_bytes_left == 0) {
330 tot_in += in_page_bytes_left;
331
332 if (working_bytes == 0 && tot_in >= tot_len)
333 break;
334
335 if (page_in_index + 1 >= total_pages_in) {
336 ret = -1;
337 goto done;
338 }
339
340 if (may_late_unmap)
341 need_unmap = true;
342 else
343 kunmap(pages_in[page_in_index]);
344
345 data_in = kmap(pages_in[++page_in_index]);
346
347 in_page_bytes_left = PAGE_CACHE_SIZE;
348 in_offset = 0;
349 }
350 }
351
352 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
353 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
354 &out_len);
355 if (need_unmap)
356 kunmap(pages_in[page_in_index - 1]);
357 if (ret != LZO_E_OK) {
358 printk(KERN_WARNING "btrfs decompress failed\n");
359 ret = -1;
360 break;
361 }
362
363 buf_start = tot_out;
364 tot_out += out_len;
365
366 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
367 tot_out, disk_start,
368 bvec, vcnt,
369 &page_out_index, &pg_offset);
370 if (ret2 == 0)
371 break;
372 }
373done:
374 kunmap(pages_in[page_in_index]);
375 return ret;
376}
377
378static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
379 struct page *dest_page,
380 unsigned long start_byte,
381 size_t srclen, size_t destlen)
382{
383 struct workspace *workspace = list_entry(ws, struct workspace, list);
384 size_t in_len;
385 size_t out_len;
386 size_t tot_len;
387 int ret = 0;
388 char *kaddr;
389 unsigned long bytes;
390
391 BUG_ON(srclen < LZO_LEN);
392
393 tot_len = read_compress_length(data_in);
394 data_in += LZO_LEN;
395
396 in_len = read_compress_length(data_in);
397 data_in += LZO_LEN;
398
399 out_len = PAGE_CACHE_SIZE;
400 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
401 if (ret != LZO_E_OK) {
402 printk(KERN_WARNING "btrfs decompress failed!\n");
403 ret = -1;
404 goto out;
405 }
406
407 if (out_len < start_byte) {
408 ret = -1;
409 goto out;
410 }
411
412 bytes = min_t(unsigned long, destlen, out_len - start_byte);
413
414 kaddr = kmap_atomic(dest_page, KM_USER0);
415 memcpy(kaddr, workspace->buf + start_byte, bytes);
416 kunmap_atomic(kaddr, KM_USER0);
417out:
418 return ret;
419}
420
421struct btrfs_compress_op btrfs_lzo_compress = {
422 .alloc_workspace = lzo_alloc_workspace,
423 .free_workspace = lzo_free_workspace,
424 .compress_pages = lzo_compress_pages,
425 .decompress_biovec = lzo_decompress_biovec,
426 .decompress = lzo_decompress,
427};
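
The new file above defines btrfs' on-disk LZO framing: a 4-byte little-endian total length, followed by repeated (4-byte little-endian segment length, segment bytes), with segments laid out so a length header never straddles a page. A sketch of the same walk over an already-contiguous buffer (illustrative only: the kernel code must kmap page by page, and the page-boundary zero padding it emits is ignored here):

#include <stdint.h>
#include <stddef.h>

static uint32_t lzo_le32(const unsigned char *p)
{
        return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
               ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* returns the number of compressed segments, or -1 on bad framing */
int walk_lzo_frames(const unsigned char *buf, size_t buflen)
{
        uint32_t tot_len;
        size_t off = 4;
        int segs = 0;

        if (buflen < 4)
                return -1;
        tot_len = lzo_le32(buf);        /* includes every length header */
        if (tot_len > buflen)
                return -1;

        while (off + 4 <= tot_len) {
                uint32_t seg_len = lzo_le32(buf + off);

                off += 4;
                if (seg_len == 0 || off + seg_len > tot_len)
                        return -1;
                /* lzo1x_decompress_safe(buf + off, seg_len, ...) here */
                off += seg_len;
                segs++;
        }
        return segs;
}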
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e56c72bc5add..a1c940425307 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
141 u64 file_offset) 141 u64 file_offset)
142{ 142{
143 struct rb_root *root = &tree->tree; 143 struct rb_root *root = &tree->tree;
144 struct rb_node *prev; 144 struct rb_node *prev = NULL;
145 struct rb_node *ret; 145 struct rb_node *ret;
146 struct btrfs_ordered_extent *entry; 146 struct btrfs_ordered_extent *entry;
147 147
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
172 */ 172 */
173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 u64 start, u64 len, u64 disk_len, 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio) 175 int type, int dio, int compress_type)
176{ 176{
177 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
178 struct rb_node *node; 178 struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
189 entry->disk_len = disk_len; 189 entry->disk_len = disk_len;
190 entry->bytes_left = len; 190 entry->bytes_left = len;
191 entry->inode = inode; 191 entry->inode = inode;
192 entry->compress_type = compress_type;
192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 193 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
193 set_bit(type, &entry->flags); 194 set_bit(type, &entry->flags);
194 195
@@ -201,6 +202,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
201 INIT_LIST_HEAD(&entry->list); 202 INIT_LIST_HEAD(&entry->list);
202 INIT_LIST_HEAD(&entry->root_extent_list); 203 INIT_LIST_HEAD(&entry->root_extent_list);
203 204
205 trace_btrfs_ordered_extent_add(inode, entry);
206
204 spin_lock(&tree->lock); 207 spin_lock(&tree->lock);
205 node = tree_insert(&tree->tree, file_offset, 208 node = tree_insert(&tree->tree, file_offset,
206 &entry->rb_node); 209 &entry->rb_node);
@@ -220,14 +223,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type) 223 u64 start, u64 len, u64 disk_len, int type)
221{ 224{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 225 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0); 226 disk_len, type, 0,
227 BTRFS_COMPRESS_NONE);
224} 228}
225 229
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 230int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type) 231 u64 start, u64 len, u64 disk_len, int type)
228{ 232{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 233 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1); 234 disk_len, type, 1,
235 BTRFS_COMPRESS_NONE);
236}
237
238int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
239 u64 start, u64 len, u64 disk_len,
240 int type, int compress_type)
241{
242 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
243 disk_len, type, 0,
244 compress_type);
231} 245}
232 246
233/* 247/*
@@ -250,6 +264,73 @@ int btrfs_add_ordered_sum(struct inode *inode,
250 264
251/* 265/*
252 * this is used to account for finished IO across a given range 266 * this is used to account for finished IO across a given range
267 * of the file. The IO may span ordered extents. If
268 * a given ordered_extent is completely done, 1 is returned, otherwise
269 * 0.
270 *
271 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
272 * to make sure this function only returns 1 once for a given ordered extent.
273 *
274 * file_offset is updated to one byte past the range that is recorded as
275 * complete. This allows you to walk forward in the file.
276 */
277int btrfs_dec_test_first_ordered_pending(struct inode *inode,
278 struct btrfs_ordered_extent **cached,
279 u64 *file_offset, u64 io_size)
280{
281 struct btrfs_ordered_inode_tree *tree;
282 struct rb_node *node;
283 struct btrfs_ordered_extent *entry = NULL;
284 int ret;
285 u64 dec_end;
286 u64 dec_start;
287 u64 to_dec;
288
289 tree = &BTRFS_I(inode)->ordered_tree;
290 spin_lock(&tree->lock);
291 node = tree_search(tree, *file_offset);
292 if (!node) {
293 ret = 1;
294 goto out;
295 }
296
297 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
298 if (!offset_in_entry(entry, *file_offset)) {
299 ret = 1;
300 goto out;
301 }
302
303 dec_start = max(*file_offset, entry->file_offset);
304 dec_end = min(*file_offset + io_size, entry->file_offset +
305 entry->len);
306 *file_offset = dec_end;
307 if (dec_start > dec_end) {
308 printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
309 (unsigned long long)dec_start,
310 (unsigned long long)dec_end);
311 }
312 to_dec = dec_end - dec_start;
313 if (to_dec > entry->bytes_left) {
314 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
315 (unsigned long long)entry->bytes_left,
316 (unsigned long long)to_dec);
317 }
318 entry->bytes_left -= to_dec;
319 if (entry->bytes_left == 0)
320 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
321 else
322 ret = 1;
323out:
324 if (!ret && cached && entry) {
325 *cached = entry;
326 atomic_inc(&entry->refs);
327 }
328 spin_unlock(&tree->lock);
329 return ret == 0;
330}
331
332/*
333 * this is used to account for finished IO across a given range
253 * of the file. The IO should not span ordered extents. If 334 * of the file. The IO should not span ordered extents. If
254 * a given ordered_extent is completely done, 1 is returned, otherwise 335 * a given ordered_extent is completely done, 1 is returned, otherwise
255 * 0. 336 * 0.
@@ -308,6 +389,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
308 struct list_head *cur; 389 struct list_head *cur;
309 struct btrfs_ordered_sum *sum; 390 struct btrfs_ordered_sum *sum;
310 391
392 trace_btrfs_ordered_extent_put(entry->inode, entry);
393
311 if (atomic_dec_and_test(&entry->refs)) { 394 if (atomic_dec_and_test(&entry->refs)) {
312 while (!list_empty(&entry->list)) { 395 while (!list_empty(&entry->list)) {
313 cur = entry->list.next; 396 cur = entry->list.next;
@@ -341,6 +424,8 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
341 spin_lock(&root->fs_info->ordered_extent_lock); 424 spin_lock(&root->fs_info->ordered_extent_lock);
342 list_del_init(&entry->root_extent_list); 425 list_del_init(&entry->root_extent_list);
343 426
427 trace_btrfs_ordered_extent_remove(inode, entry);
428
344 /* 429 /*
345 * we have no more ordered extents for this inode and 430 * we have no more ordered extents for this inode and
346 * no dirty pages. We can safely remove it from the 431 * no dirty pages. We can safely remove it from the
@@ -506,6 +591,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
506 u64 start = entry->file_offset; 591 u64 start = entry->file_offset;
507 u64 end = start + entry->len - 1; 592 u64 end = start + entry->len - 1;
508 593
594 trace_btrfs_ordered_extent_start(inode, entry);
595
509 /* 596 /*
510 * pages in the range can be dirty, clean or writeback. We 597 * pages in the range can be dirty, clean or writeback. We
511 * start IO on any dirty ones so the wait doesn't stall waiting 598 * start IO on any dirty ones so the wait doesn't stall waiting
@@ -526,7 +613,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
526{ 613{
527 u64 end; 614 u64 end;
528 u64 orig_end; 615 u64 orig_end;
529 u64 wait_end;
530 struct btrfs_ordered_extent *ordered; 616 struct btrfs_ordered_extent *ordered;
531 int found; 617 int found;
532 618
@@ -537,7 +623,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
537 if (orig_end > INT_LIMIT(loff_t)) 623 if (orig_end > INT_LIMIT(loff_t))
538 orig_end = INT_LIMIT(loff_t); 624 orig_end = INT_LIMIT(loff_t);
539 } 625 }
540 wait_end = orig_end;
541again: 626again:
542 /* start IO across the range first to instantiate any delalloc 627 /* start IO across the range first to instantiate any delalloc
543 * extents 628 * extents
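
The caller pattern the new btrfs_dec_test_first_ordered_pending helper enables, sketched in kernel style (illustrative, modeled loosely on an endio path; finish_ordered() is a hypothetical stand-in for the caller's completion work). Because the helper advances *file_offset itself, the loop walks forward across every ordered extent the IO spanned; it assumes the range is fully covered by ordered extents, which the write path arranges:

static void finish_completed_range(struct inode *inode, u64 start, u64 len)
{
        u64 cur = start;

        while (cur < start + len) {
                struct btrfs_ordered_extent *ordered = NULL;

                if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
                                                         &cur,
                                                         start + len - cur)) {
                        /* this ordered extent is now fully written */
                        finish_ordered(inode, ordered); /* caller's work */
                        btrfs_put_ordered_extent(ordered);
                }
        }
}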
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8ac365492a3f..ff1f69aa1883 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
68 68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70 70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ 71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
93 /* flags (described above) */ 93 /* flags (described above) */
94 unsigned long flags; 94 unsigned long flags;
95 95
96 /* compression algorithm */
97 int compress_type;
98
96 /* reference count */ 99 /* reference count */
97 atomic_t refs; 100 atomic_t refs;
98 101
@@ -141,10 +144,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
141int btrfs_dec_test_ordered_pending(struct inode *inode, 144int btrfs_dec_test_ordered_pending(struct inode *inode,
142 struct btrfs_ordered_extent **cached, 145 struct btrfs_ordered_extent **cached,
143 u64 file_offset, u64 io_size); 146 u64 file_offset, u64 io_size);
147int btrfs_dec_test_first_ordered_pending(struct inode *inode,
148 struct btrfs_ordered_extent **cached,
149 u64 *file_offset, u64 io_size);
144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 150int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
145 u64 start, u64 len, u64 disk_len, int type); 151 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type); 153 u64 start, u64 len, u64 disk_len, int type);
154int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
155 u64 start, u64 len, u64 disk_len,
156 int type, int compress_type);
148int btrfs_add_ordered_sum(struct inode *inode, 157int btrfs_add_ordered_sum(struct inode *inode,
149 struct btrfs_ordered_extent *entry, 158 struct btrfs_ordered_extent *entry,
150 struct btrfs_ordered_sum *sum); 159 struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 79cba5fbc28e..f8be250963a0 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
56 return -ENOMEM; 56 return -ENOMEM;
57 57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret) 59 if (ret < 0)
60 goto out; 60 goto out;
61 if (ret) {
62 ret = -ENOENT;
63 goto out;
64 }
61 65
62 ret = btrfs_del_item(trans, root, path); 66 ret = btrfs_del_item(trans, root, path);
63 67
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0d126be22b63..fb2605d998e9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
260#else 260#else
261 BUG(); 261 BUG();
262#endif 262#endif
263 break;
263 case BTRFS_BLOCK_GROUP_ITEM_KEY: 264 case BTRFS_BLOCK_GROUP_ITEM_KEY:
264 bi = btrfs_item_ptr(l, i, 265 bi = btrfs_item_ptr(l, i,
265 struct btrfs_block_group_item); 266 struct btrfs_block_group_item);
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index a97314cf6bd6..82d569cb6267 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -23,56 +23,6 @@
23#include "ref-cache.h" 23#include "ref-cache.h"
24#include "transaction.h" 24#include "transaction.h"
25 25
26/*
27 * leaf refs are used to cache the information about which extents
28 * a given leaf has references on. This allows us to process that leaf
29 * in btrfs_drop_snapshot without needing to read it back from disk.
30 */
31
32/*
33 * kmalloc a leaf reference struct and update the counters for the
34 * total ref cache size
35 */
36struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
37 int nr_extents)
38{
39 struct btrfs_leaf_ref *ref;
40 size_t size = btrfs_leaf_ref_size(nr_extents);
41
42 ref = kmalloc(size, GFP_NOFS);
43 if (ref) {
44 spin_lock(&root->fs_info->ref_cache_lock);
45 root->fs_info->total_ref_cache_size += size;
46 spin_unlock(&root->fs_info->ref_cache_lock);
47
48 memset(ref, 0, sizeof(*ref));
49 atomic_set(&ref->usage, 1);
50 INIT_LIST_HEAD(&ref->list);
51 }
52 return ref;
53}
54
55/*
56 * free a leaf reference struct and update the counters for the
57 * total ref cache size
58 */
59void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
60{
61 if (!ref)
62 return;
63 WARN_ON(atomic_read(&ref->usage) == 0);
64 if (atomic_dec_and_test(&ref->usage)) {
65 size_t size = btrfs_leaf_ref_size(ref->nritems);
66
67 BUG_ON(ref->in_tree);
68 kfree(ref);
69
70 spin_lock(&root->fs_info->ref_cache_lock);
71 root->fs_info->total_ref_cache_size -= size;
72 spin_unlock(&root->fs_info->ref_cache_lock);
73 }
74}
75
76static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 26static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
77 struct rb_node *node) 27 struct rb_node *node)
78{ 28{
@@ -116,117 +66,3 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
116 } 66 }
117 return NULL; 67 return NULL;
118} 68}
119
120int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
121 int shared)
122{
123 struct btrfs_leaf_ref *ref = NULL;
124 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
125
126 if (shared)
127 tree = &root->fs_info->shared_ref_tree;
128 if (!tree)
129 return 0;
130
131 spin_lock(&tree->lock);
132 while (!list_empty(&tree->list)) {
133 ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
134 BUG_ON(ref->tree != tree);
135 if (ref->root_gen > max_root_gen)
136 break;
137 if (!xchg(&ref->in_tree, 0)) {
138 cond_resched_lock(&tree->lock);
139 continue;
140 }
141
142 rb_erase(&ref->rb_node, &tree->root);
143 list_del_init(&ref->list);
144
145 spin_unlock(&tree->lock);
146 btrfs_free_leaf_ref(root, ref);
147 cond_resched();
148 spin_lock(&tree->lock);
149 }
150 spin_unlock(&tree->lock);
151 return 0;
152}
153
154/*
155 * find the leaf ref for a given extent. This returns the ref struct with
156 * a usage reference incremented
157 */
158struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
159 u64 bytenr)
160{
161 struct rb_node *rb;
162 struct btrfs_leaf_ref *ref = NULL;
163 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
164again:
165 if (tree) {
166 spin_lock(&tree->lock);
167 rb = tree_search(&tree->root, bytenr);
168 if (rb)
169 ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
170 if (ref)
171 atomic_inc(&ref->usage);
172 spin_unlock(&tree->lock);
173 if (ref)
174 return ref;
175 }
176 if (tree != &root->fs_info->shared_ref_tree) {
177 tree = &root->fs_info->shared_ref_tree;
178 goto again;
179 }
180 return NULL;
181}
182
183/*
184 * add a fully filled in leaf ref struct
185 * remove all the refs older than a given root generation
186 */
187int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
188 int shared)
189{
190 int ret = 0;
191 struct rb_node *rb;
192 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
193
194 if (shared)
195 tree = &root->fs_info->shared_ref_tree;
196
197 spin_lock(&tree->lock);
198 rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
199 if (rb) {
200 ret = -EEXIST;
201 } else {
202 atomic_inc(&ref->usage);
203 ref->tree = tree;
204 ref->in_tree = 1;
205 list_add_tail(&ref->list, &tree->list);
206 }
207 spin_unlock(&tree->lock);
208 return ret;
209}
210
211/*
212 * remove a single leaf ref from the tree. This drops the ref held by the tree
213 * only
214 */
215int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
216{
217 struct btrfs_leaf_ref_tree *tree;
218
219 if (!xchg(&ref->in_tree, 0))
220 return 0;
221
222 tree = ref->tree;
223 spin_lock(&tree->lock);
224
225 rb_erase(&ref->rb_node, &tree->root);
226 list_del_init(&ref->list);
227
228 spin_unlock(&tree->lock);
229
230 btrfs_free_leaf_ref(root, ref);
231 return 0;
232}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index e2a55cb2072b..24f7001f6387 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -49,28 +49,4 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents)
49 return sizeof(struct btrfs_leaf_ref) + 49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents; 50 sizeof(struct btrfs_extent_info) * nr_extents;
51} 51}
52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{
55 tree->root = RB_ROOT;
56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock);
58}
59
60static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
61{
62 return RB_EMPTY_ROOT(&tree->root);
63}
64
65void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
66struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
67 int nr_extents);
68void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
69struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
70 u64 bytenr);
71int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
72 int shared);
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76#endif 52#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b37d723b9d4a..5e0a3dc79a45 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -29,6 +29,8 @@
29#include "locking.h" 29#include "locking.h"
30#include "btrfs_inode.h" 30#include "btrfs_inode.h"
31#include "async-thread.h" 31#include "async-thread.h"
32#include "free-space-cache.h"
33#include "inode-map.h"
32 34
33/* 35/*
34 * backref_node, mapping_node and tree_block start with this 36 * backref_node, mapping_node and tree_block start with this
@@ -178,8 +180,6 @@ struct reloc_control {
178 u64 search_start; 180 u64 search_start;
179 u64 extents_found; 181 u64 extents_found;
180 182
181 int block_rsv_retries;
182
183 unsigned int stage:8; 183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1; 184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1; 185 unsigned int merge_reloc_tree:1;
@@ -508,6 +508,7 @@ static int update_backref_cache(struct btrfs_trans_handle *trans,
508 return 1; 508 return 1;
509} 509}
510 510
511
511static int should_ignore_root(struct btrfs_root *root) 512static int should_ignore_root(struct btrfs_root *root)
512{ 513{
513 struct btrfs_root *reloc_root; 514 struct btrfs_root *reloc_root;
@@ -530,7 +531,6 @@ static int should_ignore_root(struct btrfs_root *root)
530 */ 531 */
531 return 1; 532 return 1;
532} 533}
533
534/* 534/*
535 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
536 */ 536 */
@@ -677,6 +677,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
677 err = -ENOMEM; 677 err = -ENOMEM;
678 goto out; 678 goto out;
679 } 679 }
680 path1->reada = 1;
681 path2->reada = 2;
680 682
681 node = alloc_backref_node(cache); 683 node = alloc_backref_node(cache);
682 if (!node) { 684 if (!node) {
@@ -710,7 +712,7 @@ again:
710 WARN_ON(cur->checked); 712 WARN_ON(cur->checked);
711 if (!list_empty(&cur->upper)) { 713 if (!list_empty(&cur->upper)) {
712 /* 714 /*
713 * the backref was added previously when processsing 715 * the backref was added previously when processing
714 * backref of type BTRFS_TREE_BLOCK_REF_KEY 716 * backref of type BTRFS_TREE_BLOCK_REF_KEY
715 */ 717 */
716 BUG_ON(!list_is_singular(&cur->upper)); 718 BUG_ON(!list_is_singular(&cur->upper));
@@ -962,7 +964,7 @@ again:
962 lower = upper; 964 lower = upper;
963 upper = NULL; 965 upper = NULL;
964 } 966 }
965 btrfs_release_path(root, path2); 967 btrfs_release_path(path2);
966next: 968next:
967 if (ptr < end) { 969 if (ptr < end) {
968 ptr += btrfs_extent_inline_ref_size(key.type); 970 ptr += btrfs_extent_inline_ref_size(key.type);
@@ -975,7 +977,7 @@ next:
975 if (ptr >= end) 977 if (ptr >= end)
976 path1->slots[0]++; 978 path1->slots[0]++;
977 } 979 }
978 btrfs_release_path(rc->extent_root, path1); 980 btrfs_release_path(path1);
979 981
980 cur->checked = 1; 982 cur->checked = 1;
981 WARN_ON(exist); 983 WARN_ON(exist);
@@ -1158,6 +1160,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1158 new_node->bytenr = dest->node->start; 1160 new_node->bytenr = dest->node->start;
1159 new_node->level = node->level; 1161 new_node->level = node->level;
1160 new_node->lowest = node->lowest; 1162 new_node->lowest = node->lowest;
1163 new_node->checked = 1;
1161 new_node->root = dest; 1164 new_node->root = dest;
1162 1165
1163 if (!node->lowest) { 1166 if (!node->lowest) {
@@ -1365,7 +1368,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1365 int ret; 1368 int ret;
1366 1369
1367 if (!root->reloc_root) 1370 if (!root->reloc_root)
1368 return 0; 1371 goto out;
1369 1372
1370 reloc_root = root->reloc_root; 1373 reloc_root = root->reloc_root;
1371 root_item = &reloc_root->root_item; 1374 root_item = &reloc_root->root_item;
@@ -1387,6 +1390,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1387 ret = btrfs_update_root(trans, root->fs_info->tree_root, 1390 ret = btrfs_update_root(trans, root->fs_info->tree_root,
1388 &reloc_root->root_key, root_item); 1391 &reloc_root->root_key, root_item);
1389 BUG_ON(ret); 1392 BUG_ON(ret);
1393
1394out:
1390 return 0; 1395 return 0;
1391} 1396}
1392 1397
@@ -1409,9 +1414,9 @@ again:
1409 prev = node; 1414 prev = node;
1410 entry = rb_entry(node, struct btrfs_inode, rb_node); 1415 entry = rb_entry(node, struct btrfs_inode, rb_node);
1411 1416
1412 if (objectid < entry->vfs_inode.i_ino) 1417 if (objectid < btrfs_ino(&entry->vfs_inode))
1413 node = node->rb_left; 1418 node = node->rb_left;
1414 else if (objectid > entry->vfs_inode.i_ino) 1419 else if (objectid > btrfs_ino(&entry->vfs_inode))
1415 node = node->rb_right; 1420 node = node->rb_right;
1416 else 1421 else
1417 break; 1422 break;
@@ -1419,7 +1424,7 @@ again:
1419 if (!node) { 1424 if (!node) {
1420 while (prev) { 1425 while (prev) {
1421 entry = rb_entry(prev, struct btrfs_inode, rb_node); 1426 entry = rb_entry(prev, struct btrfs_inode, rb_node);
1422 if (objectid <= entry->vfs_inode.i_ino) { 1427 if (objectid <= btrfs_ino(&entry->vfs_inode)) {
1423 node = prev; 1428 node = prev;
1424 break; 1429 break;
1425 } 1430 }
@@ -1434,7 +1439,7 @@ again:
1434 return inode; 1439 return inode;
1435 } 1440 }
1436 1441
1437 objectid = entry->vfs_inode.i_ino + 1; 1442 objectid = btrfs_ino(&entry->vfs_inode) + 1;
1438 if (cond_resched_lock(&root->inode_lock)) 1443 if (cond_resched_lock(&root->inode_lock))
1439 goto again; 1444 goto again;
1440 1445
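
The three hunks above convert find_next_inode()'s red-black tree walk from raw inode->i_ino comparisons to btrfs_ino(). The walk itself is a lower-bound search: return the cached inode with the smallest key greater than or equal to objectid. A self-contained sketch of that search on a toy binary search tree (editorial illustration; struct node, mk() and lower_bound() are invented names, not kernel APIs):

#include <stdio.h>
#include <stdlib.h>

struct node { unsigned long long key; struct node *l, *r; };

/* Smallest key >= x: remember the last node we stepped left from. */
static struct node *lower_bound(struct node *root, unsigned long long x)
{
        struct node *best = NULL;

        while (root) {
                if (x <= root->key) {
                        best = root;    /* candidate; a smaller match may sit left */
                        root = root->l;
                } else {
                        root = root->r;
                }
        }
        return best;
}

static struct node *mk(unsigned long long k, struct node *l, struct node *r)
{
        struct node *n = malloc(sizeof(*n));
        n->key = k;
        n->l = l;
        n->r = r;
        return n;
}

int main(void)
{
        struct node *t = mk(10, mk(5, NULL, NULL), mk(20, NULL, NULL));
        struct node *hit = lower_bound(t, 7);

        printf("%llu\n", hit ? hit->key : 0ULL);        /* prints 10 */
        return 0;
}

The kernel variant records the previous node during descent and then scans forward with rb_next(), but it computes the same lower bound.
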
@@ -1470,7 +1475,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1470 return -ENOMEM; 1475 return -ENOMEM;
1471 1476
1472 bytenr -= BTRFS_I(reloc_inode)->index_cnt; 1477 bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1473 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, 1478 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
1474 bytenr, 0); 1479 bytenr, 0);
1475 if (ret < 0) 1480 if (ret < 0)
1476 goto out; 1481 goto out;
@@ -1558,11 +1563,11 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
1558 if (first) { 1563 if (first) {
1559 inode = find_next_inode(root, key.objectid); 1564 inode = find_next_inode(root, key.objectid);
1560 first = 0; 1565 first = 0;
1561 } else if (inode && inode->i_ino < key.objectid) { 1566 } else if (inode && btrfs_ino(inode) < key.objectid) {
1562 btrfs_add_delayed_iput(inode); 1567 btrfs_add_delayed_iput(inode);
1563 inode = find_next_inode(root, key.objectid); 1568 inode = find_next_inode(root, key.objectid);
1564 } 1569 }
1565 if (inode && inode->i_ino == key.objectid) { 1570 if (inode && btrfs_ino(inode) == key.objectid) {
1566 end = key.offset + 1571 end = key.offset +
1567 btrfs_file_extent_num_bytes(leaf, fi); 1572 btrfs_file_extent_num_bytes(leaf, fi);
1568 WARN_ON(!IS_ALIGNED(key.offset, 1573 WARN_ON(!IS_ALIGNED(key.offset,
@@ -1724,6 +1729,7 @@ again:
1724 1729
1725 eb = read_tree_block(dest, old_bytenr, blocksize, 1730 eb = read_tree_block(dest, old_bytenr, blocksize,
1726 old_ptr_gen); 1731 old_ptr_gen);
1732 BUG_ON(!eb);
1727 btrfs_tree_lock(eb); 1733 btrfs_tree_lock(eb);
1728 if (cow) { 1734 if (cow) {
1729 ret = btrfs_cow_block(trans, dest, eb, parent, 1735 ret = btrfs_cow_block(trans, dest, eb, parent,
@@ -1748,7 +1754,7 @@ again:
1748 1754
1749 btrfs_node_key_to_cpu(path->nodes[level], &key, 1755 btrfs_node_key_to_cpu(path->nodes[level], &key,
1750 path->slots[level]); 1756 path->slots[level]);
1751 btrfs_release_path(src, path); 1757 btrfs_release_path(path);
1752 1758
1753 path->lowest_level = level; 1759 path->lowest_level = level;
1754 ret = btrfs_search_slot(trans, src, &key, path, 0, 1); 1760 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
@@ -1892,6 +1898,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1892 struct inode *inode = NULL; 1898 struct inode *inode = NULL;
1893 u64 objectid; 1899 u64 objectid;
1894 u64 start, end; 1900 u64 start, end;
1901 u64 ino;
1895 1902
1896 objectid = min_key->objectid; 1903 objectid = min_key->objectid;
1897 while (1) { 1904 while (1) {
@@ -1904,17 +1911,18 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1904 inode = find_next_inode(root, objectid); 1911 inode = find_next_inode(root, objectid);
1905 if (!inode) 1912 if (!inode)
1906 break; 1913 break;
1914 ino = btrfs_ino(inode);
1907 1915
1908 if (inode->i_ino > max_key->objectid) { 1916 if (ino > max_key->objectid) {
1909 iput(inode); 1917 iput(inode);
1910 break; 1918 break;
1911 } 1919 }
1912 1920
1913 objectid = inode->i_ino + 1; 1921 objectid = ino + 1;
1914 if (!S_ISREG(inode->i_mode)) 1922 if (!S_ISREG(inode->i_mode))
1915 continue; 1923 continue;
1916 1924
1917 if (unlikely(min_key->objectid == inode->i_ino)) { 1925 if (unlikely(min_key->objectid == ino)) {
1918 if (min_key->type > BTRFS_EXTENT_DATA_KEY) 1926 if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1919 continue; 1927 continue;
1920 if (min_key->type < BTRFS_EXTENT_DATA_KEY) 1928 if (min_key->type < BTRFS_EXTENT_DATA_KEY)
@@ -1927,7 +1935,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1927 start = 0; 1935 start = 0;
1928 } 1936 }
1929 1937
1930 if (unlikely(max_key->objectid == inode->i_ino)) { 1938 if (unlikely(max_key->objectid == ino)) {
1931 if (max_key->type < BTRFS_EXTENT_DATA_KEY) 1939 if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1932 continue; 1940 continue;
1933 if (max_key->type > BTRFS_EXTENT_DATA_KEY) { 1941 if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
@@ -1995,6 +2003,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1995 path = btrfs_alloc_path(); 2003 path = btrfs_alloc_path();
1996 if (!path) 2004 if (!path)
1997 return -ENOMEM; 2005 return -ENOMEM;
2006 path->reada = 1;
1998 2007
1999 reloc_root = root->reloc_root; 2008 reloc_root = root->reloc_root;
2000 root_item = &reloc_root->root_item; 2009 root_item = &reloc_root->root_item;
@@ -2029,6 +2038,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2029 2038
2030 while (1) { 2039 while (1) {
2031 trans = btrfs_start_transaction(root, 0); 2040 trans = btrfs_start_transaction(root, 0);
2041 BUG_ON(IS_ERR(trans));
2032 trans->block_rsv = rc->block_rsv; 2042 trans->block_rsv = rc->block_rsv;
2033 2043
2034 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
@@ -2133,29 +2143,34 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2133 LIST_HEAD(reloc_roots); 2143 LIST_HEAD(reloc_roots);
2134 u64 num_bytes = 0; 2144 u64 num_bytes = 0;
2135 int ret; 2145 int ret;
2136 int retries = 0;
2137 2146
2138 mutex_lock(&root->fs_info->trans_mutex); 2147 mutex_lock(&root->fs_info->reloc_mutex);
2139 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2148 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2140 rc->merging_rsv_size += rc->nodes_relocated * 2; 2149 rc->merging_rsv_size += rc->nodes_relocated * 2;
2141 mutex_unlock(&root->fs_info->trans_mutex); 2150 mutex_unlock(&root->fs_info->reloc_mutex);
2151
2142again: 2152again:
2143 if (!err) { 2153 if (!err) {
2144 num_bytes = rc->merging_rsv_size; 2154 num_bytes = rc->merging_rsv_size;
2145 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2146 num_bytes, &retries); 2156 num_bytes);
2147 if (ret) 2157 if (ret)
2148 err = ret; 2158 err = ret;
2149 } 2159 }
2150 2160
2151 trans = btrfs_join_transaction(rc->extent_root, 1); 2161 trans = btrfs_join_transaction(rc->extent_root);
2162 if (IS_ERR(trans)) {
2163 if (!err)
2164 btrfs_block_rsv_release(rc->extent_root,
2165 rc->block_rsv, num_bytes);
2166 return PTR_ERR(trans);
2167 }
2152 2168
2153 if (!err) { 2169 if (!err) {
2154 if (num_bytes != rc->merging_rsv_size) { 2170 if (num_bytes != rc->merging_rsv_size) {
2155 btrfs_end_transaction(trans, rc->extent_root); 2171 btrfs_end_transaction(trans, rc->extent_root);
2156 btrfs_block_rsv_release(rc->extent_root, 2172 btrfs_block_rsv_release(rc->extent_root,
2157 rc->block_rsv, num_bytes); 2173 rc->block_rsv, num_bytes);
2158 retries = 0;
2159 goto again; 2174 goto again;
2160 } 2175 }
2161 } 2176 }
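
This hunk is typical of the patch's transaction-handling changes: btrfs_join_transaction() now returns an ERR_PTR-encoded error instead of being assumed to succeed, and the caller must release the block reservation before propagating the failure. For readers unfamiliar with the idiom, here is a minimal userspace re-implementation of ERR_PTR/PTR_ERR/IS_ERR (the real macros live in include/linux/err.h; join_transaction() below is a stand-in, not the btrfs function):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

/* Encode a small negative errno in the pointer value itself. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct trans { int id; };

static struct trans *join_transaction(int simulate_failure)
{
        if (simulate_failure)
                return ERR_PTR(-ENOMEM);
        struct trans *t = malloc(sizeof(*t));
        if (!t)
                return ERR_PTR(-ENOMEM);
        t->id = 1;
        return t;
}

int main(void)
{
        struct trans *t = join_transaction(1);

        if (IS_ERR(t)) {
                /* release anything reserved earlier, then propagate */
                fprintf(stderr, "join failed: %ld\n", PTR_ERR(t));
                return 1;
        }
        free(t);
        return 0;
}

Packing the errno into the pointer lets one return channel carry both results and failures, which is why these new call sites test IS_ERR() rather than comparing against NULL.
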
@@ -2202,9 +2217,16 @@ int merge_reloc_roots(struct reloc_control *rc)
2202 int ret; 2217 int ret;
2203again: 2218again:
2204 root = rc->extent_root; 2219 root = rc->extent_root;
2205 mutex_lock(&root->fs_info->trans_mutex); 2220
2221 /*
2222 * this serializes us with btrfs_record_root_in_transaction,
2223 * we have to make sure nobody is in the middle of
2224 * adding their roots to the list while we are
2225 * doing this splice
2226 */
2227 mutex_lock(&root->fs_info->reloc_mutex);
2206 list_splice_init(&rc->reloc_roots, &reloc_roots); 2228 list_splice_init(&rc->reloc_roots, &reloc_roots);
2207 mutex_unlock(&root->fs_info->trans_mutex); 2229 mutex_unlock(&root->fs_info->reloc_mutex);
2208 2230
2209 while (!list_empty(&reloc_roots)) { 2231 while (!list_empty(&reloc_roots)) {
2210 found = 1; 2232 found = 1;
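
The new reloc_mutex-protected splice above follows a common kernel pattern: detach the whole shared list in O(1) while holding the lock, then walk the private copy with the lock dropped so producers are never blocked for long. A userspace sketch of the same drain-and-process shape (pthreads and a toy singly-linked list; none of these names are btrfs code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int val; struct node *next; };

static struct node *shared_head;        /* filled by producers under the lock */
static pthread_mutex_t shared_lock = PTHREAD_MUTEX_INITIALIZER;

static void drain_and_process(void)
{
        struct node *local;

        /* O(1) detach under the lock - the analogue of list_splice_init() */
        pthread_mutex_lock(&shared_lock);
        local = shared_head;
        shared_head = NULL;
        pthread_mutex_unlock(&shared_lock);

        /* walk the private copy with the lock dropped */
        while (local) {
                struct node *n = local;
                local = local->next;
                printf("processing %d\n", n->val);
                free(n);
        }
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct node *n = malloc(sizeof(*n));
                n->val = i;
                n->next = shared_head;
                shared_head = n;
        }
        drain_and_process();
        return 0;
}
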
@@ -2340,7 +2362,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
2340 root = next->root; 2362 root = next->root;
2341 BUG_ON(!root); 2363 BUG_ON(!root);
2342 2364
2343 /* no other choice for non-refernce counted tree */ 2365 /* no other choice for non-reference counted tree */
2344 if (!root->ref_cows) 2366 if (!root->ref_cows)
2345 return root; 2367 return root;
2346 2368
@@ -2405,15 +2427,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2405 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2406 2428
2407 trans->block_rsv = rc->block_rsv; 2429 trans->block_rsv = rc->block_rsv;
2408 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, 2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
2409 &rc->block_rsv_retries);
2410 if (ret) { 2431 if (ret) {
2411 if (ret == -EAGAIN) 2432 if (ret == -EAGAIN)
2412 rc->commit_transaction = 1; 2433 rc->commit_transaction = 1;
2413 return ret; 2434 return ret;
2414 } 2435 }
2415 2436
2416 rc->block_rsv_retries = 0;
2417 return 0; 2437 return 0;
2418} 2438}
2419 2439
@@ -2492,7 +2512,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2492 path->locks[upper->level] = 0; 2512 path->locks[upper->level] = 0;
2493 2513
2494 slot = path->slots[upper->level]; 2514 slot = path->slots[upper->level];
2495 btrfs_release_path(NULL, path); 2515 btrfs_release_path(path);
2496 } else { 2516 } else {
2497 ret = btrfs_bin_search(upper->eb, key, upper->level, 2517 ret = btrfs_bin_search(upper->eb, key, upper->level,
2498 &slot); 2518 &slot);
@@ -2510,6 +2530,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2510 blocksize = btrfs_level_size(root, node->level); 2530 blocksize = btrfs_level_size(root, node->level);
2511 generation = btrfs_node_ptr_generation(upper->eb, slot); 2531 generation = btrfs_node_ptr_generation(upper->eb, slot);
2512 eb = read_tree_block(root, bytenr, blocksize, generation); 2532 eb = read_tree_block(root, bytenr, blocksize, generation);
2533 if (!eb) {
2534 err = -EIO;
2535 goto next;
2536 }
2513 btrfs_tree_lock(eb); 2537 btrfs_tree_lock(eb);
2514 btrfs_set_lock_blocking(eb); 2538 btrfs_set_lock_blocking(eb);
2515 2539
@@ -2667,6 +2691,7 @@ static int get_tree_block_key(struct reloc_control *rc,
2667 BUG_ON(block->key_ready); 2691 BUG_ON(block->key_ready);
2668 eb = read_tree_block(rc->extent_root, block->bytenr, 2692 eb = read_tree_block(rc->extent_root, block->bytenr,
2669 block->key.objectid, block->key.offset); 2693 block->key.objectid, block->key.offset);
2694 BUG_ON(!eb);
2670 WARN_ON(btrfs_header_level(eb) != block->level); 2695 WARN_ON(btrfs_header_level(eb) != block->level);
2671 if (block->level == 0) 2696 if (block->level == 0)
2672 btrfs_item_key_to_cpu(eb, &block->key, 0); 2697 btrfs_item_key_to_cpu(eb, &block->key, 0);
@@ -2728,7 +2753,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2728 } else { 2753 } else {
2729 path->lowest_level = node->level; 2754 path->lowest_level = node->level;
2730 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2755 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2731 btrfs_release_path(root, path); 2756 btrfs_release_path(path);
2732 if (ret > 0) 2757 if (ret > 0)
2733 ret = 0; 2758 ret = 0;
2734 } 2759 }
@@ -2861,7 +2886,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2861 struct extent_map *em; 2886 struct extent_map *em;
2862 int ret = 0; 2887 int ret = 0;
2863 2888
2864 em = alloc_extent_map(GFP_NOFS); 2889 em = alloc_extent_map();
2865 if (!em) 2890 if (!em)
2866 return -ENOMEM; 2891 return -ENOMEM;
2867 2892
@@ -3099,6 +3124,8 @@ static int add_tree_block(struct reloc_control *rc,
3099 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 3124 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3100 ret = get_ref_objectid_v0(rc, path, extent_key, 3125 ret = get_ref_objectid_v0(rc, path, extent_key,
3101 &ref_owner, NULL); 3126 &ref_owner, NULL);
3127 if (ret < 0)
3128 return ret;
3102 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); 3129 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3103 level = (int)ref_owner; 3130 level = (int)ref_owner;
3104 /* FIXME: get real generation */ 3131 /* FIXME: get real generation */
@@ -3108,7 +3135,7 @@ static int add_tree_block(struct reloc_control *rc,
3108#endif 3135#endif
3109 } 3136 }
3110 3137
3111 btrfs_release_path(rc->extent_root, path); 3138 btrfs_release_path(path);
3112 3139
3113 BUG_ON(level == -1); 3140 BUG_ON(level == -1);
3114 3141
@@ -3191,6 +3218,55 @@ static int block_use_full_backref(struct reloc_control *rc,
3191 return ret; 3218 return ret;
3192} 3219}
3193 3220
3221static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3222 struct inode *inode, u64 ino)
3223{
3224 struct btrfs_key key;
3225 struct btrfs_path *path;
3226 struct btrfs_root *root = fs_info->tree_root;
3227 struct btrfs_trans_handle *trans;
3228 unsigned long nr;
3229 int ret = 0;
3230
3231 if (inode)
3232 goto truncate;
3233
3234 key.objectid = ino;
3235 key.type = BTRFS_INODE_ITEM_KEY;
3236 key.offset = 0;
3237
3238 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3239 if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) {
3240 if (inode && !IS_ERR(inode))
3241 iput(inode);
3242 return -ENOENT;
3243 }
3244
3245truncate:
3246 path = btrfs_alloc_path();
3247 if (!path) {
3248 ret = -ENOMEM;
3249 goto out;
3250 }
3251
3252 trans = btrfs_join_transaction(root);
3253 if (IS_ERR(trans)) {
3254 btrfs_free_path(path);
3255 ret = PTR_ERR(trans);
3256 goto out;
3257 }
3258
3259 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3260
3261 btrfs_free_path(path);
3262 nr = trans->blocks_used;
3263 btrfs_end_transaction(trans, root);
3264 btrfs_btree_balance_dirty(root, nr);
3265out:
3266 iput(inode);
3267 return ret;
3268}
3269
3194/* 3270/*
3195 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY 3271 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3196 * this function scans the fs tree to find blocks that reference the data extent 3272 * this function scans the fs tree to find blocks that reference the data extent
@@ -3217,15 +3293,28 @@ static int find_data_references(struct reloc_control *rc,
3217 int counted; 3293 int counted;
3218 int ret; 3294 int ret;
3219 3295
3220 path = btrfs_alloc_path();
3221 if (!path)
3222 return -ENOMEM;
3223
3224 ref_root = btrfs_extent_data_ref_root(leaf, ref); 3296 ref_root = btrfs_extent_data_ref_root(leaf, ref);
3225 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); 3297 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3226 ref_offset = btrfs_extent_data_ref_offset(leaf, ref); 3298 ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3227 ref_count = btrfs_extent_data_ref_count(leaf, ref); 3299 ref_count = btrfs_extent_data_ref_count(leaf, ref);
3228 3300
3301 /*
3302 * This is an extent belonging to the free space cache, let's just delete
3303 * it and redo the search.
3304 */
3305 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3306 ret = delete_block_group_cache(rc->extent_root->fs_info,
3307 NULL, ref_objectid);
3308 if (ret != -ENOENT)
3309 return ret;
3310 ret = 0;
3311 }
3312
3313 path = btrfs_alloc_path();
3314 if (!path)
3315 return -ENOMEM;
3316 path->reada = 1;
3317
3229 root = read_fs_root(rc->extent_root->fs_info, ref_root); 3318 root = read_fs_root(rc->extent_root->fs_info, ref_root);
3230 if (IS_ERR(root)) { 3319 if (IS_ERR(root)) {
3231 err = PTR_ERR(root); 3320 err = PTR_ERR(root);
@@ -3433,7 +3522,7 @@ int add_data_references(struct reloc_control *rc,
3433 } 3522 }
3434 path->slots[0]++; 3523 path->slots[0]++;
3435 } 3524 }
3436 btrfs_release_path(rc->extent_root, path); 3525 btrfs_release_path(path);
3437 if (err) 3526 if (err)
3438 free_block_list(blocks); 3527 free_block_list(blocks);
3439 return err; 3528 return err;
@@ -3496,7 +3585,7 @@ next:
3496 EXTENT_DIRTY); 3585 EXTENT_DIRTY);
3497 3586
3498 if (ret == 0 && start <= key.objectid) { 3587 if (ret == 0 && start <= key.objectid) {
3499 btrfs_release_path(rc->extent_root, path); 3588 btrfs_release_path(path);
3500 rc->search_start = end + 1; 3589 rc->search_start = end + 1;
3501 } else { 3590 } else {
3502 rc->search_start = key.objectid + key.offset; 3591 rc->search_start = key.objectid + key.offset;
@@ -3504,24 +3593,26 @@ next:
3504 return 0; 3593 return 0;
3505 } 3594 }
3506 } 3595 }
3507 btrfs_release_path(rc->extent_root, path); 3596 btrfs_release_path(path);
3508 return ret; 3597 return ret;
3509} 3598}
3510 3599
3511static void set_reloc_control(struct reloc_control *rc) 3600static void set_reloc_control(struct reloc_control *rc)
3512{ 3601{
3513 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3602 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3514 mutex_lock(&fs_info->trans_mutex); 3603
3604 mutex_lock(&fs_info->reloc_mutex);
3515 fs_info->reloc_ctl = rc; 3605 fs_info->reloc_ctl = rc;
3516 mutex_unlock(&fs_info->trans_mutex); 3606 mutex_unlock(&fs_info->reloc_mutex);
3517} 3607}
3518 3608
3519static void unset_reloc_control(struct reloc_control *rc) 3609static void unset_reloc_control(struct reloc_control *rc)
3520{ 3610{
3521 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3611 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3522 mutex_lock(&fs_info->trans_mutex); 3612
3613 mutex_lock(&fs_info->reloc_mutex);
3523 fs_info->reloc_ctl = NULL; 3614 fs_info->reloc_ctl = NULL;
3524 mutex_unlock(&fs_info->trans_mutex); 3615 mutex_unlock(&fs_info->reloc_mutex);
3525} 3616}
3526 3617
3527static int check_extent_flags(u64 flags) 3618static int check_extent_flags(u64 flags)
@@ -3554,8 +3645,7 @@ int prepare_to_relocate(struct reloc_control *rc)
3554 * is no reservation in transaction handle. 3645 * is no reservation in transaction handle.
3555 */ 3646 */
3556 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3647 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3557 rc->extent_root->nodesize * 256, 3648 rc->extent_root->nodesize * 256);
3558 &rc->block_rsv_retries);
3559 if (ret) 3649 if (ret)
3560 return ret; 3650 return ret;
3561 3651
@@ -3567,12 +3657,12 @@ int prepare_to_relocate(struct reloc_control *rc)
3567 rc->extents_found = 0; 3657 rc->extents_found = 0;
3568 rc->nodes_relocated = 0; 3658 rc->nodes_relocated = 0;
3569 rc->merging_rsv_size = 0; 3659 rc->merging_rsv_size = 0;
3570 rc->block_rsv_retries = 0;
3571 3660
3572 rc->create_reloc_tree = 1; 3661 rc->create_reloc_tree = 1;
3573 set_reloc_control(rc); 3662 set_reloc_control(rc);
3574 3663
3575 trans = btrfs_join_transaction(rc->extent_root, 1); 3664 trans = btrfs_join_transaction(rc->extent_root);
3665 BUG_ON(IS_ERR(trans));
3576 btrfs_commit_transaction(trans, rc->extent_root); 3666 btrfs_commit_transaction(trans, rc->extent_root);
3577 return 0; 3667 return 0;
3578} 3668}
@@ -3589,10 +3679,12 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3589 u32 item_size; 3679 u32 item_size;
3590 int ret; 3680 int ret;
3591 int err = 0; 3681 int err = 0;
3682 int progress = 0;
3592 3683
3593 path = btrfs_alloc_path(); 3684 path = btrfs_alloc_path();
3594 if (!path) 3685 if (!path)
3595 return -ENOMEM; 3686 return -ENOMEM;
3687 path->reada = 1;
3596 3688
3597 ret = prepare_to_relocate(rc); 3689 ret = prepare_to_relocate(rc);
3598 if (ret) { 3690 if (ret) {
@@ -3601,8 +3693,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3601 } 3693 }
3602 3694
3603 while (1) { 3695 while (1) {
3696 progress++;
3604 trans = btrfs_start_transaction(rc->extent_root, 0); 3697 trans = btrfs_start_transaction(rc->extent_root, 0);
3605 3698 BUG_ON(IS_ERR(trans));
3699restart:
3606 if (update_backref_cache(trans, &rc->backref_cache)) { 3700 if (update_backref_cache(trans, &rc->backref_cache)) {
3607 btrfs_end_transaction(trans, rc->extent_root); 3701 btrfs_end_transaction(trans, rc->extent_root);
3608 continue; 3702 continue;
@@ -3639,7 +3733,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3639 flags = BTRFS_EXTENT_FLAG_DATA; 3733 flags = BTRFS_EXTENT_FLAG_DATA;
3640 3734
3641 if (path_change) { 3735 if (path_change) {
3642 btrfs_release_path(rc->extent_root, path); 3736 btrfs_release_path(path);
3643 3737
3644 path->search_commit_root = 1; 3738 path->search_commit_root = 1;
3645 path->skip_locking = 1; 3739 path->skip_locking = 1;
@@ -3662,7 +3756,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3662 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3756 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3663 ret = add_data_references(rc, &key, path, &blocks); 3757 ret = add_data_references(rc, &key, path, &blocks);
3664 } else { 3758 } else {
3665 btrfs_release_path(rc->extent_root, path); 3759 btrfs_release_path(path);
3666 ret = 0; 3760 ret = 0;
3667 } 3761 }
3668 if (ret < 0) { 3762 if (ret < 0) {
@@ -3715,8 +3809,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3715 } 3809 }
3716 } 3810 }
3717 } 3811 }
3812 if (trans && progress && err == -ENOSPC) {
3813 ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
3814 rc->block_group->flags);
3815 if (ret == 0) {
3816 err = 0;
3817 progress = 0;
3818 goto restart;
3819 }
3820 }
3718 3821
3719 btrfs_release_path(rc->extent_root, path); 3822 btrfs_release_path(path);
3720 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, 3823 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3721 GFP_NOFS); 3824 GFP_NOFS);
3722 3825
@@ -3748,8 +3851,11 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3748 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); 3851 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3749 3852
3750 /* get rid of pinned extents */ 3853 /* get rid of pinned extents */
3751 trans = btrfs_join_transaction(rc->extent_root, 1); 3854 trans = btrfs_join_transaction(rc->extent_root);
3752 btrfs_commit_transaction(trans, rc->extent_root); 3855 if (IS_ERR(trans))
3856 err = PTR_ERR(trans);
3857 else
3858 btrfs_commit_transaction(trans, rc->extent_root);
3753out_free: 3859out_free:
3754 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); 3860 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3755 btrfs_free_path(path); 3861 btrfs_free_path(path);
@@ -3781,7 +3887,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3781 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3887 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3782 BTRFS_INODE_PREALLOC); 3888 BTRFS_INODE_PREALLOC);
3783 btrfs_mark_buffer_dirty(leaf); 3889 btrfs_mark_buffer_dirty(leaf);
3784 btrfs_release_path(root, path); 3890 btrfs_release_path(path);
3785out: 3891out:
3786 btrfs_free_path(path); 3892 btrfs_free_path(path);
3787 return ret; 3893 return ret;
@@ -3811,7 +3917,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3811 if (IS_ERR(trans)) 3917 if (IS_ERR(trans))
3812 return ERR_CAST(trans); 3918 return ERR_CAST(trans);
3813 3919
3814 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3920 err = btrfs_find_free_objectid(root, &objectid);
3815 if (err) 3921 if (err)
3816 goto out; 3922 goto out;
3817 3923
@@ -3849,7 +3955,7 @@ static struct reloc_control *alloc_reloc_control(void)
3849 INIT_LIST_HEAD(&rc->reloc_roots); 3955 INIT_LIST_HEAD(&rc->reloc_roots);
3850 backref_cache_init(&rc->backref_cache); 3956 backref_cache_init(&rc->backref_cache);
3851 mapping_tree_init(&rc->reloc_root_tree); 3957 mapping_tree_init(&rc->reloc_root_tree);
3852 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); 3958 extent_io_tree_init(&rc->processed_blocks, NULL);
3853 return rc; 3959 return rc;
3854} 3960}
3855 3961
@@ -3860,6 +3966,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3860{ 3966{
3861 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3967 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3862 struct reloc_control *rc; 3968 struct reloc_control *rc;
3969 struct inode *inode;
3970 struct btrfs_path *path;
3863 int ret; 3971 int ret;
3864 int rw = 0; 3972 int rw = 0;
3865 int err = 0; 3973 int err = 0;
@@ -3882,6 +3990,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3882 rw = 1; 3990 rw = 1;
3883 } 3991 }
3884 3992
3993 path = btrfs_alloc_path();
3994 if (!path) {
3995 err = -ENOMEM;
3996 goto out;
3997 }
3998
3999 inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
4000 path);
4001 btrfs_free_path(path);
4002
4003 if (!IS_ERR(inode))
4004 ret = delete_block_group_cache(fs_info, inode, 0);
4005 else
4006 ret = PTR_ERR(inode);
4007
4008 if (ret && ret != -ENOENT) {
4009 err = ret;
4010 goto out;
4011 }
4012
3885 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 4013 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3886 if (IS_ERR(rc->data_inode)) { 4014 if (IS_ERR(rc->data_inode)) {
3887 err = PTR_ERR(rc->data_inode); 4015 err = PTR_ERR(rc->data_inode);
@@ -3945,6 +4073,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3945 int ret; 4073 int ret;
3946 4074
3947 trans = btrfs_start_transaction(root->fs_info->tree_root, 0); 4075 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4076 BUG_ON(IS_ERR(trans));
3948 4077
3949 memset(&root->root_item.drop_progress, 0, 4078 memset(&root->root_item.drop_progress, 0,
3950 sizeof(root->root_item.drop_progress)); 4079 sizeof(root->root_item.drop_progress));
@@ -3981,6 +4110,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3981 path = btrfs_alloc_path(); 4110 path = btrfs_alloc_path();
3982 if (!path) 4111 if (!path)
3983 return -ENOMEM; 4112 return -ENOMEM;
4113 path->reada = -1;
3984 4114
3985 key.objectid = BTRFS_TREE_RELOC_OBJECTID; 4115 key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3986 key.type = BTRFS_ROOT_ITEM_KEY; 4116 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -4000,7 +4130,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4000 } 4130 }
4001 leaf = path->nodes[0]; 4131 leaf = path->nodes[0];
4002 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4132 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4003 btrfs_release_path(root->fs_info->tree_root, path); 4133 btrfs_release_path(path);
4004 4134
4005 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || 4135 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4006 key.type != BTRFS_ROOT_ITEM_KEY) 4136 key.type != BTRFS_ROOT_ITEM_KEY)
@@ -4032,7 +4162,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4032 4162
4033 key.offset--; 4163 key.offset--;
4034 } 4164 }
4035 btrfs_release_path(root->fs_info->tree_root, path); 4165 btrfs_release_path(path);
4036 4166
4037 if (list_empty(&reloc_roots)) 4167 if (list_empty(&reloc_roots))
4038 goto out; 4168 goto out;
@@ -4047,7 +4177,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4047 4177
4048 set_reloc_control(rc); 4178 set_reloc_control(rc);
4049 4179
4050 trans = btrfs_join_transaction(rc->extent_root, 1); 4180 trans = btrfs_join_transaction(rc->extent_root);
4181 if (IS_ERR(trans)) {
4182 unset_reloc_control(rc);
4183 err = PTR_ERR(trans);
4184 goto out_free;
4185 }
4051 4186
4052 rc->merge_reloc_tree = 1; 4187 rc->merge_reloc_tree = 1;
4053 4188
@@ -4076,10 +4211,14 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4076 4211
4077 unset_reloc_control(rc); 4212 unset_reloc_control(rc);
4078 4213
4079 trans = btrfs_join_transaction(rc->extent_root, 1); 4214 trans = btrfs_join_transaction(rc->extent_root);
4080 btrfs_commit_transaction(trans, rc->extent_root); 4215 if (IS_ERR(trans))
4081out: 4216 err = PTR_ERR(trans);
4217 else
4218 btrfs_commit_transaction(trans, rc->extent_root);
4219out_free:
4082 kfree(rc); 4220 kfree(rc);
4221out:
4083 while (!list_empty(&reloc_roots)) { 4222 while (!list_empty(&reloc_roots)) {
4084 reloc_root = list_entry(reloc_roots.next, 4223 reloc_root = list_entry(reloc_roots.next,
4085 struct btrfs_root, root_list); 4224 struct btrfs_root, root_list);
@@ -4097,7 +4236,7 @@ out:
4097 if (IS_ERR(fs_root)) 4236 if (IS_ERR(fs_root))
4098 err = PTR_ERR(fs_root); 4237 err = PTR_ERR(fs_root);
4099 else 4238 else
4100 btrfs_orphan_cleanup(fs_root); 4239 err = btrfs_orphan_cleanup(fs_root);
4101 } 4240 }
4102 return err; 4241 return err;
4103} 4242}
@@ -4124,7 +4263,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4124 4263
4125 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; 4264 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4126 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, 4265 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
4127 disk_bytenr + len - 1, &list); 4266 disk_bytenr + len - 1, &list, 0);
4128 4267
4129 while (!list_empty(&list)) { 4268 while (!list_empty(&list)) {
4130 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 4269 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
@@ -4143,7 +4282,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4143 btrfs_add_ordered_sum(inode, ordered, sums); 4282 btrfs_add_ordered_sum(inode, ordered, sums);
4144 } 4283 }
4145 btrfs_put_ordered_extent(ordered); 4284 btrfs_put_ordered_extent(ordered);
4146 return 0; 4285 return ret;
4147} 4286}
4148 4287
4149void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, 4288void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 2d958be761c8..ebe45443de06 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -22,53 +22,6 @@
22#include "print-tree.h" 22#include "print-tree.h"
23 23
24/* 24/*
25 * search forward for a root, starting with objectid 'search_start'
26 * if a root key is found, the objectid we find is filled into 'found_objectid'
27 * and 0 is returned. < 0 is returned on error, 1 if there is nothing
28 * left in the tree.
29 */
30int btrfs_search_root(struct btrfs_root *root, u64 search_start,
31 u64 *found_objectid)
32{
33 struct btrfs_path *path;
34 struct btrfs_key search_key;
35 int ret;
36
37 root = root->fs_info->tree_root;
38 search_key.objectid = search_start;
39 search_key.type = (u8)-1;
40 search_key.offset = (u64)-1;
41
42 path = btrfs_alloc_path();
43 BUG_ON(!path);
44again:
45 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
46 if (ret < 0)
47 goto out;
48 if (ret == 0) {
49 ret = 1;
50 goto out;
51 }
52 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
53 ret = btrfs_next_leaf(root, path);
54 if (ret)
55 goto out;
56 }
57 btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
58 if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
59 search_key.offset++;
60 btrfs_release_path(root, path);
61 goto again;
62 }
63 ret = 0;
64 *found_objectid = search_key.objectid;
65
66out:
67 btrfs_free_path(path);
68 return ret;
69}
70
71/*
72 * lookup the root with the highest offset for a given objectid. The key we do 25 * lookup the root with the highest offset for a given objectid. The key we do
73 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 26 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
74 * on error. 27 * on error.
@@ -88,7 +41,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
88 search_key.offset = (u64)-1; 41 search_key.offset = (u64)-1;
89 42
90 path = btrfs_alloc_path(); 43 path = btrfs_alloc_path();
91 BUG_ON(!path); 44 if (!path)
45 return -ENOMEM;
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 46 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0) 47 if (ret < 0)
94 goto out; 48 goto out;
@@ -181,7 +135,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
181int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) 135int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
182{ 136{
183 struct btrfs_root *dead_root; 137 struct btrfs_root *dead_root;
184 struct btrfs_item *item;
185 struct btrfs_root_item *ri; 138 struct btrfs_root_item *ri;
186 struct btrfs_key key; 139 struct btrfs_key key;
187 struct btrfs_key found_key; 140 struct btrfs_key found_key;
@@ -214,7 +167,6 @@ again:
214 nritems = btrfs_header_nritems(leaf); 167 nritems = btrfs_header_nritems(leaf);
215 slot = path->slots[0]; 168 slot = path->slots[0];
216 } 169 }
217 item = btrfs_item_nr(leaf, slot);
218 btrfs_item_key_to_cpu(leaf, &key, slot); 170 btrfs_item_key_to_cpu(leaf, &key, slot);
219 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) 171 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
220 goto next; 172 goto next;
@@ -231,7 +183,7 @@ again:
231 183
232 memcpy(&found_key, &key, sizeof(key)); 184 memcpy(&found_key, &key, sizeof(key));
233 key.offset++; 185 key.offset++;
234 btrfs_release_path(root, path); 186 btrfs_release_path(path);
235 dead_root = 187 dead_root =
236 btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 188 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
237 &found_key); 189 &found_key);
@@ -293,7 +245,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
293 } 245 }
294 246
295 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 247 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
296 btrfs_release_path(tree_root, path); 248 btrfs_release_path(path);
297 249
298 if (key.objectid != BTRFS_ORPHAN_OBJECTID || 250 if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
299 key.type != BTRFS_ORPHAN_ITEM_KEY) 251 key.type != BTRFS_ORPHAN_ITEM_KEY)
@@ -334,7 +286,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
334 struct extent_buffer *leaf; 286 struct extent_buffer *leaf;
335 287
336 path = btrfs_alloc_path(); 288 path = btrfs_alloc_path();
337 BUG_ON(!path); 289 if (!path)
290 return -ENOMEM;
338 ret = btrfs_search_slot(trans, root, key, path, -1, 1); 291 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
339 if (ret < 0) 292 if (ret < 0)
340 goto out; 293 goto out;
@@ -385,18 +338,22 @@ again:
385 *sequence = btrfs_root_ref_sequence(leaf, ref); 338 *sequence = btrfs_root_ref_sequence(leaf, ref);
386 339
387 ret = btrfs_del_item(trans, tree_root, path); 340 ret = btrfs_del_item(trans, tree_root, path);
388 BUG_ON(ret); 341 if (ret) {
342 err = ret;
343 goto out;
344 }
389 } else 345 } else
390 err = -ENOENT; 346 err = -ENOENT;
391 347
392 if (key.type == BTRFS_ROOT_BACKREF_KEY) { 348 if (key.type == BTRFS_ROOT_BACKREF_KEY) {
393 btrfs_release_path(tree_root, path); 349 btrfs_release_path(path);
394 key.objectid = ref_id; 350 key.objectid = ref_id;
395 key.type = BTRFS_ROOT_REF_KEY; 351 key.type = BTRFS_ROOT_REF_KEY;
396 key.offset = root_id; 352 key.offset = root_id;
397 goto again; 353 goto again;
398 } 354 }
399 355
356out:
400 btrfs_free_path(path); 357 btrfs_free_path(path);
401 return err; 358 return err;
402} 359}
@@ -463,7 +420,7 @@ again:
463 btrfs_mark_buffer_dirty(leaf); 420 btrfs_mark_buffer_dirty(leaf);
464 421
465 if (key.type == BTRFS_ROOT_BACKREF_KEY) { 422 if (key.type == BTRFS_ROOT_BACKREF_KEY) {
466 btrfs_release_path(tree_root, path); 423 btrfs_release_path(path);
467 key.objectid = ref_id; 424 key.objectid = ref_id;
468 key.type = BTRFS_ROOT_REF_KEY; 425 key.type = BTRFS_ROOT_REF_KEY;
469 key.offset = root_id; 426 key.offset = root_id;
@@ -473,3 +430,21 @@ again:
473 btrfs_free_path(path); 430 btrfs_free_path(path);
474 return 0; 431 return 0;
475} 432}
433
434/*
435 * Old btrfs versions forget to init root_item->flags and root_item->byte_limit
436 * for subvolumes. To work around this problem, we steal a bit from
437 * root_item->inode_item->flags, and use it to indicate if those fields
438 * have been properly initialized.
439 */
440void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
441{
442 u64 inode_flags = le64_to_cpu(root_item->inode.flags);
443
444 if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
445 inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
446 root_item->inode.flags = cpu_to_le64(inode_flags);
447 root_item->flags = 0;
448 root_item->byte_limit = 0;
449 }
450}
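
btrfs_check_and_init_root_item() repairs the stale fields lazily by dedicating one flag bit as an "already initialized" marker. The pattern in isolation (host-endian fields and an illustrative bit value; the real code works on the little-endian on-disk inode item and uses BTRFS_INODE_ROOT_ITEM_INIT):

#include <stdint.h>
#include <stdio.h>

#define ITEM_INIT_FLAG (1ULL << 31)     /* illustrative bit, not the real value */

struct item {
        uint64_t inode_flags;   /* written by both old and new code */
        uint64_t flags;         /* may hold garbage from old writers */
        uint64_t byte_limit;    /* may hold garbage from old writers */
};

static void check_and_init(struct item *it)
{
        /* marker clear: the other fields were never initialized */
        if (!(it->inode_flags & ITEM_INIT_FLAG)) {
                it->inode_flags |= ITEM_INIT_FLAG;
                it->flags = 0;
                it->byte_limit = 0;
        }
}

int main(void)
{
        struct item it = { .inode_flags = 0, .flags = 0xdead, .byte_limit = 7 };

        check_and_init(&it);    /* repairs once; later calls are no-ops */
        printf("flags=%llu byte_limit=%llu\n",
               (unsigned long long)it.flags, (unsigned long long)it.byte_limit);
        return 0;
}
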
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
new file mode 100644
index 000000000000..a8d03d5efb5d
--- /dev/null
+++ b/fs/btrfs/scrub.c
@@ -0,0 +1,1395 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include "ctree.h"
21#include "volumes.h"
22#include "disk-io.h"
23#include "ordered-data.h"
24
25/*
26 * This is only the first step towards a full-featured scrub. It reads all
27 * extents and super blocks and verifies the checksums. In case a bad checksum
28 * is found or the extent cannot be read, good data will be written back if
29 * any can be found.
30 *
31 * Future enhancements:
32 * - To enhance performance, better read-ahead strategies for the
33 * extent-tree can be employed.
34 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them
36 * - In case of a read error on files with nodatasum, map the file and read
37 * the extent to trigger a writeback of the good copy
38 * - track and record media errors, throw out bad devices
39 * - add a mode to also read unallocated space
40 * - make the prefetch cancellable
41 */
42
43struct scrub_bio;
44struct scrub_page;
45struct scrub_dev;
46static void scrub_bio_end_io(struct bio *bio, int err);
47static void scrub_checksum(struct btrfs_work *work);
48static int scrub_checksum_data(struct scrub_dev *sdev,
49 struct scrub_page *spag, void *buffer);
50static int scrub_checksum_tree_block(struct scrub_dev *sdev,
51 struct scrub_page *spag, u64 logical,
52 void *buffer);
53static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
54static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
55static void scrub_fixup_end_io(struct bio *bio, int err);
56static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
57 struct page *page);
58static void scrub_fixup(struct scrub_bio *sbio, int ix);
59
60#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */
61#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */
62
63struct scrub_page {
64 u64 flags; /* extent flags */
65 u64 generation;
66 u64 mirror_num;
67 int have_csum;
68 u8 csum[BTRFS_CSUM_SIZE];
69};
70
71struct scrub_bio {
72 int index;
73 struct scrub_dev *sdev;
74 struct bio *bio;
75 int err;
76 u64 logical;
77 u64 physical;
78 struct scrub_page spag[SCRUB_PAGES_PER_BIO];
79 u64 count;
80 int next_free;
81 struct btrfs_work work;
82};
83
84struct scrub_dev {
85 struct scrub_bio *bios[SCRUB_BIOS_PER_DEV];
86 struct btrfs_device *dev;
87 int first_free;
88 int curr;
89 atomic_t in_flight;
90 spinlock_t list_lock;
91 wait_queue_head_t list_wait;
92 u16 csum_size;
93 struct list_head csum_list;
94 atomic_t cancel_req;
95 int readonly;
96 /*
97 * statistics
98 */
99 struct btrfs_scrub_progress stat;
100 spinlock_t stat_lock;
101};
102
103static void scrub_free_csums(struct scrub_dev *sdev)
104{
105 while (!list_empty(&sdev->csum_list)) {
106 struct btrfs_ordered_sum *sum;
107 sum = list_first_entry(&sdev->csum_list,
108 struct btrfs_ordered_sum, list);
109 list_del(&sum->list);
110 kfree(sum);
111 }
112}
113
114static void scrub_free_bio(struct bio *bio)
115{
116 int i;
117 struct page *last_page = NULL;
118
119 if (!bio)
120 return;
121
122 for (i = 0; i < bio->bi_vcnt; ++i) {
123 if (bio->bi_io_vec[i].bv_page == last_page)
124 continue;
125 last_page = bio->bi_io_vec[i].bv_page;
126 __free_page(last_page);
127 }
128 bio_put(bio);
129}
130
131static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
132{
133 int i;
134
135 if (!sdev)
136 return;
137
138 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
139 struct scrub_bio *sbio = sdev->bios[i];
140
141 if (!sbio)
142 break;
143
144 scrub_free_bio(sbio->bio);
145 kfree(sbio);
146 }
147
148 scrub_free_csums(sdev);
149 kfree(sdev);
150}
151
152static noinline_for_stack
153struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
154{
155 struct scrub_dev *sdev;
156 int i;
157 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
158
159 sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
160 if (!sdev)
161 goto nomem;
162 sdev->dev = dev;
163 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
164 struct scrub_bio *sbio;
165
166 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
167 if (!sbio)
168 goto nomem;
169 sdev->bios[i] = sbio;
170
171 sbio->index = i;
172 sbio->sdev = sdev;
173 sbio->count = 0;
174 sbio->work.func = scrub_checksum;
175
176 if (i != SCRUB_BIOS_PER_DEV-1)
177 sdev->bios[i]->next_free = i + 1;
178 else
179 sdev->bios[i]->next_free = -1;
180 }
181 sdev->first_free = 0;
182 sdev->curr = -1;
183 atomic_set(&sdev->in_flight, 0);
184 atomic_set(&sdev->cancel_req, 0);
185 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
186 INIT_LIST_HEAD(&sdev->csum_list);
187
188 spin_lock_init(&sdev->list_lock);
189 spin_lock_init(&sdev->stat_lock);
190 init_waitqueue_head(&sdev->list_wait);
191 return sdev;
192
193nomem:
194 scrub_free_dev(sdev);
195 return ERR_PTR(-ENOMEM);
196}
197
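
scrub_setup_dev() threads a free list through the fixed bios[] array with integer indices (first_free plus a per-slot next_free), and scrub_checksum() later pushes finished bios back the same way under list_lock. The index-threaded free list on its own, as a runnable sketch (an array of ints stands in for the scrub_bio slots; all names are illustrative):

#include <stdio.h>

#define N 4

static int next_free[N];        /* each slot names the next free slot */
static int first_free;          /* head of the free list, -1 when empty */

static void freelist_init(void)
{
        for (int i = 0; i < N; i++)
                next_free[i] = (i != N - 1) ? i + 1 : -1;
        first_free = 0;
}

static int freelist_get(void)   /* pop a free slot, -1 if none */
{
        int i = first_free;

        if (i != -1)
                first_free = next_free[i];
        return i;
}

static void freelist_put(int i) /* push a finished slot back */
{
        next_free[i] = first_free;
        first_free = i;
}

int main(void)
{
        freelist_init();
        int a = freelist_get(), b = freelist_get();
        printf("got %d and %d\n", a, b);
        freelist_put(a);
        printf("next: %d\n", freelist_get());   /* reuses slot a */
        return 0;
}

Using indices instead of pointers keeps the list inside the preallocated array, so no allocation can fail on the put path.
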
198/*
199 * scrub_recheck_error gets called when either verification of the page
200 * failed or the bio failed to read, e.g. with EIO. In the latter case,
201 * recheck_error gets called for every page in the bio, even though only
202 * one may be bad
203 */
204static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
205{
206 if (sbio->err) {
207 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
208 (sbio->physical + ix * PAGE_SIZE) >> 9,
209 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
210 if (scrub_fixup_check(sbio, ix) == 0)
211 return;
212 }
213 }
214
215 scrub_fixup(sbio, ix);
216}
217
218static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
219{
220 int ret = 1;
221 struct page *page;
222 void *buffer;
223 u64 flags = sbio->spag[ix].flags;
224
225 page = sbio->bio->bi_io_vec[ix].bv_page;
226 buffer = kmap_atomic(page, KM_USER0);
227 if (flags & BTRFS_EXTENT_FLAG_DATA) {
228 ret = scrub_checksum_data(sbio->sdev,
229 sbio->spag + ix, buffer);
230 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
231 ret = scrub_checksum_tree_block(sbio->sdev,
232 sbio->spag + ix,
233 sbio->logical + ix * PAGE_SIZE,
234 buffer);
235 } else {
236 WARN_ON(1);
237 }
238 kunmap_atomic(buffer, KM_USER0);
239
240 return ret;
241}
242
243static void scrub_fixup_end_io(struct bio *bio, int err)
244{
245 complete((struct completion *)bio->bi_private);
246}
247
248static void scrub_fixup(struct scrub_bio *sbio, int ix)
249{
250 struct scrub_dev *sdev = sbio->sdev;
251 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
252 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
253 struct btrfs_multi_bio *multi = NULL;
254 u64 logical = sbio->logical + ix * PAGE_SIZE;
255 u64 length;
256 int i;
257 int ret;
258 DECLARE_COMPLETION_ONSTACK(complete);
259
260 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
261 (sbio->spag[ix].have_csum == 0)) {
262 /*
263 * nodatasum, don't try to fix anything
264 * FIXME: we can do better, open the inode and trigger a
265 * writeback
266 */
267 goto uncorrectable;
268 }
269
270 length = PAGE_SIZE;
271 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
272 &multi, 0);
273 if (ret || !multi || length < PAGE_SIZE) {
274 printk(KERN_ERR
275 "scrub_fixup: btrfs_map_block failed us for %llu\n",
276 (unsigned long long)logical);
277 WARN_ON(1);
278 return;
279 }
280
281 if (multi->num_stripes == 1)
282 /* there aren't any replicas */
283 goto uncorrectable;
284
285 /*
286 * first find a good copy
287 */
288 for (i = 0; i < multi->num_stripes; ++i) {
289 if (i == sbio->spag[ix].mirror_num)
290 continue;
291
292 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
293 multi->stripes[i].physical >> 9,
294 sbio->bio->bi_io_vec[ix].bv_page)) {
295 /* I/O-error, this is not a good copy */
296 continue;
297 }
298
299 if (scrub_fixup_check(sbio, ix) == 0)
300 break;
301 }
302 if (i == multi->num_stripes)
303 goto uncorrectable;
304
305 if (!sdev->readonly) {
306 /*
307 * bi_io_vec[ix].bv_page now contains good data, write it back
308 */
309 if (scrub_fixup_io(WRITE, sdev->dev->bdev,
310 (sbio->physical + ix * PAGE_SIZE) >> 9,
311 sbio->bio->bi_io_vec[ix].bv_page)) {
312 /* I/O-error, writeback failed, give up */
313 goto uncorrectable;
314 }
315 }
316
317 kfree(multi);
318 spin_lock(&sdev->stat_lock);
319 ++sdev->stat.corrected_errors;
320 spin_unlock(&sdev->stat_lock);
321
322 if (printk_ratelimit())
323 printk(KERN_ERR "btrfs: fixed up at %llu\n",
324 (unsigned long long)logical);
325 return;
326
327uncorrectable:
328 kfree(multi);
329 spin_lock(&sdev->stat_lock);
330 ++sdev->stat.uncorrectable_errors;
331 spin_unlock(&sdev->stat_lock);
332
333 if (printk_ratelimit())
334 printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
335 (unsigned long long)logical);
336}
337
338static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
339 struct page *page)
340{
341 struct bio *bio = NULL;
342 int ret;
343 DECLARE_COMPLETION_ONSTACK(complete);
344
345 bio = bio_alloc(GFP_NOFS, 1);
346 bio->bi_bdev = bdev;
347 bio->bi_sector = sector;
348 bio_add_page(bio, page, PAGE_SIZE, 0);
349 bio->bi_end_io = scrub_fixup_end_io;
350 bio->bi_private = &complete;
351 submit_bio(rw, bio);
352
353 /* this will also unplug the queue */
354 wait_for_completion(&complete);
355
356 ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
357 bio_put(bio);
358 return ret;
359}
360
361static void scrub_bio_end_io(struct bio *bio, int err)
362{
363 struct scrub_bio *sbio = bio->bi_private;
364 struct scrub_dev *sdev = sbio->sdev;
365 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
366
367 sbio->err = err;
368 sbio->bio = bio;
369
370 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
371}
372
373static void scrub_checksum(struct btrfs_work *work)
374{
375 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
376 struct scrub_dev *sdev = sbio->sdev;
377 struct page *page;
378 void *buffer;
379 int i;
380 u64 flags;
381 u64 logical;
382 int ret;
383
384 if (sbio->err) {
385 for (i = 0; i < sbio->count; ++i)
386 scrub_recheck_error(sbio, i);
387
388 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
389 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
390 sbio->bio->bi_phys_segments = 0;
391 sbio->bio->bi_idx = 0;
392
393 for (i = 0; i < sbio->count; i++) {
394 struct bio_vec *bi;
395 bi = &sbio->bio->bi_io_vec[i];
396 bi->bv_offset = 0;
397 bi->bv_len = PAGE_SIZE;
398 }
399
400 spin_lock(&sdev->stat_lock);
401 ++sdev->stat.read_errors;
402 spin_unlock(&sdev->stat_lock);
403 goto out;
404 }
405 for (i = 0; i < sbio->count; ++i) {
406 page = sbio->bio->bi_io_vec[i].bv_page;
407 buffer = kmap_atomic(page, KM_USER0);
408 flags = sbio->spag[i].flags;
409 logical = sbio->logical + i * PAGE_SIZE;
410 ret = 0;
411 if (flags & BTRFS_EXTENT_FLAG_DATA) {
412 ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
413 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
414 ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
415 logical, buffer);
416 } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
417 BUG_ON(i);
418 (void)scrub_checksum_super(sbio, buffer);
419 } else {
420 WARN_ON(1);
421 }
422 kunmap_atomic(buffer, KM_USER0);
423 if (ret)
424 scrub_recheck_error(sbio, i);
425 }
426
427out:
428 scrub_free_bio(sbio->bio);
429 sbio->bio = NULL;
430 spin_lock(&sdev->list_lock);
431 sbio->next_free = sdev->first_free;
432 sdev->first_free = sbio->index;
433 spin_unlock(&sdev->list_lock);
434 atomic_dec(&sdev->in_flight);
435 wake_up(&sdev->list_wait);
436}
437
438static int scrub_checksum_data(struct scrub_dev *sdev,
439 struct scrub_page *spag, void *buffer)
440{
441 u8 csum[BTRFS_CSUM_SIZE];
442 u32 crc = ~(u32)0;
443 int fail = 0;
444 struct btrfs_root *root = sdev->dev->dev_root;
445
446 if (!spag->have_csum)
447 return 0;
448
449 crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
450 btrfs_csum_final(crc, csum);
451 if (memcmp(csum, spag->csum, sdev->csum_size))
452 fail = 1;
453
454 spin_lock(&sdev->stat_lock);
455 ++sdev->stat.data_extents_scrubbed;
456 sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
457 if (fail)
458 ++sdev->stat.csum_errors;
459 spin_unlock(&sdev->stat_lock);
460
461 return fail;
462}
463
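
scrub_checksum_data() recomputes the checksum over the kmapped page and memcmp()s it against the csum carried in the scrub_page. btrfs data checksums are CRC32C (Castagnoli); a dependency-free userspace sketch of the same verify step (bitwise CRC32C, far slower than the kernel's table- or hardware-assisted version; the stored value below is the standard CRC-32C test vector for "123456789"):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bit-at-a-time reflected CRC32C, polynomial 0x82F63B78. */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
        const uint8_t *p = buf;

        crc = ~crc;
        while (len--) {
                crc ^= *p++;
                for (int k = 0; k < 8; k++)
                        crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1U));
        }
        return ~crc;
}

int main(void)
{
        const char data[] = "123456789";
        uint32_t stored = 0xE3069283;   /* known CRC-32C of "123456789" */
        uint32_t got = crc32c(0, data, strlen(data));

        printf("%s\n", got == stored ? "csum ok" : "csum mismatch");
        return 0;
}

The kernel plumbing differs slightly (btrfs_csum_data() seeds with ~0 and btrfs_csum_final() applies the final inversion), but the comparison against the stored csum is the same.
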
464static int scrub_checksum_tree_block(struct scrub_dev *sdev,
465 struct scrub_page *spag, u64 logical,
466 void *buffer)
467{
468 struct btrfs_header *h;
469 struct btrfs_root *root = sdev->dev->dev_root;
470 struct btrfs_fs_info *fs_info = root->fs_info;
471 u8 csum[BTRFS_CSUM_SIZE];
472 u32 crc = ~(u32)0;
473 int fail = 0;
474 int crc_fail = 0;
475
476 /*
477 * we don't use the getter functions here, as we
478 * a) don't have an extent buffer and
479 * b) the page is already kmapped
480 */
481 h = (struct btrfs_header *)buffer;
482
483 if (logical != le64_to_cpu(h->bytenr))
484 ++fail;
485
486 if (spag->generation != le64_to_cpu(h->generation))
487 ++fail;
488
489 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
490 ++fail;
491
492 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
493 BTRFS_UUID_SIZE))
494 ++fail;
495
496 crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
497 PAGE_SIZE - BTRFS_CSUM_SIZE);
498 btrfs_csum_final(crc, csum);
499 if (memcmp(csum, h->csum, sdev->csum_size))
500 ++crc_fail;
501
502 spin_lock(&sdev->stat_lock);
503 ++sdev->stat.tree_extents_scrubbed;
504 sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
505 if (crc_fail)
506 ++sdev->stat.csum_errors;
507 if (fail)
508 ++sdev->stat.verify_errors;
509 spin_unlock(&sdev->stat_lock);
510
511 return fail || crc_fail;
512}
513
514static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
515{
516 struct btrfs_super_block *s;
517 u64 logical;
518 struct scrub_dev *sdev = sbio->sdev;
519 struct btrfs_root *root = sdev->dev->dev_root;
520 struct btrfs_fs_info *fs_info = root->fs_info;
521 u8 csum[BTRFS_CSUM_SIZE];
522 u32 crc = ~(u32)0;
523 int fail = 0;
524
525 s = (struct btrfs_super_block *)buffer;
526 logical = sbio->logical;
527
528 if (logical != le64_to_cpu(s->bytenr))
529 ++fail;
530
531 if (sbio->spag[0].generation != le64_to_cpu(s->generation))
532 ++fail;
533
534 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
535 ++fail;
536
537 crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
538 PAGE_SIZE - BTRFS_CSUM_SIZE);
539 btrfs_csum_final(crc, csum);
540 if (memcmp(csum, s->csum, sbio->sdev->csum_size))
541 ++fail;
542
543 if (fail) {
544 /*
545 * if we find an error in a super block, we just report it.
546 * They will get written with the next transaction commit
547 * anyway
548 */
549 spin_lock(&sdev->stat_lock);
550 ++sdev->stat.super_errors;
551 spin_unlock(&sdev->stat_lock);
552 }
553
554 return fail;
555}
556
557static int scrub_submit(struct scrub_dev *sdev)
558{
559 struct scrub_bio *sbio;
560 struct bio *bio;
561 int i;
562
563 if (sdev->curr == -1)
564 return 0;
565
566 sbio = sdev->bios[sdev->curr];
567
568 bio = bio_alloc(GFP_NOFS, sbio->count);
569 if (!bio)
570 goto nomem;
571
572 bio->bi_private = sbio;
573 bio->bi_end_io = scrub_bio_end_io;
574 bio->bi_bdev = sdev->dev->bdev;
575 bio->bi_sector = sbio->physical >> 9;
576
577 for (i = 0; i < sbio->count; ++i) {
578 struct page *page;
579 int ret;
580
581 page = alloc_page(GFP_NOFS);
582 if (!page)
583 goto nomem;
584
585 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
586 if (!ret) {
587 __free_page(page);
588 goto nomem;
589 }
590 }
591
592 sbio->err = 0;
593 sdev->curr = -1;
594 atomic_inc(&sdev->in_flight);
595
596 submit_bio(READ, bio);
597
598 return 0;
599
600nomem:
601 scrub_free_bio(bio);
602
603 return -ENOMEM;
604}
605
606static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
607 u64 physical, u64 flags, u64 gen, u64 mirror_num,
608 u8 *csum, int force)
609{
610 struct scrub_bio *sbio;
611
612again:
613 /*
614 * grab a fresh bio or wait for one to become available
615 */
616 while (sdev->curr == -1) {
617 spin_lock(&sdev->list_lock);
618 sdev->curr = sdev->first_free;
619 if (sdev->curr != -1) {
620 sdev->first_free = sdev->bios[sdev->curr]->next_free;
621 sdev->bios[sdev->curr]->next_free = -1;
622 sdev->bios[sdev->curr]->count = 0;
623 spin_unlock(&sdev->list_lock);
624 } else {
625 spin_unlock(&sdev->list_lock);
626 wait_event(sdev->list_wait, sdev->first_free != -1);
627 }
628 }
629 sbio = sdev->bios[sdev->curr];
630 if (sbio->count == 0) {
631 sbio->physical = physical;
632 sbio->logical = logical;
633 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
634 sbio->logical + sbio->count * PAGE_SIZE != logical) {
635 int ret;
636
637 ret = scrub_submit(sdev);
638 if (ret)
639 return ret;
640 goto again;
641 }
642 sbio->spag[sbio->count].flags = flags;
643 sbio->spag[sbio->count].generation = gen;
644 sbio->spag[sbio->count].have_csum = 0;
645 sbio->spag[sbio->count].mirror_num = mirror_num;
646 if (csum) {
647 sbio->spag[sbio->count].have_csum = 1;
648 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
649 }
650 ++sbio->count;
651 if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
652 int ret;
653
654 ret = scrub_submit(sdev);
655 if (ret)
656 return ret;
657 }
658
659 return 0;
660}
661
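
scrub_page() above accumulates physically and logically contiguous pages into the current bio and submits early when the run breaks or SCRUB_PAGES_PER_BIO is reached. The accumulate-or-flush logic reduced to its skeleton (offsets stand in for pages, flush() plays the role of scrub_submit(); all names invented for illustration):

#include <stdio.h>

#define BATCH_MAX 16
#define PAGE_SZ   4096ULL

static unsigned long long batch_start;
static int batch_count;

static void flush(void)
{
        if (batch_count)
                printf("submit: %d page(s) at %llu\n", batch_count, batch_start);
        batch_count = 0;
}

/* Add one page; flush first when the run is not contiguous, and
 * flush after when the batch fills - mirroring scrub_page()'s checks. */
static void add_page(unsigned long long physical)
{
        if (batch_count &&
            batch_start + batch_count * PAGE_SZ != physical)
                flush();
        if (batch_count == 0)
                batch_start = physical;
        batch_count++;
        if (batch_count == BATCH_MAX)
                flush();
}

int main(void)
{
        add_page(0);
        add_page(PAGE_SZ);      /* contiguous: joins the batch */
        add_page(10 * PAGE_SZ); /* gap: flushes the first batch */
        flush();
        return 0;
}
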
662static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
663 u8 *csum)
664{
665 struct btrfs_ordered_sum *sum = NULL;
666 int ret = 0;
667 unsigned long i;
668 unsigned long num_sectors;
669 u32 sectorsize = sdev->dev->dev_root->sectorsize;
670
671 while (!list_empty(&sdev->csum_list)) {
672 sum = list_first_entry(&sdev->csum_list,
673 struct btrfs_ordered_sum, list);
674 if (sum->bytenr > logical)
675 return 0;
676 if (sum->bytenr + sum->len > logical)
677 break;
678
679 ++sdev->stat.csum_discards;
680 list_del(&sum->list);
681 kfree(sum);
682 sum = NULL;
683 }
684 if (!sum)
685 return 0;
686
687 num_sectors = sum->len / sectorsize;
688 for (i = 0; i < num_sectors; ++i) {
689 if (sum->sums[i].bytenr == logical) {
690 memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
691 ret = 1;
692 break;
693 }
694 }
695 if (ret && i == num_sectors - 1) {
696 list_del(&sum->list);
697 kfree(sum);
698 }
699 return ret;
700}
701
702/* scrub extent tries to collect up to 64 kB for each bio */
703static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
704 u64 physical, u64 flags, u64 gen, u64 mirror_num)
705{
706 int ret;
707 u8 csum[BTRFS_CSUM_SIZE];
708
709 while (len) {
710 u64 l = min_t(u64, len, PAGE_SIZE);
711 int have_csum = 0;
712
713 if (flags & BTRFS_EXTENT_FLAG_DATA) {
714 /* push csums to sbio */
715 have_csum = scrub_find_csum(sdev, logical, l, csum);
716 if (have_csum == 0)
717 ++sdev->stat.no_csum;
718 }
719 ret = scrub_page(sdev, logical, l, physical, flags, gen,
720 mirror_num, have_csum ? csum : NULL, 0);
721 if (ret)
722 return ret;
723 len -= l;
724 logical += l;
725 physical += l;
726 }
727 return 0;
728}
729
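To put numbers on the 64 kB comment before scrub_extent(): the extent is walked in
page-sized steps, and scrub_page() flushes a bio once SCRUB_PAGES_PER_BIO pages are
queued. A back-of-the-envelope sketch (assuming 4KiB pages and SCRUB_PAGES_PER_BIO
= 16, which is what the 64 kB figure implies; not a quote of the kernel headers):

	#include <stdio.h>

	#define SCRUB_PAGE_SIZE 4096ULL
	#define SCRUB_PAGES_PER_BIO 16	/* assumed: 64kB / 4KiB */

	int main(void)
	{
		unsigned long long len = 192 * 1024;	/* a 192kB extent */
		unsigned long long pages = len / SCRUB_PAGE_SIZE;

		/* 48 page-sized scrub_page() calls, filling 3 full bios */
		printf("%llu pages -> %llu bios\n",
		       pages, pages / SCRUB_PAGES_PER_BIO);
		return 0;
	}
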
730static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
731 struct map_lookup *map, int num, u64 base, u64 length)
732{
733 struct btrfs_path *path;
734 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
735 struct btrfs_root *root = fs_info->extent_root;
736 struct btrfs_root *csum_root = fs_info->csum_root;
737 struct btrfs_extent_item *extent;
738 struct blk_plug plug;
739 u64 flags;
740 int ret;
741 int slot;
742 int i;
743 u64 nstripes;
744 int start_stripe;
745 struct extent_buffer *l;
746 struct btrfs_key key;
747 u64 physical;
748 u64 logical;
749 u64 generation;
750 u64 mirror_num;
751
752 u64 increment = map->stripe_len;
753 u64 offset;
754
755 nstripes = length;
756 offset = 0;
757 do_div(nstripes, map->stripe_len);
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes;
773 } else {
774 increment = map->stripe_len;
775 mirror_num = 0;
776 }
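	/*
	 * e.g. for RAID10 with num_stripes = 4 and sub_stripes = 2,
	 * device num = 3 yields factor = 2, so this device starts at
	 * offset = stripe_len * (3 / 2) = stripe_len, advances by
	 * increment = 2 * stripe_len per step and scrubs copy
	 * mirror_num = 3 % 2 = 1 of each stripe
	 */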
777
778 path = btrfs_alloc_path();
779 if (!path)
780 return -ENOMEM;
781
782 path->reada = 2;
783 path->search_commit_root = 1;
784 path->skip_locking = 1;
785
786 /*
787 * find all extents for each stripe and just read them to get
788 * them into the page cache
789 * FIXME: we can do better. build a more intelligent prefetcher
790 */
791 logical = base + offset;
792 physical = map->stripes[num].physical;
793 ret = 0;
794 for (i = 0; i < nstripes; ++i) {
795 key.objectid = logical;
796 key.type = BTRFS_EXTENT_ITEM_KEY;
797 key.offset = (u64)0;
798
799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
800 if (ret < 0)
801 goto out_noplug;
802
803 /*
804 * we might miss half an extent here, but that doesn't matter,
805 * as it's only the prefetch
806 */
807 while (1) {
808 l = path->nodes[0];
809 slot = path->slots[0];
810 if (slot >= btrfs_header_nritems(l)) {
811 ret = btrfs_next_leaf(root, path);
812 if (ret == 0)
813 continue;
814 if (ret < 0)
815 goto out_noplug;
816
817 break;
818 }
819 btrfs_item_key_to_cpu(l, &key, slot);
820
821 if (key.objectid >= logical + map->stripe_len)
822 break;
823
824 path->slots[0]++;
825 }
826 btrfs_release_path(path);
827 logical += increment;
828 physical += map->stripe_len;
829 cond_resched();
830 }
831
832 /*
833 * collect all data csums for the stripe to avoid seeking during
834 * the scrub. With crc32 checksums this currently amounts to about 1MB
835 */
836 start_stripe = 0;
837 blk_start_plug(&plug);
838again:
839 logical = base + offset + start_stripe * increment;
840 for (i = start_stripe; i < nstripes; ++i) {
841 ret = btrfs_lookup_csums_range(csum_root, logical,
842 logical + map->stripe_len - 1,
843 &sdev->csum_list, 1);
844 if (ret)
845 goto out;
846
847 logical += increment;
848 cond_resched();
849 }
850 /*
851 * now find all extents for each stripe and scrub them
852 */
853 logical = base + offset + start_stripe * increment;
854 physical = map->stripes[num].physical + start_stripe * map->stripe_len;
855 ret = 0;
856 for (i = start_stripe; i < nstripes; ++i) {
857 /*
858 * canceled?
859 */
860 if (atomic_read(&fs_info->scrub_cancel_req) ||
861 atomic_read(&sdev->cancel_req)) {
862 ret = -ECANCELED;
863 goto out;
864 }
865 /*
866 * check to see if we have to pause
867 */
868 if (atomic_read(&fs_info->scrub_pause_req)) {
869 /* push queued extents */
870 scrub_submit(sdev);
871 wait_event(sdev->list_wait,
872 atomic_read(&sdev->in_flight) == 0);
873 atomic_inc(&fs_info->scrubs_paused);
874 wake_up(&fs_info->scrub_pause_wait);
875 mutex_lock(&fs_info->scrub_lock);
876 while (atomic_read(&fs_info->scrub_pause_req)) {
877 mutex_unlock(&fs_info->scrub_lock);
878 wait_event(fs_info->scrub_pause_wait,
879 atomic_read(&fs_info->scrub_pause_req) == 0);
880 mutex_lock(&fs_info->scrub_lock);
881 }
882 atomic_dec(&fs_info->scrubs_paused);
883 mutex_unlock(&fs_info->scrub_lock);
884 wake_up(&fs_info->scrub_pause_wait);
885 scrub_free_csums(sdev);
886 start_stripe = i;
887 goto again;
888 }
889
890 key.objectid = logical;
891 key.type = BTRFS_EXTENT_ITEM_KEY;
892 key.offset = (u64)0;
893
894 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
895 if (ret < 0)
896 goto out;
897 if (ret > 0) {
898 ret = btrfs_previous_item(root, path, 0,
899 BTRFS_EXTENT_ITEM_KEY);
900 if (ret < 0)
901 goto out;
902 if (ret > 0) {
903 /* there's no smaller item, so stick with the
904 * larger one */
905 btrfs_release_path(path);
906 ret = btrfs_search_slot(NULL, root, &key,
907 path, 0, 0);
908 if (ret < 0)
909 goto out;
910 }
911 }
912
913 while (1) {
914 l = path->nodes[0];
915 slot = path->slots[0];
916 if (slot >= btrfs_header_nritems(l)) {
917 ret = btrfs_next_leaf(root, path);
918 if (ret == 0)
919 continue;
920 if (ret < 0)
921 goto out;
922
923 break;
924 }
925 btrfs_item_key_to_cpu(l, &key, slot);
926
927 if (key.objectid + key.offset <= logical)
928 goto next;
929
930 if (key.objectid >= logical + map->stripe_len)
931 break;
932
933 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
934 goto next;
935
936 extent = btrfs_item_ptr(l, slot,
937 struct btrfs_extent_item);
938 flags = btrfs_extent_flags(l, extent);
939 generation = btrfs_extent_generation(l, extent);
940
941 if (key.objectid < logical &&
942 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
943 printk(KERN_ERR
944 "btrfs scrub: tree block %llu spanning "
945 "stripes, ignored. logical=%llu\n",
946 (unsigned long long)key.objectid,
947 (unsigned long long)logical);
948 goto next;
949 }
950
951 /*
952 * trim extent to this stripe
953 */
954 if (key.objectid < logical) {
955 key.offset -= logical - key.objectid;
956 key.objectid = logical;
957 }
958 if (key.objectid + key.offset >
959 logical + map->stripe_len) {
960 key.offset = logical + map->stripe_len -
961 key.objectid;
962 }
963
964 ret = scrub_extent(sdev, key.objectid, key.offset,
965 key.objectid - logical + physical,
966 flags, generation, mirror_num);
967 if (ret)
968 goto out;
969
970next:
971 path->slots[0]++;
972 }
973 btrfs_release_path(path);
974 logical += increment;
975 physical += map->stripe_len;
976 spin_lock(&sdev->stat_lock);
977 sdev->stat.last_physical = physical;
978 spin_unlock(&sdev->stat_lock);
979 }
980 /* push queued extents */
981 scrub_submit(sdev);
982
983out:
984 blk_finish_plug(&plug);
985out_noplug:
986 btrfs_free_path(path);
987 return ret < 0 ? ret : 0;
988}
989
990static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
991 u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
992{
993 struct btrfs_mapping_tree *map_tree =
994 &sdev->dev->dev_root->fs_info->mapping_tree;
995 struct map_lookup *map;
996 struct extent_map *em;
997 int i;
998 int ret = -EINVAL;
999
1000 read_lock(&map_tree->map_tree.lock);
1001 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
1002 read_unlock(&map_tree->map_tree.lock);
1003
1004 if (!em)
1005 return -EINVAL;
1006
1007 map = (struct map_lookup *)em->bdev;
1008 if (em->start != chunk_offset)
1009 goto out;
1010
1011 if (em->len < length)
1012 goto out;
1013
1014 for (i = 0; i < map->num_stripes; ++i) {
1015 if (map->stripes[i].dev == sdev->dev) {
1016 ret = scrub_stripe(sdev, map, i, chunk_offset, length);
1017 if (ret)
1018 goto out;
1019 }
1020 }
1021out:
1022 free_extent_map(em);
1023
1024 return ret;
1025}
1026
1027static noinline_for_stack
1028int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
1029{
1030 struct btrfs_dev_extent *dev_extent = NULL;
1031 struct btrfs_path *path;
1032 struct btrfs_root *root = sdev->dev->dev_root;
1033 struct btrfs_fs_info *fs_info = root->fs_info;
1034 u64 length;
1035 u64 chunk_tree;
1036 u64 chunk_objectid;
1037 u64 chunk_offset;
1038 int ret;
1039 int slot;
1040 struct extent_buffer *l;
1041 struct btrfs_key key;
1042 struct btrfs_key found_key;
1043 struct btrfs_block_group_cache *cache;
1044
1045 path = btrfs_alloc_path();
1046 if (!path)
1047 return -ENOMEM;
1048
1049 path->reada = 2;
1050 path->search_commit_root = 1;
1051 path->skip_locking = 1;
1052
1053 key.objectid = sdev->dev->devid;
1054 key.offset = 0ull;
1055 key.type = BTRFS_DEV_EXTENT_KEY;
1056
1057
1058 while (1) {
1059 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1060 if (ret < 0)
1061 break;
1062 if (ret > 0) {
1063 if (path->slots[0] >=
1064 btrfs_header_nritems(path->nodes[0])) {
1065 ret = btrfs_next_leaf(root, path);
1066 if (ret)
1067 break;
1068 }
1069 }
1070
1071 l = path->nodes[0];
1072 slot = path->slots[0];
1073
1074 btrfs_item_key_to_cpu(l, &found_key, slot);
1075
1076 if (found_key.objectid != sdev->dev->devid)
1077 break;
1078
1079 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
1080 break;
1081
1082 if (found_key.offset >= end)
1083 break;
1084
1085 if (found_key.offset < key.offset)
1086 break;
1087
1088 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1089 length = btrfs_dev_extent_length(l, dev_extent);
1090
1091 if (found_key.offset + length <= start) {
1092 key.offset = found_key.offset + length;
1093 btrfs_release_path(path);
1094 continue;
1095 }
1096
1097 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1098 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1099 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1100
1101 /*
1102 * get a reference on the corresponding block group to prevent
1103 * the chunk from going away while we scrub it
1104 */
1105 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
1106 if (!cache) {
1107 ret = -ENOENT;
1108 break;
1109 }
1110 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
1111 chunk_offset, length);
1112 btrfs_put_block_group(cache);
1113 if (ret)
1114 break;
1115
1116 key.offset = found_key.offset + length;
1117 btrfs_release_path(path);
1118 }
1119
1120 btrfs_free_path(path);
1121
1122 /*
1123 * ret can still be 1 from search_slot or next_leaf,
1124 * that's not an error
1125 */
1126 return ret < 0 ? ret : 0;
1127}
1128
1129static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
1130{
1131 int i;
1132 u64 bytenr;
1133 u64 gen;
1134 int ret;
1135 struct btrfs_device *device = sdev->dev;
1136 struct btrfs_root *root = device->dev_root;
1137
1138 gen = root->fs_info->last_trans_committed;
1139
1140 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1141 bytenr = btrfs_sb_offset(i);
1142 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
1143 break;
1144
1145 ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
1146 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
1147 if (ret)
1148 return ret;
1149 }
1150 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1151
1152 return 0;
1153}
1154
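scrub_supers() above reads every superblock copy that fits on the device. A small
sketch of where those copies live, assuming the conventional btrfs mirror layout
(BTRFS_SUPER_MIRROR_MAX = 3 and a 16KiB base shifted by 12 bits per mirror; an
assumption about the on-disk format, not taken from this patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long base = 16ULL * 1024;
		int i;

		printf("mirror 0: %llu\n", 64ULL * 1024);	/* 64KiB */
		for (i = 1; i < 3; i++)		/* 64MiB, then 256GiB */
			printf("mirror %d: %llu\n", i, base << (12 * i));
		return 0;
	}

Copies that would land past device->total_bytes are simply skipped by the loop in
scrub_supers().
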
1155/*
1156 * get a reference on fs_info->scrub_workers; start the workers if necessary
1157 */
1158static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
1159{
1160 struct btrfs_fs_info *fs_info = root->fs_info;
1161
1162 mutex_lock(&fs_info->scrub_lock);
1163 if (fs_info->scrub_workers_refcnt == 0) {
1164 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
1165 fs_info->thread_pool_size, &fs_info->generic_worker);
1166 fs_info->scrub_workers.idle_thresh = 4;
1167 btrfs_start_workers(&fs_info->scrub_workers, 1);
1168 }
1169 ++fs_info->scrub_workers_refcnt;
1170 mutex_unlock(&fs_info->scrub_lock);
1171
1172 return 0;
1173}
1174
1175static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
1176{
1177 struct btrfs_fs_info *fs_info = root->fs_info;
1178
1179 mutex_lock(&fs_info->scrub_lock);
1180 if (--fs_info->scrub_workers_refcnt == 0)
1181 btrfs_stop_workers(&fs_info->scrub_workers);
1182 WARN_ON(fs_info->scrub_workers_refcnt < 0);
1183 mutex_unlock(&fs_info->scrub_lock);
1184}
1185
1186
1187int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1188 struct btrfs_scrub_progress *progress, int readonly)
1189{
1190 struct scrub_dev *sdev;
1191 struct btrfs_fs_info *fs_info = root->fs_info;
1192 int ret;
1193 struct btrfs_device *dev;
1194
1195 if (btrfs_fs_closing(root->fs_info))
1196 return -EINVAL;
1197
1198 /*
1199 * check some assumptions
1200 */
1201 if (root->sectorsize != PAGE_SIZE ||
1202 root->sectorsize != root->leafsize ||
1203 root->sectorsize != root->nodesize) {
1204 printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
1205 return -EINVAL;
1206 }
1207
1208 ret = scrub_workers_get(root);
1209 if (ret)
1210 return ret;
1211
1212 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1213 dev = btrfs_find_device(root, devid, NULL, NULL);
1214 if (!dev || dev->missing) {
1215 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1216 scrub_workers_put(root);
1217 return -ENODEV;
1218 }
1219 mutex_lock(&fs_info->scrub_lock);
1220
1221 if (!dev->in_fs_metadata) {
1222 mutex_unlock(&fs_info->scrub_lock);
1223 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1224 scrub_workers_put(root);
1225 return -ENODEV;
1226 }
1227
1228 if (dev->scrub_device) {
1229 mutex_unlock(&fs_info->scrub_lock);
1230 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1231 scrub_workers_put(root);
1232 return -EINPROGRESS;
1233 }
1234 sdev = scrub_setup_dev(dev);
1235 if (IS_ERR(sdev)) {
1236 mutex_unlock(&fs_info->scrub_lock);
1237 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1238 scrub_workers_put(root);
1239 return PTR_ERR(sdev);
1240 }
1241 sdev->readonly = readonly;
1242 dev->scrub_device = sdev;
1243
1244 atomic_inc(&fs_info->scrubs_running);
1245 mutex_unlock(&fs_info->scrub_lock);
1246 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1247
1248 down_read(&fs_info->scrub_super_lock);
1249 ret = scrub_supers(sdev);
1250 up_read(&fs_info->scrub_super_lock);
1251
1252 if (!ret)
1253 ret = scrub_enumerate_chunks(sdev, start, end);
1254
1255 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1256
1257 atomic_dec(&fs_info->scrubs_running);
1258 wake_up(&fs_info->scrub_pause_wait);
1259
1260 if (progress)
1261 memcpy(progress, &sdev->stat, sizeof(*progress));
1262
1263 mutex_lock(&fs_info->scrub_lock);
1264 dev->scrub_device = NULL;
1265 mutex_unlock(&fs_info->scrub_lock);
1266
1267 scrub_free_dev(sdev);
1268 scrub_workers_put(root);
1269
1270 return ret;
1271}
1272
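/*
 * pause handshake: btrfs_scrub_pause() raises scrub_pause_req and then
 * waits until every running scrub has parked itself, i.e. until
 * scrubs_paused catches up with scrubs_running. The workers notice the
 * request in scrub_stripe(), flush their queued bios, bump
 * scrubs_paused and sleep until btrfs_scrub_continue() withdraws the
 * request.
 */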
1273int btrfs_scrub_pause(struct btrfs_root *root)
1274{
1275 struct btrfs_fs_info *fs_info = root->fs_info;
1276
1277 mutex_lock(&fs_info->scrub_lock);
1278 atomic_inc(&fs_info->scrub_pause_req);
1279 while (atomic_read(&fs_info->scrubs_paused) !=
1280 atomic_read(&fs_info->scrubs_running)) {
1281 mutex_unlock(&fs_info->scrub_lock);
1282 wait_event(fs_info->scrub_pause_wait,
1283 atomic_read(&fs_info->scrubs_paused) ==
1284 atomic_read(&fs_info->scrubs_running));
1285 mutex_lock(&fs_info->scrub_lock);
1286 }
1287 mutex_unlock(&fs_info->scrub_lock);
1288
1289 return 0;
1290}
1291
1292int btrfs_scrub_continue(struct btrfs_root *root)
1293{
1294 struct btrfs_fs_info *fs_info = root->fs_info;
1295
1296 atomic_dec(&fs_info->scrub_pause_req);
1297 wake_up(&fs_info->scrub_pause_wait);
1298 return 0;
1299}
1300
1301int btrfs_scrub_pause_super(struct btrfs_root *root)
1302{
1303 down_write(&root->fs_info->scrub_super_lock);
1304 return 0;
1305}
1306
1307int btrfs_scrub_continue_super(struct btrfs_root *root)
1308{
1309 up_write(&root->fs_info->scrub_super_lock);
1310 return 0;
1311}
1312
1313int btrfs_scrub_cancel(struct btrfs_root *root)
1314{
1315 struct btrfs_fs_info *fs_info = root->fs_info;
1316
1317 mutex_lock(&fs_info->scrub_lock);
1318 if (!atomic_read(&fs_info->scrubs_running)) {
1319 mutex_unlock(&fs_info->scrub_lock);
1320 return -ENOTCONN;
1321 }
1322
1323 atomic_inc(&fs_info->scrub_cancel_req);
1324 while (atomic_read(&fs_info->scrubs_running)) {
1325 mutex_unlock(&fs_info->scrub_lock);
1326 wait_event(fs_info->scrub_pause_wait,
1327 atomic_read(&fs_info->scrubs_running) == 0);
1328 mutex_lock(&fs_info->scrub_lock);
1329 }
1330 atomic_dec(&fs_info->scrub_cancel_req);
1331 mutex_unlock(&fs_info->scrub_lock);
1332
1333 return 0;
1334}
1335
1336int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
1337{
1338 struct btrfs_fs_info *fs_info = root->fs_info;
1339 struct scrub_dev *sdev;
1340
1341 mutex_lock(&fs_info->scrub_lock);
1342 sdev = dev->scrub_device;
1343 if (!sdev) {
1344 mutex_unlock(&fs_info->scrub_lock);
1345 return -ENOTCONN;
1346 }
1347 atomic_inc(&sdev->cancel_req);
1348 while (dev->scrub_device) {
1349 mutex_unlock(&fs_info->scrub_lock);
1350 wait_event(fs_info->scrub_pause_wait,
1351 dev->scrub_device == NULL);
1352 mutex_lock(&fs_info->scrub_lock);
1353 }
1354 mutex_unlock(&fs_info->scrub_lock);
1355
1356 return 0;
1357}
1358int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
1359{
1360 struct btrfs_fs_info *fs_info = root->fs_info;
1361 struct btrfs_device *dev;
1362 int ret;
1363
1364 /*
1365 * we have to hold the device_list_mutex here so the device
1366 * does not go away in cancel_dev. FIXME: find a better solution
1367 */
1368 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1369 dev = btrfs_find_device(root, devid, NULL, NULL);
1370 if (!dev) {
1371 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1372 return -ENODEV;
1373 }
1374 ret = btrfs_scrub_cancel_dev(root, dev);
1375 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1376
1377 return ret;
1378}
1379
1380int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
1381 struct btrfs_scrub_progress *progress)
1382{
1383 struct btrfs_device *dev;
1384 struct scrub_dev *sdev = NULL;
1385
1386 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1387 dev = btrfs_find_device(root, devid, NULL, NULL);
1388 if (dev)
1389 sdev = dev->scrub_device;
1390 if (sdev)
1391 memcpy(progress, &sdev->stat, sizeof(*progress));
1392 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1393
1394 return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
1395}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1776dbd8dc98..15634d4648d7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -39,7 +39,9 @@
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h>
42#include "compat.h" 43#include "compat.h"
44#include "delayed-inode.h"
43#include "ctree.h" 45#include "ctree.h"
44#include "disk-io.h" 46#include "disk-io.h"
45#include "transaction.h" 47#include "transaction.h"
@@ -52,8 +54,95 @@
52#include "export.h" 54#include "export.h"
53#include "compression.h" 55#include "compression.h"
54 56
57#define CREATE_TRACE_POINTS
58#include <trace/events/btrfs.h>
59
55static const struct super_operations btrfs_super_ops; 60static const struct super_operations btrfs_super_ops;
56 61
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16])
64{
65 char *errstr = NULL;
66
67 switch (errno) {
68 case -EIO:
69 errstr = "IO failure";
70 break;
71 case -ENOMEM:
72 errstr = "Out of memory";
73 break;
74 case -EROFS:
75 errstr = "Readonly filesystem";
76 break;
77 default:
78 if (nbuf) {
79 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
80 errstr = nbuf;
81 }
82 break;
83 }
84
85 return errstr;
86}
87
88static void __save_error_info(struct btrfs_fs_info *fs_info)
89{
90 /*
 91 * today we only save the error info in RAM; long term we'll
 92 * also write it down to the disk
93 */
94 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
95}
96
97/* NOTE:
 98 * We defer the write_super work to umount time to avoid a
 99 * deadlock, since umount holds all the locks.
100 */
101static void save_error_info(struct btrfs_fs_info *fs_info)
102{
103 __save_error_info(fs_info);
104}
105
106/* btrfs handles errors by forcing the filesystem read-only */
107static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
108{
109 struct super_block *sb = fs_info->sb;
110
111 if (sb->s_flags & MS_RDONLY)
112 return;
113
114 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
115 sb->s_flags |= MS_RDONLY;
116 printk(KERN_INFO "btrfs is forced readonly\n");
117 }
118}
119
120/*
121 * __btrfs_std_error decodes expected errors from the caller and
122 * invokes the appropriate error response.
123 */
124void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
125 unsigned int line, int errno)
126{
127 struct super_block *sb = fs_info->sb;
128 char nbuf[16];
129 const char *errstr;
130
131 /*
132 * Special case: if the error is EROFS, and we're already
133 * under MS_RDONLY, then it is safe here.
134 */
135 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
136 return;
137
138 errstr = btrfs_decode_error(fs_info, errno, nbuf);
139 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
140 sb->s_id, function, line, errstr);
141 save_error_info(fs_info);
142
143 btrfs_handle_error(fs_info);
144}
145
57static void btrfs_put_super(struct super_block *sb) 146static void btrfs_put_super(struct super_block *sb)
58{ 147{
59 struct btrfs_root *root = btrfs_sb(sb); 148 struct btrfs_root *root = btrfs_sb(sb);
@@ -61,14 +150,19 @@ static void btrfs_put_super(struct super_block *sb)
61 150
62 ret = close_ctree(root); 151 ret = close_ctree(root);
63 sb->s_fs_info = NULL; 152 sb->s_fs_info = NULL;
153
154 (void)ret; /* FIXME: need to fix VFS to return error? */
64} 155}
65 156
66enum { 157enum {
67 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, 158 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, 159 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 160 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, 161 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
71 Opt_discard, Opt_err, 162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err,
72}; 166};
73 167
74static match_table_t tokens = { 168static match_table_t tokens = {
@@ -83,7 +177,9 @@ static match_table_t tokens = {
83 {Opt_alloc_start, "alloc_start=%s"}, 177 {Opt_alloc_start, "alloc_start=%s"},
84 {Opt_thread_pool, "thread_pool=%d"}, 178 {Opt_thread_pool, "thread_pool=%d"},
85 {Opt_compress, "compress"}, 179 {Opt_compress, "compress"},
180 {Opt_compress_type, "compress=%s"},
86 {Opt_compress_force, "compress-force"}, 181 {Opt_compress_force, "compress-force"},
182 {Opt_compress_force_type, "compress-force=%s"},
87 {Opt_ssd, "ssd"}, 183 {Opt_ssd, "ssd"},
88 {Opt_ssd_spread, "ssd_spread"}, 184 {Opt_ssd_spread, "ssd_spread"},
89 {Opt_nossd, "nossd"}, 185 {Opt_nossd, "nossd"},
@@ -92,6 +188,13 @@ static match_table_t tokens = {
92 {Opt_flushoncommit, "flushoncommit"}, 188 {Opt_flushoncommit, "flushoncommit"},
93 {Opt_ratio, "metadata_ratio=%d"}, 189 {Opt_ratio, "metadata_ratio=%d"},
94 {Opt_discard, "discard"}, 190 {Opt_discard, "discard"},
191 {Opt_space_cache, "space_cache"},
192 {Opt_clear_cache, "clear_cache"},
193 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
194 {Opt_enospc_debug, "enospc_debug"},
195 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"},
95 {Opt_err, NULL}, 198 {Opt_err, NULL},
96}; 199};
97 200
@@ -106,6 +209,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
106 char *p, *num, *orig; 209 char *p, *num, *orig;
107 int intarg; 210 int intarg;
108 int ret = 0; 211 int ret = 0;
212 char *compress_type;
213 bool compress_force = false;
109 214
110 if (!options) 215 if (!options)
111 return 0; 216 return 0;
@@ -133,6 +238,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
133 break; 238 break;
134 case Opt_subvol: 239 case Opt_subvol:
135 case Opt_subvolid: 240 case Opt_subvolid:
241 case Opt_subvolrootid:
136 case Opt_device: 242 case Opt_device:
137 /* 243 /*
138 * These are parsed by btrfs_parse_early_options 244 * These are parsed by btrfs_parse_early_options
@@ -148,14 +254,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
148 btrfs_set_opt(info->mount_opt, NODATACOW); 254 btrfs_set_opt(info->mount_opt, NODATACOW);
149 btrfs_set_opt(info->mount_opt, NODATASUM); 255 btrfs_set_opt(info->mount_opt, NODATASUM);
150 break; 256 break;
151 case Opt_compress:
152 printk(KERN_INFO "btrfs: use compression\n");
153 btrfs_set_opt(info->mount_opt, COMPRESS);
154 break;
155 case Opt_compress_force: 257 case Opt_compress_force:
156 printk(KERN_INFO "btrfs: forcing compression\n"); 258 case Opt_compress_force_type:
157 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 259 compress_force = true;
260 case Opt_compress:
261 case Opt_compress_type:
262 if (token == Opt_compress ||
263 token == Opt_compress_force ||
264 strcmp(args[0].from, "zlib") == 0) {
265 compress_type = "zlib";
266 info->compress_type = BTRFS_COMPRESS_ZLIB;
267 } else if (strcmp(args[0].from, "lzo") == 0) {
268 compress_type = "lzo";
269 info->compress_type = BTRFS_COMPRESS_LZO;
270 } else {
271 ret = -EINVAL;
272 goto out;
273 }
274
158 btrfs_set_opt(info->mount_opt, COMPRESS); 275 btrfs_set_opt(info->mount_opt, COMPRESS);
276 if (compress_force) {
277 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
278 pr_info("btrfs: force %s compression\n",
279 compress_type);
280 } else
281 pr_info("btrfs: use %s compression\n",
282 compress_type);
159 break; 283 break;
160 case Opt_ssd: 284 case Opt_ssd:
161 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 285 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
@@ -235,6 +359,28 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
235 case Opt_discard: 359 case Opt_discard:
236 btrfs_set_opt(info->mount_opt, DISCARD); 360 btrfs_set_opt(info->mount_opt, DISCARD);
237 break; 361 break;
362 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break;
366 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
369 break;
370 case Opt_clear_cache:
371 printk(KERN_INFO "btrfs: force clearing of disk cache\n");
372 btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
373 break;
374 case Opt_user_subvol_rm_allowed:
375 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
376 break;
377 case Opt_enospc_debug:
378 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
379 break;
380 case Opt_defrag:
381 printk(KERN_INFO "btrfs: enabling auto defrag\n");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break;
238 case Opt_err: 384 case Opt_err:
239 printk(KERN_INFO "btrfs: unrecognized mount option " 385 printk(KERN_INFO "btrfs: unrecognized mount option "
240 "'%s'\n", p); 386 "'%s'\n", p);
@@ -257,10 +403,10 @@ out:
257 */ 403 */
258static int btrfs_parse_early_options(const char *options, fmode_t flags, 404static int btrfs_parse_early_options(const char *options, fmode_t flags,
259 void *holder, char **subvol_name, u64 *subvol_objectid, 405 void *holder, char **subvol_name, u64 *subvol_objectid,
260 struct btrfs_fs_devices **fs_devices) 406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
261{ 407{
262 substring_t args[MAX_OPT_ARGS]; 408 substring_t args[MAX_OPT_ARGS];
263 char *opts, *p; 409 char *opts, *orig, *p;
264 int error = 0; 410 int error = 0;
265 int intarg; 411 int intarg;
266 412
@@ -274,6 +420,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
274 opts = kstrdup(options, GFP_KERNEL); 420 opts = kstrdup(options, GFP_KERNEL);
275 if (!opts) 421 if (!opts)
276 return -ENOMEM; 422 return -ENOMEM;
423 orig = opts;
277 424
278 while ((p = strsep(&opts, ",")) != NULL) { 425 while ((p = strsep(&opts, ",")) != NULL) {
279 int token; 426 int token;
@@ -297,6 +444,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
297 *subvol_objectid = intarg; 444 *subvol_objectid = intarg;
298 } 445 }
299 break; 446 break;
447 case Opt_subvolrootid:
448 intarg = 0;
449 error = match_int(&args[0], &intarg);
450 if (!error) {
451 /* we want the original fs_tree */
452 if (!intarg)
453 *subvol_rootid =
454 BTRFS_FS_TREE_OBJECTID;
455 else
456 *subvol_rootid = intarg;
457 }
458 break;
300 case Opt_device: 459 case Opt_device:
301 error = btrfs_scan_one_device(match_strdup(&args[0]), 460 error = btrfs_scan_one_device(match_strdup(&args[0]),
302 flags, holder, fs_devices); 461 flags, holder, fs_devices);
@@ -309,7 +468,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
309 } 468 }
310 469
311 out_free_opts: 470 out_free_opts:
312 kfree(opts); 471 kfree(orig);
313 out: 472 out:
314 /* 473 /*
315 * If no subvolume name is specified we use the default one. Allocate 474 * If no subvolume name is specified we use the default one. Allocate
@@ -360,8 +519,10 @@ static struct dentry *get_default_root(struct super_block *sb,
360 */ 519 */
361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (IS_ERR(di)) 522 if (IS_ERR(di)) {
523 btrfs_free_path(path);
364 return ERR_CAST(di); 524 return ERR_CAST(di);
525 }
365 if (!di) { 526 if (!di) {
366 /* 527 /*
367 * Ok the default dir item isn't there. This is weird since 528 * Ok the default dir item isn't there. This is weird since
@@ -380,7 +541,7 @@ static struct dentry *get_default_root(struct super_block *sb,
380find_root: 541find_root:
381 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 542 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
382 if (IS_ERR(new_root)) 543 if (IS_ERR(new_root))
383 return ERR_PTR(PTR_ERR(new_root)); 544 return ERR_CAST(new_root);
384 545
385 if (btrfs_root_refs(&new_root->root_item) == 0) 546 if (btrfs_root_refs(&new_root->root_item) == 0)
386 return ERR_PTR(-ENOENT); 547 return ERR_PTR(-ENOENT);
@@ -436,7 +597,6 @@ static int btrfs_fill_super(struct super_block *sb,
436{ 597{
437 struct inode *inode; 598 struct inode *inode;
438 struct dentry *root_dentry; 599 struct dentry *root_dentry;
439 struct btrfs_super_block *disk_super;
440 struct btrfs_root *tree_root; 600 struct btrfs_root *tree_root;
441 struct btrfs_key key; 601 struct btrfs_key key;
442 int err; 602 int err;
@@ -444,6 +604,7 @@ static int btrfs_fill_super(struct super_block *sb,
444 sb->s_maxbytes = MAX_LFS_FILESIZE; 604 sb->s_maxbytes = MAX_LFS_FILESIZE;
445 sb->s_magic = BTRFS_SUPER_MAGIC; 605 sb->s_magic = BTRFS_SUPER_MAGIC;
446 sb->s_op = &btrfs_super_ops; 606 sb->s_op = &btrfs_super_ops;
607 sb->s_d_op = &btrfs_dentry_operations;
447 sb->s_export_op = &btrfs_export_ops; 608 sb->s_export_op = &btrfs_export_ops;
448 sb->s_xattr = btrfs_xattr_handlers; 609 sb->s_xattr = btrfs_xattr_handlers;
449 sb->s_time_gran = 1; 610 sb->s_time_gran = 1;
@@ -458,7 +619,6 @@ static int btrfs_fill_super(struct super_block *sb,
458 return PTR_ERR(tree_root); 619 return PTR_ERR(tree_root);
459 } 620 }
460 sb->s_fs_info = tree_root; 621 sb->s_fs_info = tree_root;
461 disk_super = &tree_root->fs_info->super_copy;
462 622
463 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 623 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
464 key.type = BTRFS_INODE_ITEM_KEY; 624 key.type = BTRFS_INODE_ITEM_KEY;
@@ -479,6 +639,7 @@ static int btrfs_fill_super(struct super_block *sb,
479 sb->s_root = root_dentry; 639 sb->s_root = root_dentry;
480 640
481 save_mount_options(sb, data); 641 save_mount_options(sb, data);
642 cleancache_init_fs(sb);
482 return 0; 643 return 0;
483 644
484fail_close: 645fail_close:
@@ -492,6 +653,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
492 struct btrfs_root *root = btrfs_sb(sb); 653 struct btrfs_root *root = btrfs_sb(sb);
493 int ret; 654 int ret;
494 655
656 trace_btrfs_sync_fs(wait);
657
495 if (!wait) { 658 if (!wait) {
496 filemap_flush(root->fs_info->btree_inode->i_mapping); 659 filemap_flush(root->fs_info->btree_inode->i_mapping);
497 return 0; 660 return 0;
@@ -501,6 +664,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
501 btrfs_wait_ordered_extents(root, 0, 0); 664 btrfs_wait_ordered_extents(root, 0, 0);
502 665
503 trans = btrfs_start_transaction(root, 0); 666 trans = btrfs_start_transaction(root, 0);
667 if (IS_ERR(trans))
668 return PTR_ERR(trans);
504 ret = btrfs_commit_transaction(trans, root); 669 ret = btrfs_commit_transaction(trans, root);
505 return ret; 670 return ret;
506} 671}
@@ -509,6 +674,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
509{ 674{
510 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); 675 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
511 struct btrfs_fs_info *info = root->fs_info; 676 struct btrfs_fs_info *info = root->fs_info;
677 char *compress_type;
512 678
513 if (btrfs_test_opt(root, DEGRADED)) 679 if (btrfs_test_opt(root, DEGRADED))
514 seq_puts(seq, ",degraded"); 680 seq_puts(seq, ",degraded");
@@ -527,8 +693,16 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
527 if (info->thread_pool_size != min_t(unsigned long, 693 if (info->thread_pool_size != min_t(unsigned long,
528 num_online_cpus() + 2, 8)) 694 num_online_cpus() + 2, 8))
529 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); 695 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
530 if (btrfs_test_opt(root, COMPRESS)) 696 if (btrfs_test_opt(root, COMPRESS)) {
531 seq_puts(seq, ",compress"); 697 if (info->compress_type == BTRFS_COMPRESS_ZLIB)
698 compress_type = "zlib";
699 else
700 compress_type = "lzo";
701 if (btrfs_test_opt(root, FORCE_COMPRESS))
702 seq_printf(seq, ",compress-force=%s", compress_type);
703 else
704 seq_printf(seq, ",compress=%s", compress_type);
705 }
532 if (btrfs_test_opt(root, NOSSD)) 706 if (btrfs_test_opt(root, NOSSD))
533 seq_puts(seq, ",nossd"); 707 seq_puts(seq, ",nossd");
534 if (btrfs_test_opt(root, SSD_SPREAD)) 708 if (btrfs_test_opt(root, SSD_SPREAD))
@@ -543,46 +717,74 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
543 seq_puts(seq, ",discard"); 717 seq_puts(seq, ",discard");
544 if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) 718 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
545 seq_puts(seq, ",noacl"); 719 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
725 seq_puts(seq, ",user_subvol_rm_allowed");
726 if (btrfs_test_opt(root, ENOSPC_DEBUG))
727 seq_puts(seq, ",enospc_debug");
728 if (btrfs_test_opt(root, AUTO_DEFRAG))
729 seq_puts(seq, ",autodefrag");
730 if (btrfs_test_opt(root, INODE_MAP_CACHE))
731 seq_puts(seq, ",inode_cache");
546 return 0; 732 return 0;
547} 733}
548 734
549static int btrfs_test_super(struct super_block *s, void *data) 735static int btrfs_test_super(struct super_block *s, void *data)
550{ 736{
551 struct btrfs_fs_devices *test_fs_devices = data; 737 struct btrfs_root *test_root = data;
552 struct btrfs_root *root = btrfs_sb(s); 738 struct btrfs_root *root = btrfs_sb(s);
553 739
554 return root->fs_info->fs_devices == test_fs_devices; 740 /*
741 * If this super block is going away, return false as it
742 * can't match as an existing super block.
743 */
744 if (!atomic_read(&s->s_active))
745 return 0;
746 return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
555} 747}
556 748
749static int btrfs_set_super(struct super_block *s, void *data)
750{
751 s->s_fs_info = data;
752
753 return set_anon_super(s, data);
754}
755
756
557/* 757/*
558 * Find a superblock for the given device / mount point. 758 * Find a superblock for the given device / mount point.
559 * 759 *
560 * Note: This is based on get_sb_bdev from fs/super.c with a few additions 760 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
561 * for multiple device setup. Make sure to keep it in sync. 761 * for multiple device setup. Make sure to keep it in sync.
562 */ 762 */
563static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 763static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
564 const char *dev_name, void *data, struct vfsmount *mnt) 764 const char *device_name, void *data)
565{ 765{
566 struct block_device *bdev = NULL; 766 struct block_device *bdev = NULL;
567 struct super_block *s; 767 struct super_block *s;
568 struct dentry *root; 768 struct dentry *root;
569 struct btrfs_fs_devices *fs_devices = NULL; 769 struct btrfs_fs_devices *fs_devices = NULL;
770 struct btrfs_root *tree_root = NULL;
771 struct btrfs_fs_info *fs_info = NULL;
570 fmode_t mode = FMODE_READ; 772 fmode_t mode = FMODE_READ;
571 char *subvol_name = NULL; 773 char *subvol_name = NULL;
572 u64 subvol_objectid = 0; 774 u64 subvol_objectid = 0;
775 u64 subvol_rootid = 0;
573 int error = 0; 776 int error = 0;
574 int found = 0;
575 777
576 if (!(flags & MS_RDONLY)) 778 if (!(flags & MS_RDONLY))
577 mode |= FMODE_WRITE; 779 mode |= FMODE_WRITE;
578 780
579 error = btrfs_parse_early_options(data, mode, fs_type, 781 error = btrfs_parse_early_options(data, mode, fs_type,
580 &subvol_name, &subvol_objectid, 782 &subvol_name, &subvol_objectid,
581 &fs_devices); 783 &subvol_rootid, &fs_devices);
582 if (error) 784 if (error)
583 return error; 785 return ERR_PTR(error);
584 786
585 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); 787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
586 if (error) 788 if (error)
587 goto error_free_subvol_name; 789 goto error_free_subvol_name;
588 790
@@ -595,8 +797,24 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
595 goto error_close_devices; 797 goto error_close_devices;
596 } 798 }
597 799
800 /*
801 * Setup a dummy root and fs_info for test/set super. This is because
802 * we don't actually fill this stuff out until open_ctree, but we need
803 * it for searching for existing supers, so this lets us do that and
804 * then open_ctree will properly initialize everything later.
805 */
806 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
807 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
808 if (!fs_info || !tree_root) {
809 error = -ENOMEM;
810 goto error_close_devices;
811 }
812 fs_info->tree_root = tree_root;
813 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info;
815
598 bdev = fs_devices->latest_bdev; 816 bdev = fs_devices->latest_bdev;
599 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); 817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
600 if (IS_ERR(s)) 818 if (IS_ERR(s))
601 goto error_s; 819 goto error_s;
602 820
@@ -607,12 +825,13 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
607 goto error_close_devices; 825 goto error_close_devices;
608 } 826 }
609 827
610 found = 1;
611 btrfs_close_devices(fs_devices); 828 btrfs_close_devices(fs_devices);
829 kfree(fs_info);
830 kfree(tree_root);
612 } else { 831 } else {
613 char b[BDEVNAME_SIZE]; 832 char b[BDEVNAME_SIZE];
614 833
615 s->s_flags = flags; 834 s->s_flags = flags | MS_NOSEC;
616 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
617 error = btrfs_fill_super(s, fs_devices, data, 836 error = btrfs_fill_super(s, fs_devices, data,
618 flags & MS_SILENT ? 1 : 0); 837 flags & MS_SILENT ? 1 : 0);
@@ -625,51 +844,58 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
625 s->s_flags |= MS_ACTIVE; 844 s->s_flags |= MS_ACTIVE;
626 } 845 }
627 846
628 root = get_default_root(s, subvol_objectid);
629 if (IS_ERR(root)) {
630 error = PTR_ERR(root);
631 deactivate_locked_super(s);
632 goto error;
633 }
634 /* if they gave us a subvolume name bind mount into that */ 847 /* if they gave us a subvolume name bind mount into that */
635 if (strcmp(subvol_name, ".")) { 848 if (strcmp(subvol_name, ".")) {
636 struct dentry *new_root; 849 struct dentry *new_root;
850
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
637 mutex_lock(&root->d_inode->i_mutex); 858 mutex_lock(&root->d_inode->i_mutex);
638 new_root = lookup_one_len(subvol_name, root, 859 new_root = lookup_one_len(subvol_name, root,
639 strlen(subvol_name)); 860 strlen(subvol_name));
640 mutex_unlock(&root->d_inode->i_mutex); 861 mutex_unlock(&root->d_inode->i_mutex);
641 862
642 if (IS_ERR(new_root)) { 863 if (IS_ERR(new_root)) {
864 dput(root);
643 deactivate_locked_super(s); 865 deactivate_locked_super(s);
644 error = PTR_ERR(new_root); 866 error = PTR_ERR(new_root);
645 dput(root); 867 goto error_free_subvol_name;
646 goto error_close_devices;
647 } 868 }
648 if (!new_root->d_inode) { 869 if (!new_root->d_inode) {
649 dput(root); 870 dput(root);
650 dput(new_root); 871 dput(new_root);
651 deactivate_locked_super(s); 872 deactivate_locked_super(s);
652 error = -ENXIO; 873 error = -ENXIO;
653 goto error_close_devices; 874 goto error_free_subvol_name;
654 } 875 }
655 dput(root); 876 dput(root);
656 root = new_root; 877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
657 } 885 }
658 886
659 mnt->mnt_sb = s;
660 mnt->mnt_root = root;
661
662 kfree(subvol_name); 887 kfree(subvol_name);
663 return 0; 888 return root;
664 889
665error_s: 890error_s:
666 error = PTR_ERR(s); 891 error = PTR_ERR(s);
667error_close_devices: 892error_close_devices:
668 btrfs_close_devices(fs_devices); 893 btrfs_close_devices(fs_devices);
894 kfree(fs_info);
895 kfree(tree_root);
669error_free_subvol_name: 896error_free_subvol_name:
670 kfree(subvol_name); 897 kfree(subvol_name);
671error: 898 return ERR_PTR(error);
672 return error;
673} 899}
674 900
675static int btrfs_remount(struct super_block *sb, int *flags, char *data) 901static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -709,6 +935,153 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
709 return 0; 935 return 0;
710} 936}
711 937
938/* Used to sort the devices by max_avail (descending sort) */
939static int btrfs_cmp_device_free_bytes(const void *dev_info1,
940 const void *dev_info2)
941{
942 if (((struct btrfs_device_info *)dev_info1)->max_avail >
943 ((struct btrfs_device_info *)dev_info2)->max_avail)
944 return -1;
945 else if (((struct btrfs_device_info *)dev_info1)->max_avail <
946 ((struct btrfs_device_info *)dev_info2)->max_avail)
947 return 1;
948 else
949 return 0;
950}
951
952/*
953 * sort the devices by max_avail, which holds the size of the largest
954 * free extent on each device (descending sort)
955 */
956static inline void btrfs_descending_sort_devices(
957 struct btrfs_device_info *devices,
958 size_t nr_devices)
959{
960 sort(devices, nr_devices, sizeof(struct btrfs_device_info),
961 btrfs_cmp_device_free_bytes, NULL);
962}
963
964/*
965 * Helper to calculate the free space on the devices that can be used
966 * to store file data.
967 */
968static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
969{
970 struct btrfs_fs_info *fs_info = root->fs_info;
971 struct btrfs_device_info *devices_info;
972 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
973 struct btrfs_device *device;
974 u64 skip_space;
975 u64 type;
976 u64 avail_space;
977 u64 used_space;
978 u64 min_stripe_size;
979 int min_stripes = 1;
980 int i = 0, nr_devices;
981 int ret;
982
983 nr_devices = fs_info->fs_devices->rw_devices;
984 BUG_ON(!nr_devices);
985
986 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
987 GFP_NOFS);
988 if (!devices_info)
989 return -ENOMEM;
990
991 /* calc the min stripe number for data space allocation */
992 type = btrfs_get_alloc_profile(root, 1);
993 if (type & BTRFS_BLOCK_GROUP_RAID0)
994 min_stripes = 2;
995 else if (type & BTRFS_BLOCK_GROUP_RAID1)
996 min_stripes = 2;
997 else if (type & BTRFS_BLOCK_GROUP_RAID10)
998 min_stripes = 4;
999
1000 if (type & BTRFS_BLOCK_GROUP_DUP)
1001 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
1002 else
1003 min_stripe_size = BTRFS_STRIPE_LEN;
1004
1005 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
1006 if (!device->in_fs_metadata)
1007 continue;
1008
1009 avail_space = device->total_bytes - device->bytes_used;
1010
1011 /* align with stripe_len */
1012 do_div(avail_space, BTRFS_STRIPE_LEN);
1013 avail_space *= BTRFS_STRIPE_LEN;
1014
1015 /*
1016 * In order to avoid overwriting the superblock on the drive,
1017 * btrfs starts at an offset of at least 1MB when doing chunk
1018 * allocation.
1019 */
1020 skip_space = 1024 * 1024;
1021
1022 /* user can set the offset in fs_info->alloc_start. */
1023 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
1024 device->total_bytes)
1025 skip_space = max(fs_info->alloc_start, skip_space);
1026
1027 /*
1028 * btrfs cannot use the free space in [0, skip_space - 1],
1029 * we must subtract it from the total. In order to implement
1030 * it, we account the used space in this range first.
1031 */
1032 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
1033 &used_space);
1034 if (ret) {
1035 kfree(devices_info);
1036 return ret;
1037 }
1038
1039 /* calc the free space in [0, skip_space - 1] */
1040 skip_space -= used_space;
1041
1042 /*
1043 * we cannot use the free space in [0, skip_space - 1], so
1044 * subtract it from the total.
1045 */
1046 if (avail_space && avail_space >= skip_space)
1047 avail_space -= skip_space;
1048 else
1049 avail_space = 0;
1050
1051 if (avail_space < min_stripe_size)
1052 continue;
1053
1054 devices_info[i].dev = device;
1055 devices_info[i].max_avail = avail_space;
1056
1057 i++;
1058 }
1059
1060 nr_devices = i;
1061
1062 btrfs_descending_sort_devices(devices_info, nr_devices);
1063
1064 i = nr_devices - 1;
1065 avail_space = 0;
1066 while (nr_devices >= min_stripes) {
1067 if (devices_info[i].max_avail >= min_stripe_size) {
1068 int j;
1069 u64 alloc_size;
1070
1071 avail_space += devices_info[i].max_avail * min_stripes;
1072 alloc_size = devices_info[i].max_avail;
1073 for (j = i + 1 - min_stripes; j <= i; j++)
1074 devices_info[j].max_avail -= alloc_size;
1075 }
1076 i--;
1077 nr_devices--;
1078 }
1079
1080 kfree(devices_info);
1081 *free_bytes = avail_space;
1082 return 0;
1083}
1084
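The loop above is a greedy estimate: with the devices sorted by free space in
descending order, it repeatedly "allocates" a stripe set of min_stripes devices
ending at the smallest remaining one. A standalone trace of the same arithmetic
(illustrative only; the units and values are made up):

	#include <stdio.h>

	int main(void)
	{
		/* free GiB per device, already sorted descending */
		unsigned long long dev[] = { 10, 5, 3 };
		int nr = 3, min_stripes = 2, i = nr - 1;
		unsigned long long avail = 0;

		while (nr >= min_stripes) {
			/* the kernel also requires >= min_stripe_size here */
			if (dev[i]) {
				unsigned long long alloc = dev[i];
				int j;

				avail += alloc * min_stripes;
				for (j = i + 1 - min_stripes; j <= i; j++)
					dev[j] -= alloc;
			}
			i--;
			nr--;
		}
		printf("%llu GiB raw\n", avail);	/* prints 10 GiB raw */
		return 0;
	}

For RAID1 (min_stripes = 2) the three devices of 10, 5 and 3 GiB thus contribute
an estimated 10 GiB of raw stripe space.
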
712static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
713{ 1086{
714 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -716,20 +1089,39 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
716 struct list_head *head = &root->fs_info->space_info; 1089 struct list_head *head = &root->fs_info->space_info;
717 struct btrfs_space_info *found; 1090 struct btrfs_space_info *found;
718 u64 total_used = 0; 1091 u64 total_used = 0;
1092 u64 total_free_data = 0;
719 int bits = dentry->d_sb->s_blocksize_bits; 1093 int bits = dentry->d_sb->s_blocksize_bits;
720 __be32 *fsid = (__be32 *)root->fs_info->fsid; 1094 __be32 *fsid = (__be32 *)root->fs_info->fsid;
1095 int ret;
721 1096
1097 /* hold the chunk_mutex to avoid allocating new chunks */
1098 mutex_lock(&root->fs_info->chunk_mutex);
722 rcu_read_lock(); 1099 rcu_read_lock();
723 list_for_each_entry_rcu(found, head, list) 1100 list_for_each_entry_rcu(found, head, list) {
1101 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
1102 total_free_data += found->disk_total - found->disk_used;
1103 total_free_data -=
1104 btrfs_account_ro_block_groups_free_space(found);
1105 }
1106
724 total_used += found->disk_used; 1107 total_used += found->disk_used;
1108 }
725 rcu_read_unlock(); 1109 rcu_read_unlock();
726 1110
727 buf->f_namelen = BTRFS_NAME_LEN; 1111 buf->f_namelen = BTRFS_NAME_LEN;
728 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1112 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
729 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1113 buf->f_bfree = buf->f_blocks - (total_used >> bits);
730 buf->f_bavail = buf->f_bfree;
731 buf->f_bsize = dentry->d_sb->s_blocksize; 1114 buf->f_bsize = dentry->d_sb->s_blocksize;
732 buf->f_type = BTRFS_SUPER_MAGIC; 1115 buf->f_type = BTRFS_SUPER_MAGIC;
1116 buf->f_bavail = total_free_data;
1117 ret = btrfs_calc_avail_data_space(root, &total_free_data);
1118 if (ret) {
1119 mutex_unlock(&root->fs_info->chunk_mutex);
1120 return ret;
1121 }
1122 buf->f_bavail += total_free_data;
1123 buf->f_bavail = buf->f_bavail >> bits;
1124 mutex_unlock(&root->fs_info->chunk_mutex);
733 1125
734 /* We treat it as constant endianness (it doesn't matter _which_) 1126 /* We treat it as constant endianness (it doesn't matter _which_)
735 because we want the fsid to come out the same whether mounted 1127 because we want the fsid to come out the same whether mounted
@@ -746,7 +1138,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
746static struct file_system_type btrfs_fs_type = { 1138static struct file_system_type btrfs_fs_type = {
747 .owner = THIS_MODULE, 1139 .owner = THIS_MODULE,
748 .name = "btrfs", 1140 .name = "btrfs",
749 .get_sb = btrfs_get_sb, 1141 .mount = btrfs_mount,
750 .kill_sb = kill_anon_super, 1142 .kill_sb = kill_anon_super,
751 .fs_flags = FS_REQUIRES_DEV, 1143 .fs_flags = FS_REQUIRES_DEV,
752}; 1144};
@@ -815,6 +1207,7 @@ static const struct file_operations btrfs_ctl_fops = {
815 .unlocked_ioctl = btrfs_control_ioctl, 1207 .unlocked_ioctl = btrfs_control_ioctl,
816 .compat_ioctl = btrfs_control_ioctl, 1208 .compat_ioctl = btrfs_control_ioctl,
817 .owner = THIS_MODULE, 1209 .owner = THIS_MODULE,
1210 .llseek = noop_llseek,
818}; 1211};
819 1212
820static struct miscdevice btrfs_misc = { 1213static struct miscdevice btrfs_misc = {
@@ -845,10 +1238,14 @@ static int __init init_btrfs_fs(void)
845 if (err) 1238 if (err)
846 return err; 1239 return err;
847 1240
848 err = btrfs_init_cachep(); 1241 err = btrfs_init_compress();
849 if (err) 1242 if (err)
850 goto free_sysfs; 1243 goto free_sysfs;
851 1244
1245 err = btrfs_init_cachep();
1246 if (err)
1247 goto free_compress;
1248
852 err = extent_io_init(); 1249 err = extent_io_init();
853 if (err) 1250 if (err)
854 goto free_cachep; 1251 goto free_cachep;
@@ -857,10 +1254,14 @@ static int __init init_btrfs_fs(void)
857 if (err) 1254 if (err)
858 goto free_extent_io; 1255 goto free_extent_io;
859 1256
860 err = btrfs_interface_init(); 1257 err = btrfs_delayed_inode_init();
861 if (err) 1258 if (err)
862 goto free_extent_map; 1259 goto free_extent_map;
863 1260
1261 err = btrfs_interface_init();
1262 if (err)
1263 goto free_delayed_inode;
1264
864 err = register_filesystem(&btrfs_fs_type); 1265 err = register_filesystem(&btrfs_fs_type);
865 if (err) 1266 if (err)
866 goto unregister_ioctl; 1267 goto unregister_ioctl;
@@ -870,12 +1271,16 @@ static int __init init_btrfs_fs(void)
870 1271
871unregister_ioctl: 1272unregister_ioctl:
872 btrfs_interface_exit(); 1273 btrfs_interface_exit();
1274free_delayed_inode:
1275 btrfs_delayed_inode_exit();
873free_extent_map: 1276free_extent_map:
874 extent_map_exit(); 1277 extent_map_exit();
875free_extent_io: 1278free_extent_io:
876 extent_io_exit(); 1279 extent_io_exit();
877free_cachep: 1280free_cachep:
878 btrfs_destroy_cachep(); 1281 btrfs_destroy_cachep();
1282free_compress:
1283 btrfs_exit_compress();
879free_sysfs: 1284free_sysfs:
880 btrfs_exit_sysfs(); 1285 btrfs_exit_sysfs();
881 return err; 1286 return err;
@@ -884,13 +1289,14 @@ free_sysfs:
884static void __exit exit_btrfs_fs(void) 1289static void __exit exit_btrfs_fs(void)
885{ 1290{
886 btrfs_destroy_cachep(); 1291 btrfs_destroy_cachep();
1292 btrfs_delayed_inode_exit();
887 extent_map_exit(); 1293 extent_map_exit();
888 extent_io_exit(); 1294 extent_io_exit();
889 btrfs_interface_exit(); 1295 btrfs_interface_exit();
890 unregister_filesystem(&btrfs_fs_type); 1296 unregister_filesystem(&btrfs_fs_type);
891 btrfs_exit_sysfs(); 1297 btrfs_exit_sysfs();
892 btrfs_cleanup_fs_uuids(); 1298 btrfs_cleanup_fs_uuids();
893 btrfs_zlib_exit(); 1299 btrfs_exit_compress();
894} 1300}
895 1301
896module_init(init_btrfs_fs) 1302module_init(init_btrfs_fs)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 4ce16ef702a3..daac9ae6d731 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -28,232 +28,9 @@
28#include "disk-io.h" 28#include "disk-io.h"
29#include "transaction.h" 29#include "transaction.h"
30 30
31static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
32{
33 return snprintf(buf, PAGE_SIZE, "%llu\n",
34 (unsigned long long)btrfs_root_used(&root->root_item));
35}
36
37static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
38{
39 return snprintf(buf, PAGE_SIZE, "%llu\n",
40 (unsigned long long)btrfs_root_limit(&root->root_item));
41}
42
43static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
44{
45
46 return snprintf(buf, PAGE_SIZE, "%llu\n",
47 (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
48}
49
50static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
51{
52 return snprintf(buf, PAGE_SIZE, "%llu\n",
53 (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
54}
55
56static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
57{
58 return snprintf(buf, PAGE_SIZE, "%llu\n",
59 (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
60}
61
62/* this is for root attrs (subvols/snapshots) */
63struct btrfs_root_attr {
64 struct attribute attr;
65 ssize_t (*show)(struct btrfs_root *, char *);
66 ssize_t (*store)(struct btrfs_root *, const char *, size_t);
67};
68
69#define ROOT_ATTR(name, mode, show, store) \
70static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
71 show, store)
72
73ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
74ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
75
76static struct attribute *btrfs_root_attrs[] = {
77 &btrfs_root_attr_blocks_used.attr,
78 &btrfs_root_attr_block_limit.attr,
79 NULL,
80};
81
82/* this is for super attrs (actual full fs) */
83struct btrfs_super_attr {
84 struct attribute attr;
85 ssize_t (*show)(struct btrfs_fs_info *, char *);
86 ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
87};
88
89#define SUPER_ATTR(name, mode, show, store) \
90static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
91 show, store)
92
93SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
94SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
95SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
96
97static struct attribute *btrfs_super_attrs[] = {
98 &btrfs_super_attr_blocks_used.attr,
99 &btrfs_super_attr_total_blocks.attr,
100 &btrfs_super_attr_blocksize.attr,
101 NULL,
102};
103
104static ssize_t btrfs_super_attr_show(struct kobject *kobj,
105 struct attribute *attr, char *buf)
106{
107 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
108 super_kobj);
109 struct btrfs_super_attr *a = container_of(attr,
110 struct btrfs_super_attr,
111 attr);
112
113 return a->show ? a->show(fs, buf) : 0;
114}
115
116static ssize_t btrfs_super_attr_store(struct kobject *kobj,
117 struct attribute *attr,
118 const char *buf, size_t len)
119{
120 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
121 super_kobj);
122 struct btrfs_super_attr *a = container_of(attr,
123 struct btrfs_super_attr,
124 attr);
125
126 return a->store ? a->store(fs, buf, len) : 0;
127}
128
129static ssize_t btrfs_root_attr_show(struct kobject *kobj,
130 struct attribute *attr, char *buf)
131{
132 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
133 root_kobj);
134 struct btrfs_root_attr *a = container_of(attr,
135 struct btrfs_root_attr,
136 attr);
137
138 return a->show ? a->show(root, buf) : 0;
139}
140
141static ssize_t btrfs_root_attr_store(struct kobject *kobj,
142 struct attribute *attr,
143 const char *buf, size_t len)
144{
145 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
146 root_kobj);
147 struct btrfs_root_attr *a = container_of(attr,
148 struct btrfs_root_attr,
149 attr);
150 return a->store ? a->store(root, buf, len) : 0;
151}
152
153static void btrfs_super_release(struct kobject *kobj)
154{
155 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
156 super_kobj);
157 complete(&fs->kobj_unregister);
158}
159
160static void btrfs_root_release(struct kobject *kobj)
161{
162 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
163 root_kobj);
164 complete(&root->kobj_unregister);
165}
166
167static const struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store,
170};
171
172static const struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store,
175};
176
177static struct kobj_type btrfs_root_ktype = {
178 .default_attrs = btrfs_root_attrs,
179 .sysfs_ops = &btrfs_root_attr_ops,
180 .release = btrfs_root_release,
181};
182
183static struct kobj_type btrfs_super_ktype = {
184 .default_attrs = btrfs_super_attrs,
185 .sysfs_ops = &btrfs_super_attr_ops,
186 .release = btrfs_super_release,
187};
188
189/* /sys/fs/btrfs/ entry */ 31/* /sys/fs/btrfs/ entry */
190static struct kset *btrfs_kset; 32static struct kset *btrfs_kset;
191 33
192int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
193{
194 int error;
195 char *name;
196 char c;
197 int len = strlen(fs->sb->s_id) + 1;
198 int i;
199
200 name = kmalloc(len, GFP_NOFS);
201 if (!name) {
202 error = -ENOMEM;
203 goto fail;
204 }
205
206 for (i = 0; i < len; i++) {
207 c = fs->sb->s_id[i];
208 if (c == '/' || c == '\\')
209 c = '!';
210 name[i] = c;
211 }
212 name[len] = '\0';
213
214 fs->super_kobj.kset = btrfs_kset;
215 error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
216 NULL, "%s", name);
217 kfree(name);
218 if (error)
219 goto fail;
220
221 return 0;
222
223fail:
224 printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
225 return error;
226}
227
228int btrfs_sysfs_add_root(struct btrfs_root *root)
229{
230 int error;
231
232 error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
233 &root->fs_info->super_kobj,
234 "%s", root->name);
235 if (error)
236 goto fail;
237
238 return 0;
239
240fail:
241 printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
242 return error;
243}
244
245void btrfs_sysfs_del_root(struct btrfs_root *root)
246{
247 kobject_put(&root->root_kobj);
248 wait_for_completion(&root->kobj_unregister);
249}
250
251void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
252{
253 kobject_put(&fs->super_kobj);
254 wait_for_completion(&fs->kobj_unregister);
255}
256
257int btrfs_init_sysfs(void) 34int btrfs_init_sysfs(void)
258{ 35{
259 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); 36 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
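The hunk above strips out the per-filesystem and per-subvolume sysfs attributes, leaving only the bare /sys/fs/btrfs kset. The deleted code was a textbook instance of the kobject attribute-dispatch pattern: a kobj_type supplies sysfs_ops whose show/store callbacks recover the embedding object with container_of() and forward to a typed handler. A minimal sketch of that pattern, using illustrative names (demo_obj, demo_attr) rather than btrfs symbols:

#include <linux/kobject.h>
#include <linux/sysfs.h>

/* Illustrative object embedding a kobject, as the removed code did
 * with btrfs_fs_info::super_kobj. */
struct demo_obj {
        struct kobject kobj;
        unsigned long value;
};

struct demo_attr {
        struct attribute attr;
        ssize_t (*show)(struct demo_obj *obj, char *buf);
};

static ssize_t demo_attr_show(struct kobject *kobj, struct attribute *attr,
                              char *buf)
{
        /* Recover the embedding object and the typed attribute. */
        struct demo_obj *obj = container_of(kobj, struct demo_obj, kobj);
        struct demo_attr *a = container_of(attr, struct demo_attr, attr);

        return a->show ? a->show(obj, buf) : 0;
}

static const struct sysfs_ops demo_sysfs_ops = {
        .show = demo_attr_show,
};

An object registered through kobject_init_and_add() against a kobj_type carrying these ops gets one sysfs file per attribute, exactly as the removed btrfs_super_ktype and btrfs_root_ktype did.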
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 66e4c66cc63b..51dcec86757f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -27,15 +27,15 @@
27#include "transaction.h" 27#include "transaction.h"
28#include "locking.h" 28#include "locking.h"
29#include "tree-log.h" 29#include "tree-log.h"
30#include "inode-map.h"
30 31
31#define BTRFS_ROOT_TRANS_TAG 0 32#define BTRFS_ROOT_TRANS_TAG 0
32 33
33static noinline void put_transaction(struct btrfs_transaction *transaction) 34static noinline void put_transaction(struct btrfs_transaction *transaction)
34{ 35{
35 WARN_ON(transaction->use_count == 0); 36 WARN_ON(atomic_read(&transaction->use_count) == 0);
36 transaction->use_count--; 37 if (atomic_dec_and_test(&transaction->use_count)) {
37 if (transaction->use_count == 0) { 38 BUG_ON(!list_empty(&transaction->list));
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction)); 39 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 40 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 41 }
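put_transaction() is converted here from a plain integer count, which was only safe under the old trans_mutex, to an atomic_t. A minimal sketch of the resulting refcount idiom, on the assumption of a generic object (ref_obj is not a btrfs type):

#include <linux/atomic.h>
#include <linux/slab.h>

struct ref_obj {
        atomic_t use_count;     /* replaces an int guarded by a mutex */
};

static void ref_obj_put(struct ref_obj *obj)
{
        /*
         * Exactly one caller sees the count hit zero, so the free
         * runs once and the count itself needs no external lock.
         */
        if (atomic_dec_and_test(&obj->use_count))
                kfree(obj);
}

Callers take references with atomic_inc(&obj->use_count); the BUG_ON(!list_empty()) added in the hunk works because, as the later commit-path changes show, the transaction is now unlinked from trans_list before its final reference is dropped.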
@@ -50,46 +50,72 @@ static noinline void switch_commit_root(struct btrfs_root *root)
50/* 50/*
51 * either allocate a new transaction or hop into the existing one 51 * either allocate a new transaction or hop into the existing one
52 */ 52 */
53static noinline int join_transaction(struct btrfs_root *root) 53static noinline int join_transaction(struct btrfs_root *root, int nofail)
54{ 54{
55 struct btrfs_transaction *cur_trans; 55 struct btrfs_transaction *cur_trans;
56
57 spin_lock(&root->fs_info->trans_lock);
58 if (root->fs_info->trans_no_join) {
59 if (!nofail) {
60 spin_unlock(&root->fs_info->trans_lock);
61 return -EBUSY;
62 }
63 }
64
56 cur_trans = root->fs_info->running_transaction; 65 cur_trans = root->fs_info->running_transaction;
57 if (!cur_trans) { 66 if (cur_trans) {
58 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, 67 atomic_inc(&cur_trans->use_count);
59 GFP_NOFS); 68 atomic_inc(&cur_trans->num_writers);
60 BUG_ON(!cur_trans);
61 root->fs_info->generation++;
62 cur_trans->num_writers = 1;
63 cur_trans->num_joined = 0;
64 cur_trans->transid = root->fs_info->generation;
65 init_waitqueue_head(&cur_trans->writer_wait);
66 init_waitqueue_head(&cur_trans->commit_wait);
67 cur_trans->in_commit = 0;
68 cur_trans->blocked = 0;
69 cur_trans->use_count = 1;
70 cur_trans->commit_done = 0;
71 cur_trans->start_time = get_seconds();
72
73 cur_trans->delayed_refs.root = RB_ROOT;
74 cur_trans->delayed_refs.num_entries = 0;
75 cur_trans->delayed_refs.num_heads_ready = 0;
76 cur_trans->delayed_refs.num_heads = 0;
77 cur_trans->delayed_refs.flushing = 0;
78 cur_trans->delayed_refs.run_delayed_start = 0;
79 spin_lock_init(&cur_trans->delayed_refs.lock);
80
81 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
82 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
83 extent_io_tree_init(&cur_trans->dirty_pages,
84 root->fs_info->btree_inode->i_mapping,
85 GFP_NOFS);
86 spin_lock(&root->fs_info->new_trans_lock);
87 root->fs_info->running_transaction = cur_trans;
88 spin_unlock(&root->fs_info->new_trans_lock);
89 } else {
90 cur_trans->num_writers++;
91 cur_trans->num_joined++; 69 cur_trans->num_joined++;
70 spin_unlock(&root->fs_info->trans_lock);
71 return 0;
92 } 72 }
73 spin_unlock(&root->fs_info->trans_lock);
74
75 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
76 if (!cur_trans)
77 return -ENOMEM;
78 spin_lock(&root->fs_info->trans_lock);
79 if (root->fs_info->running_transaction) {
80 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
81 cur_trans = root->fs_info->running_transaction;
82 atomic_inc(&cur_trans->use_count);
83 atomic_inc(&cur_trans->num_writers);
84 cur_trans->num_joined++;
85 spin_unlock(&root->fs_info->trans_lock);
86 return 0;
87 }
88 atomic_set(&cur_trans->num_writers, 1);
89 cur_trans->num_joined = 0;
90 init_waitqueue_head(&cur_trans->writer_wait);
91 init_waitqueue_head(&cur_trans->commit_wait);
92 cur_trans->in_commit = 0;
93 cur_trans->blocked = 0;
94 /*
95 * One for this trans handle, one so it will live on until we
96 * commit the transaction.
97 */
98 atomic_set(&cur_trans->use_count, 2);
99 cur_trans->commit_done = 0;
100 cur_trans->start_time = get_seconds();
101
102 cur_trans->delayed_refs.root = RB_ROOT;
103 cur_trans->delayed_refs.num_entries = 0;
104 cur_trans->delayed_refs.num_heads_ready = 0;
105 cur_trans->delayed_refs.num_heads = 0;
106 cur_trans->delayed_refs.flushing = 0;
107 cur_trans->delayed_refs.run_delayed_start = 0;
108 spin_lock_init(&cur_trans->commit_lock);
109 spin_lock_init(&cur_trans->delayed_refs.lock);
110
111 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
112 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
113 extent_io_tree_init(&cur_trans->dirty_pages,
114 root->fs_info->btree_inode->i_mapping);
115 root->fs_info->generation++;
116 cur_trans->transid = root->fs_info->generation;
117 root->fs_info->running_transaction = cur_trans;
118 spin_unlock(&root->fs_info->trans_lock);
93 119
94 return 0; 120 return 0;
95} 121}
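The rewritten join_transaction() replaces trans_mutex with trans_lock, a spinlock, which forces the GFP_NOFS allocation out from under the lock; the function therefore rechecks running_transaction after reacquiring the lock and discards its allocation if another thread won the race. A sketch of the general pattern with hypothetical session/host types:

#include <linux/atomic.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct session {
        atomic_t refs;
};

struct host {
        spinlock_t lock;
        struct session *running;
};

static struct session *session_get_or_create(struct host *h)
{
        struct session *s;

        spin_lock(&h->lock);
        s = h->running;
        if (s) {
                atomic_inc(&s->refs);   /* fast path: join the live one */
                spin_unlock(&h->lock);
                return s;
        }
        spin_unlock(&h->lock);

        /* The allocation may sleep, so it cannot sit under the spinlock. */
        s = kzalloc(sizeof(*s), GFP_NOFS);
        if (!s)
                return ERR_PTR(-ENOMEM);

        spin_lock(&h->lock);
        if (h->running) {               /* lost the race: keep the winner's */
                kfree(s);
                s = h->running;
                atomic_inc(&s->refs);
        } else {
                atomic_set(&s->refs, 1);
                h->running = s;
        }
        spin_unlock(&h->lock);
        return s;
}

An occasionally wasted allocation is the usual price of this shape, and it is cheaper than holding a sleeping lock across the slow path.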
@@ -100,36 +126,82 @@ static noinline int join_transaction(struct btrfs_root *root)
100 * to make sure the old root from before we joined the transaction is deleted 126 * to make sure the old root from before we joined the transaction is deleted
101 * when the transaction commits 127 * when the transaction commits
102 */ 128 */
103static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, 129static int record_root_in_trans(struct btrfs_trans_handle *trans,
104 struct btrfs_root *root) 130 struct btrfs_root *root)
105{ 131{
106 if (root->ref_cows && root->last_trans < trans->transid) { 132 if (root->ref_cows && root->last_trans < trans->transid) {
107 WARN_ON(root == root->fs_info->extent_root); 133 WARN_ON(root == root->fs_info->extent_root);
108 WARN_ON(root->commit_root != root->node); 134 WARN_ON(root->commit_root != root->node);
109 135
136 /*
137 * see below for in_trans_setup usage rules
138 * we have the reloc mutex held now, so there
139 * is only one writer in this function
140 */
141 root->in_trans_setup = 1;
142
143 /* make sure readers find in_trans_setup before
144 * they find our root->last_trans update
145 */
146 smp_wmb();
147
148 spin_lock(&root->fs_info->fs_roots_radix_lock);
149 if (root->last_trans == trans->transid) {
150 spin_unlock(&root->fs_info->fs_roots_radix_lock);
151 return 0;
152 }
110 radix_tree_tag_set(&root->fs_info->fs_roots_radix, 153 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
111 (unsigned long)root->root_key.objectid, 154 (unsigned long)root->root_key.objectid,
112 BTRFS_ROOT_TRANS_TAG); 155 BTRFS_ROOT_TRANS_TAG);
156 spin_unlock(&root->fs_info->fs_roots_radix_lock);
113 root->last_trans = trans->transid; 157 root->last_trans = trans->transid;
158
159 /* this is pretty tricky. We don't want to
160 * take the relocation lock in btrfs_record_root_in_trans
161 * unless we're really doing the first setup for this root in
162 * this transaction.
163 *
164 * Normally we'd use root->last_trans as a flag to decide
165 * if we want to take the expensive mutex.
166 *
167 * But, we have to set root->last_trans before we
168 * init the relocation root, otherwise, we trip over warnings
169 * in ctree.c. The solution used here is to flag ourselves
170 * with root->in_trans_setup. When this is 1, we're still
171 * fixing up the reloc trees and everyone must wait.
172 *
173 * When this is zero, they can trust root->last_trans and fly
174 * through btrfs_record_root_in_trans without having to take the
175 * lock. smp_wmb() makes sure that all the writes above are
176 * done before we pop in the zero below
177 */
114 btrfs_init_reloc_root(trans, root); 178 btrfs_init_reloc_root(trans, root);
179 smp_wmb();
180 root->in_trans_setup = 0;
115 } 181 }
116 return 0; 182 return 0;
117} 183}
118 184
185
119int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 186int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
120 struct btrfs_root *root) 187 struct btrfs_root *root)
121{ 188{
122 if (!root->ref_cows) 189 if (!root->ref_cows)
123 return 0; 190 return 0;
124 191
125 mutex_lock(&root->fs_info->trans_mutex); 192 /*
126 if (root->last_trans == trans->transid) { 193 * see record_root_in_trans for comments about in_trans_setup usage
127 mutex_unlock(&root->fs_info->trans_mutex); 194 * and barriers
195 */
196 smp_rmb();
197 if (root->last_trans == trans->transid &&
198 !root->in_trans_setup)
128 return 0; 199 return 0;
129 }
130 200
201 mutex_lock(&root->fs_info->reloc_mutex);
131 record_root_in_trans(trans, root); 202 record_root_in_trans(trans, root);
132 mutex_unlock(&root->fs_info->trans_mutex); 203 mutex_unlock(&root->fs_info->reloc_mutex);
204
133 return 0; 205 return 0;
134} 206}
135 207
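The in_trans_setup flag added above lets btrfs_record_root_in_trans() skip the reloc_mutex on the fast path: the writer clears the flag only after all setup stores, separated by smp_wmb(), and readers pair that with smp_rmb() before trusting last_trans. A reduced sketch of the publication idiom (pub_obj is illustrative, and the writer is assumed to be serialized by an outer mutex, as the comment in the hunk requires):

#include <linux/kernel.h>
#include <linux/types.h>

struct pub_obj {
        u64 last_trans;
        int in_setup;
};

/* Writer side: assumed serialized by an outer mutex. */
static void publish(struct pub_obj *obj, u64 transid)
{
        obj->in_setup = 1;
        smp_wmb();              /* readers must see in_setup = 1 ... */
        obj->last_trans = transid;      /* ... before this update */

        /* ... slow setup work (e.g. reloc root init) goes here ... */

        smp_wmb();              /* all setup stores are visible ... */
        obj->in_setup = 0;      /* ... before the flag clears */
}

/* Reader fast path: lock-free once the object is fully published. */
static int is_published(struct pub_obj *obj, u64 transid)
{
        smp_rmb();
        return obj->last_trans == transid && !obj->in_setup;
}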
@@ -141,21 +213,23 @@ static void wait_current_trans(struct btrfs_root *root)
141{ 213{
142 struct btrfs_transaction *cur_trans; 214 struct btrfs_transaction *cur_trans;
143 215
216 spin_lock(&root->fs_info->trans_lock);
144 cur_trans = root->fs_info->running_transaction; 217 cur_trans = root->fs_info->running_transaction;
145 if (cur_trans && cur_trans->blocked) { 218 if (cur_trans && cur_trans->blocked) {
146 DEFINE_WAIT(wait); 219 DEFINE_WAIT(wait);
147 cur_trans->use_count++; 220 atomic_inc(&cur_trans->use_count);
221 spin_unlock(&root->fs_info->trans_lock);
148 while (1) { 222 while (1) {
149 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 223 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
150 TASK_UNINTERRUPTIBLE); 224 TASK_UNINTERRUPTIBLE);
151 if (!cur_trans->blocked) 225 if (!cur_trans->blocked)
152 break; 226 break;
153 mutex_unlock(&root->fs_info->trans_mutex);
154 schedule(); 227 schedule();
155 mutex_lock(&root->fs_info->trans_mutex);
156 } 228 }
157 finish_wait(&root->fs_info->transaction_wait, &wait); 229 finish_wait(&root->fs_info->transaction_wait, &wait);
158 put_transaction(cur_trans); 230 put_transaction(cur_trans);
231 } else {
232 spin_unlock(&root->fs_info->trans_lock);
159 } 233 }
160} 234}
161 235
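wait_current_trans() now takes a reference under trans_lock, drops the lock, and sleeps with the bare prepare_to_wait()/schedule()/finish_wait() sequence instead of bouncing trans_mutex around schedule(). The essential shape of that open-coded wait loop, assuming a simple flag-bearing struct thing:

#include <linux/sched.h>
#include <linux/wait.h>

struct thing {
        int blocked;
};

/* Sleep until obj->blocked clears; no lock is held across schedule(). */
static void wait_unblocked(wait_queue_head_t *wq, struct thing *obj)
{
        DEFINE_WAIT(wait);

        while (1) {
                prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
                if (!obj->blocked)      /* re-test after queueing, so a */
                        break;          /* wakeup in between is not lost */
                schedule();
        }
        finish_wait(wq, &wait);
}

Re-testing the condition after prepare_to_wait() is what makes the loop race-free: a wakeup arriving between the test and schedule() finds the task already on the queue.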
@@ -163,14 +237,21 @@ enum btrfs_trans_type {
163 TRANS_START, 237 TRANS_START,
164 TRANS_JOIN, 238 TRANS_JOIN,
165 TRANS_USERSPACE, 239 TRANS_USERSPACE,
240 TRANS_JOIN_NOLOCK,
166}; 241};
167 242
168static int may_wait_transaction(struct btrfs_root *root, int type) 243static int may_wait_transaction(struct btrfs_root *root, int type)
169{ 244{
170 if (!root->fs_info->log_root_recovering && 245 if (root->fs_info->log_root_recovering)
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || 246 return 0;
172 type == TRANS_USERSPACE)) 247
248 if (type == TRANS_USERSPACE)
173 return 1; 249 return 1;
250
251 if (type == TRANS_START &&
252 !atomic_read(&root->fs_info->open_ioctl_trans))
253 return 1;
254
174 return 0; 255 return 0;
175} 256}
176 257
@@ -181,29 +262,47 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
181 struct btrfs_transaction *cur_trans; 262 struct btrfs_transaction *cur_trans;
182 int retries = 0; 263 int retries = 0;
183 int ret; 264 int ret;
265
266 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
267 return ERR_PTR(-EROFS);
268
269 if (current->journal_info) {
270 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
271 h = current->journal_info;
272 h->use_count++;
273 h->orig_rsv = h->block_rsv;
274 h->block_rsv = NULL;
275 goto got_it;
276 }
184again: 277again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 278 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h) 279 if (!h)
187 return ERR_PTR(-ENOMEM); 280 return ERR_PTR(-ENOMEM);
188 281
189 mutex_lock(&root->fs_info->trans_mutex);
190 if (may_wait_transaction(root, type)) 282 if (may_wait_transaction(root, type))
191 wait_current_trans(root); 283 wait_current_trans(root);
192 284
193 ret = join_transaction(root); 285 do {
194 BUG_ON(ret); 286 ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
287 if (ret == -EBUSY)
288 wait_current_trans(root);
289 } while (ret == -EBUSY);
290
291 if (ret < 0) {
292 kmem_cache_free(btrfs_trans_handle_cachep, h);
293 return ERR_PTR(ret);
294 }
195 295
196 cur_trans = root->fs_info->running_transaction; 296 cur_trans = root->fs_info->running_transaction;
197 cur_trans->use_count++;
198 mutex_unlock(&root->fs_info->trans_mutex);
199 297
200 h->transid = cur_trans->transid; 298 h->transid = cur_trans->transid;
201 h->transaction = cur_trans; 299 h->transaction = cur_trans;
202 h->blocks_used = 0; 300 h->blocks_used = 0;
203 h->block_group = 0;
204 h->bytes_reserved = 0; 301 h->bytes_reserved = 0;
205 h->delayed_ref_updates = 0; 302 h->delayed_ref_updates = 0;
303 h->use_count = 1;
206 h->block_rsv = NULL; 304 h->block_rsv = NULL;
305 h->orig_rsv = NULL;
207 306
208 smp_mb(); 307 smp_mb();
209 if (cur_trans->blocked && may_wait_transaction(root, type)) { 308 if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -212,21 +311,27 @@ again:
212 } 311 }
213 312
214 if (num_items > 0) { 313 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items, 314 ret = btrfs_trans_reserve_metadata(h, root, num_items);
216 &retries); 315 if (ret == -EAGAIN && !retries) {
217 if (ret == -EAGAIN) { 316 retries++;
218 btrfs_commit_transaction(h, root); 317 btrfs_commit_transaction(h, root);
219 goto again; 318 goto again;
319 } else if (ret == -EAGAIN) {
320 /*
321 * We have already retried and got EAGAIN, so really we
322 * don't have space, so set ret to -ENOSPC.
323 */
324 ret = -ENOSPC;
220 } 325 }
326
221 if (ret < 0) { 327 if (ret < 0) {
222 btrfs_end_transaction(h, root); 328 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret); 329 return ERR_PTR(ret);
224 } 330 }
225 } 331 }
226 332
227 mutex_lock(&root->fs_info->trans_mutex); 333got_it:
228 record_root_in_trans(h, root); 334 btrfs_record_root_in_trans(h, root);
229 mutex_unlock(&root->fs_info->trans_mutex);
230 335
231 if (!current->journal_info && type != TRANS_USERSPACE) 336 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h; 337 current->journal_info = h;
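start_transaction() now recognizes re-entry: if current->journal_info already carries a handle, it bumps the handle's use_count instead of joining again, and __btrfs_end_transaction() later returns early until the count drains. A sketch of that nesting scheme (start_nested/end_nested are illustrative, not btrfs functions; the count needs no atomics because only the owning task ever touches its own journal_info):

#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>

struct handle {
        unsigned long use_count;        /* touched only by the owner task */
};

static struct handle *start_nested(void)
{
        struct handle *h = current->journal_info;

        if (h) {                        /* already in a transaction: */
                h->use_count++;         /* just record one more level */
                return h;
        }

        h = kzalloc(sizeof(*h), GFP_NOFS);
        if (!h)
                return ERR_PTR(-ENOMEM);
        h->use_count = 1;
        current->journal_info = h;      /* mark this task as inside */
        return h;
}

static void end_nested(struct handle *h)
{
        if (--h->use_count)             /* inner end: outer frame remains */
                return;
        current->journal_info = NULL;
        kfree(h);
}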
@@ -238,16 +343,19 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
238{ 343{
239 return start_transaction(root, num_items, TRANS_START); 344 return start_transaction(root, num_items, TRANS_START);
240} 345}
241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 346struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
242 int num_blocks)
243{ 347{
244 return start_transaction(root, 0, TRANS_JOIN); 348 return start_transaction(root, 0, TRANS_JOIN);
245} 349}
246 350
247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 351struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
248 int num_blocks) 352{
353 return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
354}
355
356struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
249{ 357{
250 return start_transaction(r, 0, TRANS_USERSPACE); 358 return start_transaction(root, 0, TRANS_USERSPACE);
251} 359}
252 360
253/* wait for a transaction commit to be fully complete */ 361/* wait for a transaction commit to be fully complete */
@@ -255,70 +363,72 @@ static noinline int wait_for_commit(struct btrfs_root *root,
255 struct btrfs_transaction *commit) 363 struct btrfs_transaction *commit)
256{ 364{
257 DEFINE_WAIT(wait); 365 DEFINE_WAIT(wait);
258 mutex_lock(&root->fs_info->trans_mutex);
259 while (!commit->commit_done) { 366 while (!commit->commit_done) {
260 prepare_to_wait(&commit->commit_wait, &wait, 367 prepare_to_wait(&commit->commit_wait, &wait,
261 TASK_UNINTERRUPTIBLE); 368 TASK_UNINTERRUPTIBLE);
262 if (commit->commit_done) 369 if (commit->commit_done)
263 break; 370 break;
264 mutex_unlock(&root->fs_info->trans_mutex);
265 schedule(); 371 schedule();
266 mutex_lock(&root->fs_info->trans_mutex);
267 } 372 }
268 mutex_unlock(&root->fs_info->trans_mutex);
269 finish_wait(&commit->commit_wait, &wait); 373 finish_wait(&commit->commit_wait, &wait);
270 return 0; 374 return 0;
271} 375}
272 376
273#if 0 377int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
274/*
275 * rate limit against the drop_snapshot code. This helps to slow down new
276 * operations if the drop_snapshot code isn't able to keep up.
277 */
278static void throttle_on_drops(struct btrfs_root *root)
279{ 378{
280 struct btrfs_fs_info *info = root->fs_info; 379 struct btrfs_transaction *cur_trans = NULL, *t;
281 int harder_count = 0; 380 int ret;
282 381
283harder: 382 ret = 0;
284 if (atomic_read(&info->throttles)) { 383 if (transid) {
285 DEFINE_WAIT(wait); 384 if (transid <= root->fs_info->last_trans_committed)
286 int thr; 385 goto out;
287 thr = atomic_read(&info->throttle_gen); 386
288 387 /* find specified transaction */
289 do { 388 spin_lock(&root->fs_info->trans_lock);
290 prepare_to_wait(&info->transaction_throttle, 389 list_for_each_entry(t, &root->fs_info->trans_list, list) {
291 &wait, TASK_UNINTERRUPTIBLE); 390 if (t->transid == transid) {
292 if (!atomic_read(&info->throttles)) { 391 cur_trans = t;
293 finish_wait(&info->transaction_throttle, &wait); 392 atomic_inc(&cur_trans->use_count);
294 break; 393 break;
295 } 394 }
296 schedule(); 395 if (t->transid > transid)
297 finish_wait(&info->transaction_throttle, &wait); 396 break;
298 } while (thr == atomic_read(&info->throttle_gen)); 397 }
299 harder_count++; 398 spin_unlock(&root->fs_info->trans_lock);
300 399 ret = -EINVAL;
301 if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 && 400 if (!cur_trans)
302 harder_count < 2) 401 goto out; /* bad transid */
303 goto harder; 402 } else {
403 /* find newest transaction that is committing | committed */
404 spin_lock(&root->fs_info->trans_lock);
405 list_for_each_entry_reverse(t, &root->fs_info->trans_list,
406 list) {
407 if (t->in_commit) {
408 if (t->commit_done)
409 break;
410 cur_trans = t;
411 atomic_inc(&cur_trans->use_count);
412 break;
413 }
414 }
415 spin_unlock(&root->fs_info->trans_lock);
416 if (!cur_trans)
417 goto out; /* nothing committing|committed */
418 }
304 419
305 if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 && 420 wait_for_commit(root, cur_trans);
306 harder_count < 10)
307 goto harder;
308 421
309 if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 && 422 put_transaction(cur_trans);
310 harder_count < 20) 423 ret = 0;
311 goto harder; 424out:
312 } 425 return ret;
313} 426}
314#endif
315 427
316void btrfs_throttle(struct btrfs_root *root) 428void btrfs_throttle(struct btrfs_root *root)
317{ 429{
318 mutex_lock(&root->fs_info->trans_mutex); 430 if (!atomic_read(&root->fs_info->open_ioctl_trans))
319 if (!root->fs_info->open_ioctl_trans)
320 wait_current_trans(root); 431 wait_current_trans(root);
321 mutex_unlock(&root->fs_info->trans_mutex);
322} 432}
323 433
324static int should_end_transaction(struct btrfs_trans_handle *trans, 434static int should_end_transaction(struct btrfs_trans_handle *trans,
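The new btrfs_wait_for_commit() above walks fs_info->trans_list under trans_lock, pins the matching transaction with atomic_inc() before releasing the lock, and only then blocks in wait_for_commit(). The lookup half of that "pin under lock, wait outside it" pattern, reduced to hypothetical txn/fs types:

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct txn {
        u64 id;
        atomic_t refs;
        struct list_head list;
};

struct fs {
        spinlock_t lock;
        struct list_head txns;  /* sorted by ascending id */
};

static struct txn *find_and_pin_txn(struct fs *fs, u64 id)
{
        struct txn *t, *found = NULL;

        spin_lock(&fs->lock);
        list_for_each_entry(t, &fs->txns, list) {
                if (t->id == id) {
                        atomic_inc(&t->refs);   /* pin before unlocking */
                        found = t;
                        break;
                }
                if (t->id > id)         /* sorted list: no match exists */
                        break;
        }
        spin_unlock(&fs->lock);
        return found;           /* caller waits, then drops the reference */
}

Taking the reference before unlocking is the load-bearing step: without it the transaction could be freed the moment the lock drops.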
@@ -336,6 +446,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
336 struct btrfs_transaction *cur_trans = trans->transaction; 446 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates; 447 int updates;
338 448
449 smp_mb();
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 450 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1; 451 return 1;
341 452
@@ -348,12 +459,17 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
348} 459}
349 460
350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 461static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
351 struct btrfs_root *root, int throttle) 462 struct btrfs_root *root, int throttle, int lock)
352{ 463{
353 struct btrfs_transaction *cur_trans = trans->transaction; 464 struct btrfs_transaction *cur_trans = trans->transaction;
354 struct btrfs_fs_info *info = root->fs_info; 465 struct btrfs_fs_info *info = root->fs_info;
355 int count = 0; 466 int count = 0;
356 467
468 if (--trans->use_count) {
469 trans->block_rsv = trans->orig_rsv;
470 return 0;
471 }
472
357 while (count < 4) { 473 while (count < 4) {
358 unsigned long cur = trans->delayed_ref_updates; 474 unsigned long cur = trans->delayed_ref_updates;
359 trans->delayed_ref_updates = 0; 475 trans->delayed_ref_updates = 0;
@@ -376,26 +492,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
376 492
377 btrfs_trans_release_metadata(trans, root); 493 btrfs_trans_release_metadata(trans, root);
378 494
379 if (!root->fs_info->open_ioctl_trans && 495 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
380 should_end_transaction(trans, root)) 496 should_end_transaction(trans, root)) {
381 trans->transaction->blocked = 1; 497 trans->transaction->blocked = 1;
498 smp_wmb();
499 }
382 500
383 if (cur_trans->blocked && !cur_trans->in_commit) { 501 if (lock && cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle) 502 if (throttle)
385 return btrfs_commit_transaction(trans, root); 503 return btrfs_commit_transaction(trans, root);
386 else 504 else
387 wake_up_process(info->transaction_kthread); 505 wake_up_process(info->transaction_kthread);
388 } 506 }
389 507
390 mutex_lock(&info->trans_mutex);
391 WARN_ON(cur_trans != info->running_transaction); 508 WARN_ON(cur_trans != info->running_transaction);
392 WARN_ON(cur_trans->num_writers < 1); 509 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
393 cur_trans->num_writers--; 510 atomic_dec(&cur_trans->num_writers);
394 511
512 smp_mb();
395 if (waitqueue_active(&cur_trans->writer_wait)) 513 if (waitqueue_active(&cur_trans->writer_wait))
396 wake_up(&cur_trans->writer_wait); 514 wake_up(&cur_trans->writer_wait);
397 put_transaction(cur_trans); 515 put_transaction(cur_trans);
398 mutex_unlock(&info->trans_mutex);
399 516
400 if (current->journal_info == trans) 517 if (current->journal_info == trans)
401 current->journal_info = NULL; 518 current->journal_info = NULL;
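__btrfs_end_transaction() now decrements num_writers atomically and inserts an explicit smp_mb() before the waitqueue_active() check. A minimal sketch of that waker-side idiom (writer_count is an illustrative type):

#include <linux/atomic.h>
#include <linux/wait.h>

struct writer_count {
        atomic_t writers;
        wait_queue_head_t wq;
};

static void writer_done(struct writer_count *c)
{
        atomic_dec(&c->writers);
        /*
         * Order the decrement before the waitqueue_active() test;
         * otherwise a task queueing itself concurrently could be
         * missed and sleep forever.
         */
        smp_mb();
        if (waitqueue_active(&c->wq))
                wake_up(&c->wq);
}

The sleeper's prepare_to_wait() supplies the matching barrier, so the pair guarantees that either the waker sees the queued task or the sleeper sees the updated count.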
@@ -411,13 +528,40 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
411int btrfs_end_transaction(struct btrfs_trans_handle *trans, 528int btrfs_end_transaction(struct btrfs_trans_handle *trans,
412 struct btrfs_root *root) 529 struct btrfs_root *root)
413{ 530{
414 return __btrfs_end_transaction(trans, root, 0); 531 int ret;
532
533 ret = __btrfs_end_transaction(trans, root, 0, 1);
534 if (ret)
535 return ret;
536 return 0;
415} 537}
416 538
417int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 539int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 540 struct btrfs_root *root)
419{ 541{
420 return __btrfs_end_transaction(trans, root, 1); 542 int ret;
543
544 ret = __btrfs_end_transaction(trans, root, 1, 1);
545 if (ret)
546 return ret;
547 return 0;
548}
549
550int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
551 struct btrfs_root *root)
552{
553 int ret;
554
555 ret = __btrfs_end_transaction(trans, root, 0, 0);
556 if (ret)
557 return ret;
558 return 0;
559}
560
561int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562 struct btrfs_root *root)
563{
564 return __btrfs_end_transaction(trans, root, 1, 1);
421} 565}
422 566
423/* 567/*
@@ -643,9 +787,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
643 */ 787 */
644int btrfs_add_dead_root(struct btrfs_root *root) 788int btrfs_add_dead_root(struct btrfs_root *root)
645{ 789{
646 mutex_lock(&root->fs_info->trans_mutex); 790 spin_lock(&root->fs_info->trans_lock);
647 list_add(&root->root_list, &root->fs_info->dead_roots); 791 list_add(&root->root_list, &root->fs_info->dead_roots);
648 mutex_unlock(&root->fs_info->trans_mutex); 792 spin_unlock(&root->fs_info->trans_lock);
649 return 0; 793 return 0;
650} 794}
651 795
@@ -661,6 +805,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
661 int ret; 805 int ret;
662 int err = 0; 806 int err = 0;
663 807
808 spin_lock(&fs_info->fs_roots_radix_lock);
664 while (1) { 809 while (1) {
665 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, 810 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
666 (void **)gang, 0, 811 (void **)gang, 0,
@@ -673,13 +818,20 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
673 radix_tree_tag_clear(&fs_info->fs_roots_radix, 818 radix_tree_tag_clear(&fs_info->fs_roots_radix,
674 (unsigned long)root->root_key.objectid, 819 (unsigned long)root->root_key.objectid,
675 BTRFS_ROOT_TRANS_TAG); 820 BTRFS_ROOT_TRANS_TAG);
821 spin_unlock(&fs_info->fs_roots_radix_lock);
676 822
677 btrfs_free_log(trans, root); 823 btrfs_free_log(trans, root);
678 btrfs_update_reloc_root(trans, root); 824 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root); 825 btrfs_orphan_commit_root(trans, root);
680 826
827 btrfs_save_ino_cache(root, trans);
828
681 if (root->commit_root != root->node) { 829 if (root->commit_root != root->node) {
830 mutex_lock(&root->fs_commit_mutex);
682 switch_commit_root(root); 831 switch_commit_root(root);
832 btrfs_unpin_free_ino(root);
833 mutex_unlock(&root->fs_commit_mutex);
834
683 btrfs_set_root_node(&root->root_item, 835 btrfs_set_root_node(&root->root_item,
684 root->node); 836 root->node);
685 } 837 }
@@ -687,10 +839,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
687 err = btrfs_update_root(trans, fs_info->tree_root, 839 err = btrfs_update_root(trans, fs_info->tree_root,
688 &root->root_key, 840 &root->root_key,
689 &root->root_item); 841 &root->root_item);
842 spin_lock(&fs_info->fs_roots_radix_lock);
690 if (err) 843 if (err)
691 break; 844 break;
692 } 845 }
693 } 846 }
847 spin_unlock(&fs_info->fs_roots_radix_lock);
694 return err; 848 return err;
695} 849}
696 850
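commit_fs_roots() now holds fs_roots_radix_lock only around the radix-tree operations, dropping it for the per-root work and restarting the tagged gang lookup from index zero on each pass. A sketch of that iteration pattern over a tagged radix tree, with a hypothetical item type and DIRTY_TAG standing in for BTRFS_ROOT_TRANS_TAG:

#include <linux/kernel.h>
#include <linux/radix-tree.h>
#include <linux/spinlock.h>

#define DIRTY_TAG 0     /* illustrative tag number */

struct item {
        unsigned long index;    /* key the item is inserted under */
};

/* Stand-in for per-item work that may sleep, forcing the lock drop. */
static void process_one(struct item *it)
{
}

static void process_tagged(struct radix_tree_root *tree, spinlock_t *lock)
{
        struct item *gang[8];
        unsigned int ret, i;

        spin_lock(lock);
        while (1) {
                ret = radix_tree_gang_lookup_tag(tree, (void **)gang, 0,
                                                 ARRAY_SIZE(gang), DIRTY_TAG);
                if (!ret)
                        break;
                for (i = 0; i < ret; i++) {
                        /* Clear the tag first so the item is not found
                         * again when the lookup restarts from index 0. */
                        radix_tree_tag_clear(tree, gang[i]->index, DIRTY_TAG);
                        spin_unlock(lock);
                        process_one(gang[i]);
                        spin_lock(lock);
                }
        }
        spin_unlock(lock);
}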
@@ -720,104 +874,13 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
720 btrfs_btree_balance_dirty(info->tree_root, nr); 874 btrfs_btree_balance_dirty(info->tree_root, nr);
721 cond_resched(); 875 cond_resched();
722 876
723 if (root->fs_info->closing || ret != -EAGAIN) 877 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
724 break; 878 break;
725 } 879 }
726 root->defrag_running = 0; 880 root->defrag_running = 0;
727 return ret; 881 return ret;
728} 882}
729 883
730#if 0
731/*
732 * when dropping snapshots, we generate a ton of delayed refs, and it makes
733 * sense not to join the transaction while it is trying to flush the current
734 * queue of delayed refs out.
735 *
736 * This is used by the drop snapshot code only
737 */
738static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
739{
740 DEFINE_WAIT(wait);
741
742 mutex_lock(&info->trans_mutex);
743 while (info->running_transaction &&
744 info->running_transaction->delayed_refs.flushing) {
745 prepare_to_wait(&info->transaction_wait, &wait,
746 TASK_UNINTERRUPTIBLE);
747 mutex_unlock(&info->trans_mutex);
748
749 schedule();
750
751 mutex_lock(&info->trans_mutex);
752 finish_wait(&info->transaction_wait, &wait);
753 }
754 mutex_unlock(&info->trans_mutex);
755 return 0;
756}
757
758/*
759 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
760 * all of them
761 */
762int btrfs_drop_dead_root(struct btrfs_root *root)
763{
764 struct btrfs_trans_handle *trans;
765 struct btrfs_root *tree_root = root->fs_info->tree_root;
766 unsigned long nr;
767 int ret;
768
769 while (1) {
770 /*
771 * we don't want to jump in and create a bunch of
772 * delayed refs if the transaction is starting to close
773 */
774 wait_transaction_pre_flush(tree_root->fs_info);
775 trans = btrfs_start_transaction(tree_root, 1);
776
777 /*
778 * we've joined a transaction, make sure it isn't
779 * closing right now
780 */
781 if (trans->transaction->delayed_refs.flushing) {
782 btrfs_end_transaction(trans, tree_root);
783 continue;
784 }
785
786 ret = btrfs_drop_snapshot(trans, root);
787 if (ret != -EAGAIN)
788 break;
789
790 ret = btrfs_update_root(trans, tree_root,
791 &root->root_key,
792 &root->root_item);
793 if (ret)
794 break;
795
796 nr = trans->blocks_used;
797 ret = btrfs_end_transaction(trans, tree_root);
798 BUG_ON(ret);
799
800 btrfs_btree_balance_dirty(tree_root, nr);
801 cond_resched();
802 }
803 BUG_ON(ret);
804
805 ret = btrfs_del_root(trans, tree_root, &root->root_key);
806 BUG_ON(ret);
807
808 nr = trans->blocks_used;
809 ret = btrfs_end_transaction(trans, tree_root);
810 BUG_ON(ret);
811
812 free_extent_buffer(root->node);
813 free_extent_buffer(root->commit_root);
814 kfree(root);
815
816 btrfs_btree_balance_dirty(tree_root, nr);
817 return ret;
818}
819#endif
820
821/* 884/*
822 * new snapshots need to be created at a very specific time in the 885 * new snapshots need to be created at a very specific time in the
823 * transaction commit. This does the actual creation 886 * transaction commit. This does the actual creation
@@ -832,14 +895,15 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
832 struct btrfs_root *root = pending->root; 895 struct btrfs_root *root = pending->root;
833 struct btrfs_root *parent_root; 896 struct btrfs_root *parent_root;
834 struct inode *parent_inode; 897 struct inode *parent_inode;
898 struct dentry *parent;
835 struct dentry *dentry; 899 struct dentry *dentry;
836 struct extent_buffer *tmp; 900 struct extent_buffer *tmp;
837 struct extent_buffer *old; 901 struct extent_buffer *old;
838 int ret; 902 int ret;
839 int retries = 0;
840 u64 to_reserve = 0; 903 u64 to_reserve = 0;
841 u64 index = 0; 904 u64 index = 0;
842 u64 objectid; 905 u64 objectid;
906 u64 root_flags;
843 907
844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 908 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
845 if (!new_root_item) { 909 if (!new_root_item) {
@@ -847,7 +911,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
847 goto fail; 911 goto fail;
848 } 912 }
849 913
850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 914 ret = btrfs_find_free_objectid(tree_root, &objectid);
851 if (ret) { 915 if (ret) {
852 pending->error = ret; 916 pending->error = ret;
853 goto fail; 917 goto fail;
@@ -858,7 +922,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
858 922
859 if (to_reserve > 0) { 923 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 924 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries); 925 to_reserve);
862 if (ret) { 926 if (ret) {
863 pending->error = ret; 927 pending->error = ret;
864 goto fail; 928 goto fail;
@@ -872,7 +936,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
872 trans->block_rsv = &pending->block_rsv; 936 trans->block_rsv = &pending->block_rsv;
873 937
874 dentry = pending->dentry; 938 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode; 939 parent = dget_parent(dentry);
940 parent_inode = parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root; 941 parent_root = BTRFS_I(parent_inode)->root;
877 record_root_in_trans(trans, parent_root); 942 record_root_in_trans(trans, parent_root);
878 943
@@ -883,7 +948,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
883 BUG_ON(ret); 948 BUG_ON(ret);
884 ret = btrfs_insert_dir_item(trans, parent_root, 949 ret = btrfs_insert_dir_item(trans, parent_root,
885 dentry->d_name.name, dentry->d_name.len, 950 dentry->d_name.name, dentry->d_name.len,
886 parent_inode->i_ino, &key, 951 parent_inode, &key,
887 BTRFS_FT_DIR, index); 952 BTRFS_FT_DIR, index);
888 BUG_ON(ret); 953 BUG_ON(ret);
889 954
@@ -892,9 +957,26 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
892 ret = btrfs_update_inode(trans, parent_root, parent_inode); 957 ret = btrfs_update_inode(trans, parent_root, parent_inode);
893 BUG_ON(ret); 958 BUG_ON(ret);
894 959
960 /*
961 * pull in the delayed directory update
962 * and the delayed inode item
963 * otherwise we corrupt the FS during
964 * snapshot
965 */
966 ret = btrfs_run_delayed_items(trans, root);
967 BUG_ON(ret);
968
895 record_root_in_trans(trans, root); 969 record_root_in_trans(trans, root);
896 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 970 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
897 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 971 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
972 btrfs_check_and_init_root_item(new_root_item);
973
974 root_flags = btrfs_root_flags(new_root_item);
975 if (pending->readonly)
976 root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
977 else
978 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
979 btrfs_set_root_flags(new_root_item, root_flags);
898 980
899 old = btrfs_lock_root_node(root); 981 old = btrfs_lock_root_node(root);
900 btrfs_cow_block(trans, root, old, NULL, 0, &old); 982 btrfs_cow_block(trans, root, old, NULL, 0, &old);
@@ -917,9 +999,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
917 */ 999 */
918 ret = btrfs_add_root_ref(trans, tree_root, objectid, 1000 ret = btrfs_add_root_ref(trans, tree_root, objectid,
919 parent_root->root_key.objectid, 1001 parent_root->root_key.objectid,
920 parent_inode->i_ino, index, 1002 btrfs_ino(parent_inode), index,
921 dentry->d_name.name, dentry->d_name.len); 1003 dentry->d_name.name, dentry->d_name.len);
922 BUG_ON(ret); 1004 BUG_ON(ret);
1005 dput(parent);
923 1006
924 key.offset = (u64)-1; 1007 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); 1008 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
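create_pending_snapshot() switches from dereferencing dentry->d_parent, which a concurrent rename can change underneath it, to dget_parent(), which returns a referenced parent that stays stable until the matching dput() after the root ref is inserted. The helper pattern in isolation (pin_parent_inode is an illustrative wrapper, not a kernel API):

#include <linux/dcache.h>
#include <linux/fs.h>

/*
 * Pin the parent: the dentry returned by dget_parent() is referenced
 * and remains valid until the caller's dput(), unlike a raw read of
 * dentry->d_parent.
 */
static struct inode *pin_parent_inode(struct dentry *dentry,
                                      struct dentry **parent_ret)
{
        struct dentry *parent = dget_parent(dentry);

        *parent_ret = parent;
        return parent->d_inode;
}

A caller pairs it as parent_inode = pin_parent_inode(dentry, &parent); ... dput(parent); once the inode is no longer needed.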
@@ -966,33 +1049,152 @@ static void update_super_roots(struct btrfs_root *root)
966 super->root = root_item->bytenr; 1049 super->root = root_item->bytenr;
967 super->generation = root_item->generation; 1050 super->generation = root_item->generation;
968 super->root_level = root_item->level; 1051 super->root_level = root_item->level;
1052 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
1053 super->cache_generation = root_item->generation;
969} 1054}
970 1055
971int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1056int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
972{ 1057{
973 int ret = 0; 1058 int ret = 0;
974 spin_lock(&info->new_trans_lock); 1059 spin_lock(&info->trans_lock);
975 if (info->running_transaction) 1060 if (info->running_transaction)
976 ret = info->running_transaction->in_commit; 1061 ret = info->running_transaction->in_commit;
977 spin_unlock(&info->new_trans_lock); 1062 spin_unlock(&info->trans_lock);
978 return ret; 1063 return ret;
979} 1064}
980 1065
981int btrfs_transaction_blocked(struct btrfs_fs_info *info) 1066int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{ 1067{
983 int ret = 0; 1068 int ret = 0;
984 spin_lock(&info->new_trans_lock); 1069 spin_lock(&info->trans_lock);
985 if (info->running_transaction) 1070 if (info->running_transaction)
986 ret = info->running_transaction->blocked; 1071 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock); 1072 spin_unlock(&info->trans_lock);
988 return ret; 1073 return ret;
989} 1074}
990 1075
1076/*
1077 * wait for the current transaction commit to start and block subsequent
1078 * transaction joins
1079 */
1080static void wait_current_trans_commit_start(struct btrfs_root *root,
1081 struct btrfs_transaction *trans)
1082{
1083 DEFINE_WAIT(wait);
1084
1085 if (trans->in_commit)
1086 return;
1087
1088 while (1) {
1089 prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
1090 TASK_UNINTERRUPTIBLE);
1091 if (trans->in_commit) {
1092 finish_wait(&root->fs_info->transaction_blocked_wait,
1093 &wait);
1094 break;
1095 }
1096 schedule();
1097 finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
1098 }
1099}
1100
1101/*
1102 * wait for the current transaction to start and then become unblocked.
1103 * caller holds ref.
1104 */
1105static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1106 struct btrfs_transaction *trans)
1107{
1108 DEFINE_WAIT(wait);
1109
1110 if (trans->commit_done || (trans->in_commit && !trans->blocked))
1111 return;
1112
1113 while (1) {
1114 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
1115 TASK_UNINTERRUPTIBLE);
1116 if (trans->commit_done ||
1117 (trans->in_commit && !trans->blocked)) {
1118 finish_wait(&root->fs_info->transaction_wait,
1119 &wait);
1120 break;
1121 }
1122 schedule();
1123 finish_wait(&root->fs_info->transaction_wait,
1124 &wait);
1125 }
1126}
1127
1128/*
1129 * commit transactions asynchronously. once btrfs_commit_transaction_async
1130 * returns, any subsequent transaction will not be allowed to join.
1131 */
1132struct btrfs_async_commit {
1133 struct btrfs_trans_handle *newtrans;
1134 struct btrfs_root *root;
1135 struct delayed_work work;
1136};
1137
1138static void do_async_commit(struct work_struct *work)
1139{
1140 struct btrfs_async_commit *ac =
1141 container_of(work, struct btrfs_async_commit, work.work);
1142
1143 btrfs_commit_transaction(ac->newtrans, ac->root);
1144 kfree(ac);
1145}
1146
1147int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1148 struct btrfs_root *root,
1149 int wait_for_unblock)
1150{
1151 struct btrfs_async_commit *ac;
1152 struct btrfs_transaction *cur_trans;
1153
1154 ac = kmalloc(sizeof(*ac), GFP_NOFS);
1155 if (!ac)
1156 return -ENOMEM;
1157
1158 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1159 ac->root = root;
1160 ac->newtrans = btrfs_join_transaction(root);
1161 if (IS_ERR(ac->newtrans)) {
1162 int err = PTR_ERR(ac->newtrans);
1163 kfree(ac);
1164 return err;
1165 }
1166
1167 /* take transaction reference */
1168 cur_trans = trans->transaction;
1169 atomic_inc(&cur_trans->use_count);
1170
1171 btrfs_end_transaction(trans, root);
1172 schedule_delayed_work(&ac->work, 0);
1173
1174 /* wait for transaction to start and unblock */
1175 if (wait_for_unblock)
1176 wait_current_trans_commit_start_and_unblock(root, cur_trans);
1177 else
1178 wait_current_trans_commit_start(root, cur_trans);
1179
1180 if (current->journal_info == trans)
1181 current->journal_info = NULL;
1182
1183 put_transaction(cur_trans);
1184 return 0;
1185}
1186
1187/*
1188 * btrfs_transaction state sequence:
1189 * in_commit = 0, blocked = 0 (initial)
1190 * in_commit = 1, blocked = 1
1191 * blocked = 0
1192 * commit_done = 1
1193 */
991int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1194int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
992 struct btrfs_root *root) 1195 struct btrfs_root *root)
993{ 1196{
994 unsigned long joined = 0; 1197 unsigned long joined = 0;
995 unsigned long timeout = 1;
996 struct btrfs_transaction *cur_trans; 1198 struct btrfs_transaction *cur_trans;
997 struct btrfs_transaction *prev_trans = NULL; 1199 struct btrfs_transaction *prev_trans = NULL;
998 DEFINE_WAIT(wait); 1200 DEFINE_WAIT(wait);
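btrfs_commit_transaction_async() introduced above offloads the commit to a delayed_work item so the caller only waits for the commit to reach the state it cares about (started, or started and unblocked). A sketch of the offload half, with struct ctx and commit_ctx() as hypothetical stand-ins for the transaction handle and btrfs_commit_transaction():

#include <linux/slab.h>
#include <linux/workqueue.h>

struct ctx;                             /* hypothetical per-commit state */
static void commit_ctx(struct ctx *c)   /* stand-in for the real commit */
{
}

struct async_commit {
        struct delayed_work work;
        struct ctx *ctx;
};

static void do_async(struct work_struct *work)
{
        /* delayed_work embeds a work_struct, hence the nested .work.work */
        struct async_commit *ac =
                container_of(work, struct async_commit, work.work);

        commit_ctx(ac->ctx);            /* runs in kworker context */
        kfree(ac);
}

static int kick_async_commit(struct ctx *c)
{
        struct async_commit *ac = kmalloc(sizeof(*ac), GFP_NOFS);

        if (!ac)
                return -ENOMEM;
        INIT_DELAYED_WORK(&ac->work, do_async);
        ac->ctx = c;
        schedule_delayed_work(&ac->work, 0);    /* zero delay: run ASAP */
        return 0;
}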
@@ -1021,36 +1223,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1021 ret = btrfs_run_delayed_refs(trans, root, 0); 1223 ret = btrfs_run_delayed_refs(trans, root, 0);
1022 BUG_ON(ret); 1224 BUG_ON(ret);
1023 1225
1024 mutex_lock(&root->fs_info->trans_mutex); 1226 spin_lock(&cur_trans->commit_lock);
1025 if (cur_trans->in_commit) { 1227 if (cur_trans->in_commit) {
1026 cur_trans->use_count++; 1228 spin_unlock(&cur_trans->commit_lock);
1027 mutex_unlock(&root->fs_info->trans_mutex); 1229 atomic_inc(&cur_trans->use_count);
1028 btrfs_end_transaction(trans, root); 1230 btrfs_end_transaction(trans, root);
1029 1231
1030 ret = wait_for_commit(root, cur_trans); 1232 ret = wait_for_commit(root, cur_trans);
1031 BUG_ON(ret); 1233 BUG_ON(ret);
1032 1234
1033 mutex_lock(&root->fs_info->trans_mutex);
1034 put_transaction(cur_trans); 1235 put_transaction(cur_trans);
1035 mutex_unlock(&root->fs_info->trans_mutex);
1036 1236
1037 return 0; 1237 return 0;
1038 } 1238 }
1039 1239
1040 trans->transaction->in_commit = 1; 1240 trans->transaction->in_commit = 1;
1041 trans->transaction->blocked = 1; 1241 trans->transaction->blocked = 1;
1242 spin_unlock(&cur_trans->commit_lock);
1243 wake_up(&root->fs_info->transaction_blocked_wait);
1244
1245 spin_lock(&root->fs_info->trans_lock);
1042 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1246 if (cur_trans->list.prev != &root->fs_info->trans_list) {
1043 prev_trans = list_entry(cur_trans->list.prev, 1247 prev_trans = list_entry(cur_trans->list.prev,
1044 struct btrfs_transaction, list); 1248 struct btrfs_transaction, list);
1045 if (!prev_trans->commit_done) { 1249 if (!prev_trans->commit_done) {
1046 prev_trans->use_count++; 1250 atomic_inc(&prev_trans->use_count);
1047 mutex_unlock(&root->fs_info->trans_mutex); 1251 spin_unlock(&root->fs_info->trans_lock);
1048 1252
1049 wait_for_commit(root, prev_trans); 1253 wait_for_commit(root, prev_trans);
1050 1254
1051 mutex_lock(&root->fs_info->trans_mutex);
1052 put_transaction(prev_trans); 1255 put_transaction(prev_trans);
1256 } else {
1257 spin_unlock(&root->fs_info->trans_lock);
1053 } 1258 }
1259 } else {
1260 spin_unlock(&root->fs_info->trans_lock);
1054 } 1261 }
1055 1262
1056 if (now < cur_trans->start_time || now - cur_trans->start_time < 1) 1263 if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
@@ -1058,17 +1265,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1058 1265
1059 do { 1266 do {
1060 int snap_pending = 0; 1267 int snap_pending = 0;
1268
1061 joined = cur_trans->num_joined; 1269 joined = cur_trans->num_joined;
1062 if (!list_empty(&trans->transaction->pending_snapshots)) 1270 if (!list_empty(&trans->transaction->pending_snapshots))
1063 snap_pending = 1; 1271 snap_pending = 1;
1064 1272
1065 WARN_ON(cur_trans != trans->transaction); 1273 WARN_ON(cur_trans != trans->transaction);
1066 if (cur_trans->num_writers > 1)
1067 timeout = MAX_SCHEDULE_TIMEOUT;
1068 else if (should_grow)
1069 timeout = 1;
1070
1071 mutex_unlock(&root->fs_info->trans_mutex);
1072 1274
1073 if (flush_on_commit || snap_pending) { 1275 if (flush_on_commit || snap_pending) {
1074 btrfs_start_delalloc_inodes(root, 1); 1276 btrfs_start_delalloc_inodes(root, 1);
@@ -1076,6 +1278,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1076 BUG_ON(ret); 1278 BUG_ON(ret);
1077 } 1279 }
1078 1280
1281 ret = btrfs_run_delayed_items(trans, root);
1282 BUG_ON(ret);
1283
1079 /* 1284 /*
1080 * rename doesn't use btrfs_join_transaction, so, once we 1285
1081 * set the transaction to blocked above, we aren't going 1286 * set the transaction to blocked above, we aren't going
@@ -1088,23 +1293,51 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1088 prepare_to_wait(&cur_trans->writer_wait, &wait, 1293 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE); 1294 TASK_UNINTERRUPTIBLE);
1090 1295
1091 smp_mb(); 1296 if (atomic_read(&cur_trans->num_writers) > 1)
1092 if (cur_trans->num_writers > 1 || should_grow) 1297 schedule_timeout(MAX_SCHEDULE_TIMEOUT);
1093 schedule_timeout(timeout); 1298 else if (should_grow)
1299 schedule_timeout(1);
1094 1300
1095 mutex_lock(&root->fs_info->trans_mutex);
1096 finish_wait(&cur_trans->writer_wait, &wait); 1301 finish_wait(&cur_trans->writer_wait, &wait);
1097 } while (cur_trans->num_writers > 1 || 1302 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1098 (should_grow && cur_trans->num_joined != joined)); 1303 (should_grow && cur_trans->num_joined != joined));
1099 1304
1305 /*
1306 * Ok now we need to make sure to block out any other joins while we
1307 * commit the transaction. We could have started a join before setting
1308 * no_join so make sure to wait for num_writers to == 1 again.
1309 */
1310 spin_lock(&root->fs_info->trans_lock);
1311 root->fs_info->trans_no_join = 1;
1312 spin_unlock(&root->fs_info->trans_lock);
1313 wait_event(cur_trans->writer_wait,
1314 atomic_read(&cur_trans->num_writers) == 1);
1315
1316 /*
1317 * the reloc mutex makes sure that we stop
1318 * the balancing code from coming in and moving
1319 * extents around in the middle of the commit
1320 */
1321 mutex_lock(&root->fs_info->reloc_mutex);
1322
1323 ret = btrfs_run_delayed_items(trans, root);
1324 BUG_ON(ret);
1325
1100 ret = create_pending_snapshots(trans, root->fs_info); 1326 ret = create_pending_snapshots(trans, root->fs_info);
1101 BUG_ON(ret); 1327 BUG_ON(ret);
1102 1328
1103 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1329 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1104 BUG_ON(ret); 1330 BUG_ON(ret);
1105 1331
1332 /*
1333 * make sure none of the code above managed to slip in a
1334 * delayed item
1335 */
1336 btrfs_assert_delayed_root_empty(root);
1337
1106 WARN_ON(cur_trans != trans->transaction); 1338 WARN_ON(cur_trans != trans->transaction);
1107 1339
1340 btrfs_scrub_pause(root);
1108 /* btrfs_commit_tree_roots is responsible for getting the 1341 /* btrfs_commit_tree_roots is responsible for getting the
1109 * various roots consistent with each other. Every pointer 1342 * various roots consistent with each other. Every pointer
1110 * in the tree of tree roots has to point to the most up to date 1343 * in the tree of tree roots has to point to the most up to date
@@ -1134,9 +1367,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1134 btrfs_prepare_extent_commit(trans, root); 1367 btrfs_prepare_extent_commit(trans, root);
1135 1368
1136 cur_trans = root->fs_info->running_transaction; 1369 cur_trans = root->fs_info->running_transaction;
1137 spin_lock(&root->fs_info->new_trans_lock);
1138 root->fs_info->running_transaction = NULL;
1139 spin_unlock(&root->fs_info->new_trans_lock);
1140 1370
1141 btrfs_set_root_node(&root->fs_info->tree_root->root_item, 1371 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1142 root->fs_info->tree_root->node); 1372 root->fs_info->tree_root->node);
@@ -1157,10 +1387,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1157 sizeof(root->fs_info->super_copy)); 1387 sizeof(root->fs_info->super_copy));
1158 1388
1159 trans->transaction->blocked = 0; 1389 trans->transaction->blocked = 0;
1390 spin_lock(&root->fs_info->trans_lock);
1391 root->fs_info->running_transaction = NULL;
1392 root->fs_info->trans_no_join = 0;
1393 spin_unlock(&root->fs_info->trans_lock);
1394 mutex_unlock(&root->fs_info->reloc_mutex);
1160 1395
1161 wake_up(&root->fs_info->transaction_wait); 1396 wake_up(&root->fs_info->transaction_wait);
1162 1397
1163 mutex_unlock(&root->fs_info->trans_mutex);
1164 ret = btrfs_write_and_wait_transaction(trans, root); 1398 ret = btrfs_write_and_wait_transaction(trans, root);
1165 BUG_ON(ret); 1399 BUG_ON(ret);
1166 write_ctree_super(trans, root, 0); 1400 write_ctree_super(trans, root, 0);
@@ -1173,18 +1407,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1173 1407
1174 btrfs_finish_extent_commit(trans, root); 1408 btrfs_finish_extent_commit(trans, root);
1175 1409
1176 mutex_lock(&root->fs_info->trans_mutex);
1177
1178 cur_trans->commit_done = 1; 1410 cur_trans->commit_done = 1;
1179 1411
1180 root->fs_info->last_trans_committed = cur_trans->transid; 1412 root->fs_info->last_trans_committed = cur_trans->transid;
1181 1413
1182 wake_up(&cur_trans->commit_wait); 1414 wake_up(&cur_trans->commit_wait);
1183 1415
1416 spin_lock(&root->fs_info->trans_lock);
1417 list_del_init(&cur_trans->list);
1418 spin_unlock(&root->fs_info->trans_lock);
1419
1184 put_transaction(cur_trans); 1420 put_transaction(cur_trans);
1185 put_transaction(cur_trans); 1421 put_transaction(cur_trans);
1186 1422
1187 mutex_unlock(&root->fs_info->trans_mutex); 1423 trace_btrfs_transaction_commit(root);
1424
1425 btrfs_scrub_continue(root);
1188 1426
1189 if (current->journal_info == trans) 1427 if (current->journal_info == trans)
1190 current->journal_info = NULL; 1428 current->journal_info = NULL;
@@ -1205,14 +1443,16 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1205 LIST_HEAD(list); 1443 LIST_HEAD(list);
1206 struct btrfs_fs_info *fs_info = root->fs_info; 1444 struct btrfs_fs_info *fs_info = root->fs_info;
1207 1445
1208 mutex_lock(&fs_info->trans_mutex); 1446 spin_lock(&fs_info->trans_lock);
1209 list_splice_init(&fs_info->dead_roots, &list); 1447 list_splice_init(&fs_info->dead_roots, &list);
1210 mutex_unlock(&fs_info->trans_mutex); 1448 spin_unlock(&fs_info->trans_lock);
1211 1449
1212 while (!list_empty(&list)) { 1450 while (!list_empty(&list)) {
1213 root = list_entry(list.next, struct btrfs_root, root_list); 1451 root = list_entry(list.next, struct btrfs_root, root_list);
1214 list_del(&root->root_list); 1452 list_del(&root->root_list);
1215 1453
1454 btrfs_kill_all_delayed_nodes(root);
1455
1216 if (btrfs_header_backref_rev(root->node) < 1456 if (btrfs_header_backref_rev(root->node) <
1217 BTRFS_MIXED_BACKREF_REV) 1457 BTRFS_MIXED_BACKREF_REV)
1218 btrfs_drop_snapshot(root, NULL, 0); 1458 btrfs_drop_snapshot(root, NULL, 0);
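The commit path above also gains the trans_no_join gate: the committer sets it under trans_lock, which makes join_transaction() return -EBUSY to newcomers, then waits for num_writers to fall to one before taking the reloc_mutex. A condensed sketch of the gate (fs_state is illustrative):

#include <linux/atomic.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct fs_state {
        spinlock_t lock;
        int no_join;            /* plays the role of trans_no_join */
        atomic_t num_writers;
        wait_queue_head_t writer_wait;
};

/* Commit side: close the gate, then drain every other writer. */
static void block_joins_and_drain(struct fs_state *fs)
{
        spin_lock(&fs->lock);
        fs->no_join = 1;        /* joiners now bounce with -EBUSY */
        spin_unlock(&fs->lock);

        /* The committer still holds one writer count itself, hence == 1. */
        wait_event(fs->writer_wait, atomic_read(&fs->num_writers) == 1);
}

Joiners that raced in before the flag was set are drained by the wait_event(); TRANS_JOIN_NOLOCK callers pass nofail and are allowed through, matching the hunks above.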
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e104986d0bfd..02564e6230ac 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,11 +27,13 @@ struct btrfs_transaction {
27 * total writers in this transaction, it must be zero before the 27 * total writers in this transaction, it must be zero before the
28 * transaction can end 28 * transaction can end
29 */ 29 */
30 unsigned long num_writers; 30 atomic_t num_writers;
31 atomic_t use_count;
31 32
32 unsigned long num_joined; 33 unsigned long num_joined;
34
35 spinlock_t commit_lock;
33 int in_commit; 36 int in_commit;
34 int use_count;
35 int commit_done; 37 int commit_done;
36 int blocked; 38 int blocked;
37 struct list_head list; 39 struct list_head list;
@@ -45,13 +47,14 @@ struct btrfs_transaction {
45 47
46struct btrfs_trans_handle { 48struct btrfs_trans_handle {
47 u64 transid; 49 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved; 50 u64 bytes_reserved;
51 unsigned long use_count;
50 unsigned long blocks_reserved; 52 unsigned long blocks_reserved;
51 unsigned long blocks_used; 53 unsigned long blocks_used;
52 unsigned long delayed_ref_updates; 54 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction; 55 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv; 56 struct btrfs_block_rsv *block_rsv;
57 struct btrfs_block_rsv *orig_rsv;
55}; 58};
56 59
57struct btrfs_pending_snapshot { 60struct btrfs_pending_snapshot {
@@ -62,22 +65,10 @@ struct btrfs_pending_snapshot {
62 struct btrfs_block_rsv block_rsv; 65 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reservation for relocation */ 66 /* extra metadata reservation for relocation */
64 int error; 67 int error;
68 bool readonly;
65 struct list_head list; 69 struct list_head list;
66}; 70};
67 71
68static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
69 struct inode *inode)
70{
71 trans->block_group = BTRFS_I(inode)->block_group;
72}
73
74static inline void btrfs_update_inode_block_group(
75 struct btrfs_trans_handle *trans,
76 struct inode *inode)
77{
78 BTRFS_I(inode)->block_group = trans->block_group;
79}
80
81static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, 72static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
82 struct inode *inode) 73 struct inode *inode)
83{ 74{
@@ -87,25 +78,29 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
87 78
88int btrfs_end_transaction(struct btrfs_trans_handle *trans, 79int btrfs_end_transaction(struct btrfs_trans_handle *trans,
89 struct btrfs_root *root); 80 struct btrfs_root *root);
81int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
82 struct btrfs_root *root);
90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 83struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
91 int num_items); 84 int num_items);
92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 85struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
93 int num_blocks); 86struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 87struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
95 int num_blocks); 88int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 89int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root); 90 struct btrfs_root *root);
98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
99 struct btrfs_root *root);
100 91
101int btrfs_add_dead_root(struct btrfs_root *root); 92int btrfs_add_dead_root(struct btrfs_root *root);
102int btrfs_drop_dead_root(struct btrfs_root *root);
103int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); 93int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
104int btrfs_clean_old_snapshots(struct btrfs_root *root); 94int btrfs_clean_old_snapshots(struct btrfs_root *root);
105int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 95int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
106 struct btrfs_root *root); 96 struct btrfs_root *root);
97int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
98 struct btrfs_root *root,
99 int wait_for_unblock);
107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 100int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 101 struct btrfs_root *root);
102int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, 104int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root); 105 struct btrfs_root *root);
111void btrfs_throttle(struct btrfs_root *root); 106void btrfs_throttle(struct btrfs_root *root);
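For context on the prototype changes above: btrfs_join_transaction() and
btrfs_start_ioctl_transaction() no longer take a block count, only the root.
A minimal caller sketch under the new prototypes (the caller shown here is
hypothetical, and it assumes the join path, like btrfs_start_transaction(),
reports failure via ERR_PTR):

    struct btrfs_trans_handle *trans;

    trans = btrfs_join_transaction(root);   /* no num_blocks argument */
    if (IS_ERR(trans))
            return PTR_ERR(trans);
    /* ... modify tree items under this handle ... */
    return btrfs_end_transaction(trans, root);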
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f7ac8e013ed7..3b580ee8ab1d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -36,7 +36,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
36 int ret = 0; 36 int ret = 0;
37 int wret; 37 int wret;
38 int level; 38 int level;
39 int orig_level;
40 int is_extent = 0; 39 int is_extent = 0;
41 int next_key_ret = 0; 40 int next_key_ret = 0;
42 u64 last_ret = 0; 41 u64 last_ret = 0;
@@ -64,7 +63,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
64 return -ENOMEM; 63 return -ENOMEM;
65 64
66 level = btrfs_header_level(root->node); 65 level = btrfs_header_level(root->node);
67 orig_level = level;
68 66
69 if (level == 0) 67 if (level == 0)
70 goto out; 68 goto out;
@@ -99,7 +97,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
99 ret = 0; 97 ret = 0;
100 goto out; 98 goto out;
101 } 99 }
102 btrfs_release_path(root, path); 100 btrfs_release_path(path);
103 wret = btrfs_search_slot(trans, root, &key, path, 0, 1); 101 wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
104 102
105 if (wret < 0) { 103 if (wret < 0) {
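The btrfs_release_path() calls in this hunk (and throughout the rest of the
series) simply lose their root argument; the search/release pattern itself is
unchanged. A minimal sketch of that pattern under the new signature, using a
made-up key purely for illustration:

    struct btrfs_path *path;
    struct btrfs_key key;
    int ret;

    key.objectid = 256;                  /* hypothetical objectid */
    key.type = BTRFS_INODE_ITEM_KEY;
    key.offset = 0;

    path = btrfs_alloc_path();
    if (!path)
            return -ENOMEM;
    ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    if (ret < 0)
            goto out;
    /* ... read the item out of path->nodes[0] ... */
out:
    btrfs_release_path(path);            /* root argument dropped */
    btrfs_free_path(path);
    return ret;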
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fb102a9aee9c..4ce8a9f41d1e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -333,11 +333,17 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
333 goto insert; 333 goto insert;
334 334
335 if (item_size == 0) { 335 if (item_size == 0) {
336 btrfs_release_path(root, path); 336 btrfs_release_path(path);
337 return 0; 337 return 0;
338 } 338 }
339 dst_copy = kmalloc(item_size, GFP_NOFS); 339 dst_copy = kmalloc(item_size, GFP_NOFS);
340 src_copy = kmalloc(item_size, GFP_NOFS); 340 src_copy = kmalloc(item_size, GFP_NOFS);
341 if (!dst_copy || !src_copy) {
342 btrfs_release_path(path);
343 kfree(dst_copy);
344 kfree(src_copy);
345 return -ENOMEM;
346 }
341 347
342 read_extent_buffer(eb, src_copy, src_ptr, item_size); 348 read_extent_buffer(eb, src_copy, src_ptr, item_size);
343 349
@@ -355,13 +361,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
355 * sync 361 * sync
356 */ 362 */
357 if (ret == 0) { 363 if (ret == 0) {
358 btrfs_release_path(root, path); 364 btrfs_release_path(path);
359 return 0; 365 return 0;
360 } 366 }
361 367
362 } 368 }
363insert: 369insert:
364 btrfs_release_path(root, path); 370 btrfs_release_path(path);
365 /* try to insert the key into the destination tree */ 371 /* try to insert the key into the destination tree */
366 ret = btrfs_insert_empty_item(trans, root, path, 372 ret = btrfs_insert_empty_item(trans, root, path,
367 key, item_size); 373 key, item_size);
@@ -376,7 +382,6 @@ insert:
376 } else if (found_size < item_size) { 382 } else if (found_size < item_size) {
377 ret = btrfs_extend_item(trans, root, path, 383 ret = btrfs_extend_item(trans, root, path,
378 item_size - found_size); 384 item_size - found_size);
379 BUG_ON(ret);
380 } 385 }
381 } else if (ret) { 386 } else if (ret) {
382 return ret; 387 return ret;
@@ -432,7 +437,7 @@ insert:
432 } 437 }
433no_copy: 438no_copy:
434 btrfs_mark_buffer_dirty(path->nodes[0]); 439 btrfs_mark_buffer_dirty(path->nodes[0]);
435 btrfs_release_path(root, path); 440 btrfs_release_path(path);
436 return 0; 441 return 0;
437} 442}
438 443
@@ -513,7 +518,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
513 * file. This must be done before the btrfs_drop_extents run 518 * file. This must be done before the btrfs_drop_extents run
514 * so we don't try to drop this extent. 519 * so we don't try to drop this extent.
515 */ 520 */
516 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 521 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
517 start, 0); 522 start, 0);
518 523
519 if (ret == 0 && 524 if (ret == 0 &&
@@ -538,11 +543,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
538 * we don't have to do anything 543 * we don't have to do anything
539 */ 544 */
540 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 545 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
541 btrfs_release_path(root, path); 546 btrfs_release_path(path);
542 goto out; 547 goto out;
543 } 548 }
544 } 549 }
545 btrfs_release_path(root, path); 550 btrfs_release_path(path);
546 551
547 saved_nbytes = inode_get_bytes(inode); 552 saved_nbytes = inode_get_bytes(inode);
548 /* drop any overlapping extents */ 553 /* drop any overlapping extents */
@@ -584,6 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
584 ins.objectid, ins.offset, 589 ins.objectid, ins.offset,
585 0, root->root_key.objectid, 590 0, root->root_key.objectid,
586 key->objectid, offset); 591 key->objectid, offset);
592 BUG_ON(ret);
587 } else { 593 } else {
588 /* 594 /*
589 * insert the extent pointer in the extent 595 * insert the extent pointer in the extent
@@ -594,7 +600,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
594 key->objectid, offset, &ins); 600 key->objectid, offset, &ins);
595 BUG_ON(ret); 601 BUG_ON(ret);
596 } 602 }
597 btrfs_release_path(root, path); 603 btrfs_release_path(path);
598 604
599 if (btrfs_file_extent_compression(eb, item)) { 605 if (btrfs_file_extent_compression(eb, item)) {
600 csum_start = ins.objectid; 606 csum_start = ins.objectid;
@@ -608,7 +614,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
608 614
609 ret = btrfs_lookup_csums_range(root->log_root, 615 ret = btrfs_lookup_csums_range(root->log_root,
610 csum_start, csum_end - 1, 616 csum_start, csum_end - 1,
611 &ordered_sums); 617 &ordered_sums, 0);
612 BUG_ON(ret); 618 BUG_ON(ret);
613 while (!list_empty(&ordered_sums)) { 619 while (!list_empty(&ordered_sums)) {
614 struct btrfs_ordered_sum *sums; 620 struct btrfs_ordered_sum *sums;
@@ -623,7 +629,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
623 kfree(sums); 629 kfree(sums);
624 } 630 }
625 } else { 631 } else {
626 btrfs_release_path(root, path); 632 btrfs_release_path(path);
627 } 633 }
628 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 634 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
629 /* inline extents are easy, we just overwrite them */ 635 /* inline extents are easy, we just overwrite them */
@@ -665,11 +671,17 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
665 btrfs_dir_item_key_to_cpu(leaf, di, &location); 671 btrfs_dir_item_key_to_cpu(leaf, di, &location);
666 name_len = btrfs_dir_name_len(leaf, di); 672 name_len = btrfs_dir_name_len(leaf, di);
667 name = kmalloc(name_len, GFP_NOFS); 673 name = kmalloc(name_len, GFP_NOFS);
674 if (!name)
675 return -ENOMEM;
676
668 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 677 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
669 btrfs_release_path(root, path); 678 btrfs_release_path(path);
670 679
671 inode = read_one_inode(root, location.objectid); 680 inode = read_one_inode(root, location.objectid);
672 BUG_ON(!inode); 681 if (!inode) {
682 kfree(name);
683 return -EIO;
684 }
673 685
674 ret = link_to_fixup_dir(trans, root, path, location.objectid); 686 ret = link_to_fixup_dir(trans, root, path, location.objectid);
675 BUG_ON(ret); 687 BUG_ON(ret);
@@ -704,7 +716,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
704 goto out; 716 goto out;
705 } else 717 } else
706 goto out; 718 goto out;
707 btrfs_release_path(root, path); 719 btrfs_release_path(path);
708 720
709 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 721 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
710 if (di && !IS_ERR(di)) { 722 if (di && !IS_ERR(di)) {
@@ -715,7 +727,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
715 goto out; 727 goto out;
716 match = 1; 728 match = 1;
717out: 729out:
718 btrfs_release_path(root, path); 730 btrfs_release_path(path);
719 return match; 731 return match;
720} 732}
721 733
@@ -744,6 +756,9 @@ static noinline int backref_in_log(struct btrfs_root *log,
744 int match = 0; 756 int match = 0;
745 757
746 path = btrfs_alloc_path(); 758 path = btrfs_alloc_path();
759 if (!path)
760 return -ENOMEM;
761
747 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 762 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
748 if (ret != 0) 763 if (ret != 0)
749 goto out; 764 goto out;
@@ -786,18 +801,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
786{ 801{
787 struct inode *dir; 802 struct inode *dir;
788 int ret; 803 int ret;
789 struct btrfs_key location;
790 struct btrfs_inode_ref *ref; 804 struct btrfs_inode_ref *ref;
791 struct btrfs_dir_item *di;
792 struct inode *inode; 805 struct inode *inode;
793 char *name; 806 char *name;
794 int namelen; 807 int namelen;
795 unsigned long ref_ptr; 808 unsigned long ref_ptr;
796 unsigned long ref_end; 809 unsigned long ref_end;
797 810 int search_done = 0;
798 location.objectid = key->objectid;
799 location.type = BTRFS_INODE_ITEM_KEY;
800 location.offset = 0;
801 811
802 /* 812 /*
803 * it is possible that we didn't log all the parent directories 813 * it is possible that we didn't log all the parent directories
@@ -810,7 +820,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
810 return -ENOENT; 820 return -ENOENT;
811 821
812 inode = read_one_inode(root, key->objectid); 822 inode = read_one_inode(root, key->objectid);
813 BUG_ON(!inode); 823 if (!inode) {
824 iput(dir);
825 return -EIO;
826 }
814 827
815 ref_ptr = btrfs_item_ptr_offset(eb, slot); 828 ref_ptr = btrfs_item_ptr_offset(eb, slot);
816 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 829 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
@@ -825,7 +838,7 @@ again:
825 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); 838 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
826 839
827 /* if we already have a perfect match, we're done */ 840 /* if we already have a perfect match, we're done */
828 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, 841 if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
829 btrfs_inode_ref_index(eb, ref), 842 btrfs_inode_ref_index(eb, ref),
830 name, namelen)) { 843 name, namelen)) {
831 goto out; 844 goto out;
@@ -838,7 +851,10 @@ again:
838 * existing back reference, and we don't want to create 851 * existing back reference, and we don't want to create
839 * dangling pointers in the directory. 852 * dangling pointers in the directory.
840 */ 853 */
841conflict_again: 854
855 if (search_done)
856 goto insert;
857
842 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 858 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
843 if (ret == 0) { 859 if (ret == 0) {
844 char *victim_name; 860 char *victim_name;
@@ -874,42 +890,26 @@ conflict_again:
874 if (!backref_in_log(log, key, victim_name, 890 if (!backref_in_log(log, key, victim_name,
875 victim_name_len)) { 891 victim_name_len)) {
876 btrfs_inc_nlink(inode); 892 btrfs_inc_nlink(inode);
877 btrfs_release_path(root, path); 893 btrfs_release_path(path);
878 894
879 ret = btrfs_unlink_inode(trans, root, dir, 895 ret = btrfs_unlink_inode(trans, root, dir,
880 inode, victim_name, 896 inode, victim_name,
881 victim_name_len); 897 victim_name_len);
882 kfree(victim_name);
883 btrfs_release_path(root, path);
884 goto conflict_again;
885 } 898 }
886 kfree(victim_name); 899 kfree(victim_name);
887 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 900 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
888 } 901 }
889 BUG_ON(ret); 902 BUG_ON(ret);
890 }
891 btrfs_release_path(root, path);
892 903
893 /* look for a conflicting sequence number */ 904 /*
894 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 905 * NOTE: we have searched the root tree and checked the
895 btrfs_inode_ref_index(eb, ref), 906 * corresponding ref; there is no need to check it again.
896 name, namelen, 0); 907 */
897 if (di && !IS_ERR(di)) { 908 search_done = 1;
898 ret = drop_one_dir_item(trans, root, path, dir, di);
899 BUG_ON(ret);
900 }
901 btrfs_release_path(root, path);
902
903
904 /* look for a conflicting name */
905 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
906 name, namelen, 0);
907 if (di && !IS_ERR(di)) {
908 ret = drop_one_dir_item(trans, root, path, dir, di);
909 BUG_ON(ret);
910 } 909 }
911 btrfs_release_path(root, path); 910 btrfs_release_path(path);
912 911
912insert:
913 /* insert our name */ 913 /* insert our name */
914 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 914 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
915 btrfs_inode_ref_index(eb, ref)); 915 btrfs_inode_ref_index(eb, ref));
@@ -928,7 +928,7 @@ out:
928 BUG_ON(ret); 928 BUG_ON(ret);
929 929
930out_nowrite: 930out_nowrite:
931 btrfs_release_path(root, path); 931 btrfs_release_path(path);
932 iput(dir); 932 iput(dir);
933 iput(inode); 933 iput(inode);
934 return 0; 934 return 0;
@@ -966,12 +966,15 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
966 unsigned long ptr; 966 unsigned long ptr;
967 unsigned long ptr_end; 967 unsigned long ptr_end;
968 int name_len; 968 int name_len;
969 u64 ino = btrfs_ino(inode);
969 970
970 key.objectid = inode->i_ino; 971 key.objectid = ino;
971 key.type = BTRFS_INODE_REF_KEY; 972 key.type = BTRFS_INODE_REF_KEY;
972 key.offset = (u64)-1; 973 key.offset = (u64)-1;
973 974
974 path = btrfs_alloc_path(); 975 path = btrfs_alloc_path();
976 if (!path)
977 return -ENOMEM;
975 978
976 while (1) { 979 while (1) {
977 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 980 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -984,7 +987,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
984 } 987 }
985 btrfs_item_key_to_cpu(path->nodes[0], &key, 988 btrfs_item_key_to_cpu(path->nodes[0], &key,
986 path->slots[0]); 989 path->slots[0]);
987 if (key.objectid != inode->i_ino || 990 if (key.objectid != ino ||
988 key.type != BTRFS_INODE_REF_KEY) 991 key.type != BTRFS_INODE_REF_KEY)
989 break; 992 break;
990 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 993 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
@@ -1003,9 +1006,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1003 if (key.offset == 0) 1006 if (key.offset == 0)
1004 break; 1007 break;
1005 key.offset--; 1008 key.offset--;
1006 btrfs_release_path(root, path); 1009 btrfs_release_path(path);
1007 } 1010 }
1008 btrfs_release_path(root, path); 1011 btrfs_release_path(path);
1009 if (nlink != inode->i_nlink) { 1012 if (nlink != inode->i_nlink) {
1010 inode->i_nlink = nlink; 1013 inode->i_nlink = nlink;
1011 btrfs_update_inode(trans, root, inode); 1014 btrfs_update_inode(trans, root, inode);
@@ -1015,10 +1018,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1015 if (inode->i_nlink == 0) { 1018 if (inode->i_nlink == 0) {
1016 if (S_ISDIR(inode->i_mode)) { 1019 if (S_ISDIR(inode->i_mode)) {
1017 ret = replay_dir_deletes(trans, root, NULL, path, 1020 ret = replay_dir_deletes(trans, root, NULL, path,
1018 inode->i_ino, 1); 1021 ino, 1);
1019 BUG_ON(ret); 1022 BUG_ON(ret);
1020 } 1023 }
1021 ret = insert_orphan_item(trans, root, inode->i_ino); 1024 ret = insert_orphan_item(trans, root, ino);
1022 BUG_ON(ret); 1025 BUG_ON(ret);
1023 } 1026 }
1024 btrfs_free_path(path); 1027 btrfs_free_path(path);
@@ -1054,11 +1057,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1054 break; 1057 break;
1055 1058
1056 ret = btrfs_del_item(trans, root, path); 1059 ret = btrfs_del_item(trans, root, path);
1057 BUG_ON(ret); 1060 if (ret)
1061 goto out;
1058 1062
1059 btrfs_release_path(root, path); 1063 btrfs_release_path(path);
1060 inode = read_one_inode(root, key.offset); 1064 inode = read_one_inode(root, key.offset);
1061 BUG_ON(!inode); 1065 if (!inode)
1066 return -EIO;
1062 1067
1063 ret = fixup_inode_link_count(trans, root, inode); 1068 ret = fixup_inode_link_count(trans, root, inode);
1064 BUG_ON(ret); 1069 BUG_ON(ret);
@@ -1072,8 +1077,10 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1072 */ 1077 */
1073 key.offset = (u64)-1; 1078 key.offset = (u64)-1;
1074 } 1079 }
1075 btrfs_release_path(root, path); 1080 ret = 0;
1076 return 0; 1081out:
1082 btrfs_release_path(path);
1083 return ret;
1077} 1084}
1078 1085
1079 1086
@@ -1092,7 +1099,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1092 struct inode *inode; 1099 struct inode *inode;
1093 1100
1094 inode = read_one_inode(root, objectid); 1101 inode = read_one_inode(root, objectid);
1095 BUG_ON(!inode); 1102 if (!inode)
1103 return -EIO;
1096 1104
1097 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1105 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1098 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1106 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
@@ -1100,7 +1108,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1100 1108
1101 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1109 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1102 1110
1103 btrfs_release_path(root, path); 1111 btrfs_release_path(path);
1104 if (ret == 0) { 1112 if (ret == 0) {
1105 btrfs_inc_nlink(inode); 1113 btrfs_inc_nlink(inode);
1106 btrfs_update_inode(trans, root, inode); 1114 btrfs_update_inode(trans, root, inode);
@@ -1179,10 +1187,14 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1179 int ret; 1187 int ret;
1180 1188
1181 dir = read_one_inode(root, key->objectid); 1189 dir = read_one_inode(root, key->objectid);
1182 BUG_ON(!dir); 1190 if (!dir)
1191 return -EIO;
1183 1192
1184 name_len = btrfs_dir_name_len(eb, di); 1193 name_len = btrfs_dir_name_len(eb, di);
1185 name = kmalloc(name_len, GFP_NOFS); 1194 name = kmalloc(name_len, GFP_NOFS);
1195 if (!name)
1196 return -ENOMEM;
1197
1186 log_type = btrfs_dir_type(eb, di); 1198 log_type = btrfs_dir_type(eb, di);
1187 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1199 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1188 name_len); 1200 name_len);
@@ -1193,7 +1205,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1193 exists = 1; 1205 exists = 1;
1194 else 1206 else
1195 exists = 0; 1207 exists = 0;
1196 btrfs_release_path(root, path); 1208 btrfs_release_path(path);
1197 1209
1198 if (key->type == BTRFS_DIR_ITEM_KEY) { 1210 if (key->type == BTRFS_DIR_ITEM_KEY) {
1199 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1211 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
@@ -1206,7 +1218,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1206 } else { 1218 } else {
1207 BUG(); 1219 BUG();
1208 } 1220 }
1209 if (!dst_di || IS_ERR(dst_di)) { 1221 if (IS_ERR_OR_NULL(dst_di)) {
1210 /* we need a sequence number to insert, so we only 1222 /* we need a sequence number to insert, so we only
1211 * do inserts for the BTRFS_DIR_INDEX_KEY types 1223 * do inserts for the BTRFS_DIR_INDEX_KEY types
1212 */ 1224 */
@@ -1237,13 +1249,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1237 if (key->type == BTRFS_DIR_INDEX_KEY) 1249 if (key->type == BTRFS_DIR_INDEX_KEY)
1238 goto insert; 1250 goto insert;
1239out: 1251out:
1240 btrfs_release_path(root, path); 1252 btrfs_release_path(path);
1241 kfree(name); 1253 kfree(name);
1242 iput(dir); 1254 iput(dir);
1243 return 0; 1255 return 0;
1244 1256
1245insert: 1257insert:
1246 btrfs_release_path(root, path); 1258 btrfs_release_path(path);
1247 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1259 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1248 name, name_len, log_type, &log_key); 1260 name, name_len, log_type, &log_key);
1249 1261
@@ -1274,6 +1286,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1274 ptr_end = ptr + item_size; 1286 ptr_end = ptr + item_size;
1275 while (ptr < ptr_end) { 1287 while (ptr < ptr_end) {
1276 di = (struct btrfs_dir_item *)ptr; 1288 di = (struct btrfs_dir_item *)ptr;
1289 if (verify_dir_item(root, eb, di))
1290 return -EIO;
1277 name_len = btrfs_dir_name_len(eb, di); 1291 name_len = btrfs_dir_name_len(eb, di);
1278 ret = replay_one_name(trans, root, path, eb, di, key); 1292 ret = replay_one_name(trans, root, path, eb, di, key);
1279 BUG_ON(ret); 1293 BUG_ON(ret);
@@ -1362,7 +1376,7 @@ next:
1362 *end_ret = found_end; 1376 *end_ret = found_end;
1363 ret = 0; 1377 ret = 0;
1364out: 1378out:
1365 btrfs_release_path(root, path); 1379 btrfs_release_path(path);
1366 return ret; 1380 return ret;
1367} 1381}
1368 1382
@@ -1400,6 +1414,11 @@ again:
1400 ptr_end = ptr + item_size; 1414 ptr_end = ptr + item_size;
1401 while (ptr < ptr_end) { 1415 while (ptr < ptr_end) {
1402 di = (struct btrfs_dir_item *)ptr; 1416 di = (struct btrfs_dir_item *)ptr;
1417 if (verify_dir_item(root, eb, di)) {
1418 ret = -EIO;
1419 goto out;
1420 }
1421
1403 name_len = btrfs_dir_name_len(eb, di); 1422 name_len = btrfs_dir_name_len(eb, di);
1404 name = kmalloc(name_len, GFP_NOFS); 1423 name = kmalloc(name_len, GFP_NOFS);
1405 if (!name) { 1424 if (!name) {
@@ -1420,12 +1439,15 @@ again:
1420 dir_key->offset, 1439 dir_key->offset,
1421 name, name_len, 0); 1440 name, name_len, 0);
1422 } 1441 }
1423 if (!log_di || IS_ERR(log_di)) { 1442 if (IS_ERR_OR_NULL(log_di)) {
1424 btrfs_dir_item_key_to_cpu(eb, di, &location); 1443 btrfs_dir_item_key_to_cpu(eb, di, &location);
1425 btrfs_release_path(root, path); 1444 btrfs_release_path(path);
1426 btrfs_release_path(log, log_path); 1445 btrfs_release_path(log_path);
1427 inode = read_one_inode(root, location.objectid); 1446 inode = read_one_inode(root, location.objectid);
1428 BUG_ON(!inode); 1447 if (!inode) {
1448 kfree(name);
1449 return -EIO;
1450 }
1429 1451
1430 ret = link_to_fixup_dir(trans, root, 1452 ret = link_to_fixup_dir(trans, root,
1431 path, location.objectid); 1453 path, location.objectid);
@@ -1447,7 +1469,7 @@ again:
1447 ret = 0; 1469 ret = 0;
1448 goto out; 1470 goto out;
1449 } 1471 }
1450 btrfs_release_path(log, log_path); 1472 btrfs_release_path(log_path);
1451 kfree(name); 1473 kfree(name);
1452 1474
1453 ptr = (unsigned long)(di + 1); 1475 ptr = (unsigned long)(di + 1);
@@ -1455,8 +1477,8 @@ again:
1455 } 1477 }
1456 ret = 0; 1478 ret = 0;
1457out: 1479out:
1458 btrfs_release_path(root, path); 1480 btrfs_release_path(path);
1459 btrfs_release_path(log, log_path); 1481 btrfs_release_path(log_path);
1460 return ret; 1482 return ret;
1461} 1483}
1462 1484
@@ -1544,7 +1566,7 @@ again:
1544 break; 1566 break;
1545 dir_key.offset = found_key.offset + 1; 1567 dir_key.offset = found_key.offset + 1;
1546 } 1568 }
1547 btrfs_release_path(root, path); 1569 btrfs_release_path(path);
1548 if (range_end == (u64)-1) 1570 if (range_end == (u64)-1)
1549 break; 1571 break;
1550 range_start = range_end + 1; 1572 range_start = range_end + 1;
@@ -1555,11 +1577,11 @@ next_type:
1555 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 1577 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1556 key_type = BTRFS_DIR_LOG_INDEX_KEY; 1578 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1557 dir_key.type = BTRFS_DIR_INDEX_KEY; 1579 dir_key.type = BTRFS_DIR_INDEX_KEY;
1558 btrfs_release_path(root, path); 1580 btrfs_release_path(path);
1559 goto again; 1581 goto again;
1560 } 1582 }
1561out: 1583out:
1562 btrfs_release_path(root, path); 1584 btrfs_release_path(path);
1563 btrfs_free_path(log_path); 1585 btrfs_free_path(log_path);
1564 iput(dir); 1586 iput(dir);
1565 return ret; 1587 return ret;
@@ -1583,7 +1605,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1583 struct btrfs_path *path; 1605 struct btrfs_path *path;
1584 struct btrfs_root *root = wc->replay_dest; 1606 struct btrfs_root *root = wc->replay_dest;
1585 struct btrfs_key key; 1607 struct btrfs_key key;
1586 u32 item_size;
1587 int level; 1608 int level;
1588 int i; 1609 int i;
1589 int ret; 1610 int ret;
@@ -1601,7 +1622,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1601 nritems = btrfs_header_nritems(eb); 1622 nritems = btrfs_header_nritems(eb);
1602 for (i = 0; i < nritems; i++) { 1623 for (i = 0; i < nritems; i++) {
1603 btrfs_item_key_to_cpu(eb, &key, i); 1624 btrfs_item_key_to_cpu(eb, &key, i);
1604 item_size = btrfs_item_size_nr(eb, i);
1605 1625
1606 /* inode keys are done during the first stage */ 1626 /* inode keys are done during the first stage */
1607 if (key.type == BTRFS_INODE_ITEM_KEY && 1627 if (key.type == BTRFS_INODE_ITEM_KEY &&
@@ -1668,7 +1688,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1668 struct walk_control *wc) 1688 struct walk_control *wc)
1669{ 1689{
1670 u64 root_owner; 1690 u64 root_owner;
1671 u64 root_gen;
1672 u64 bytenr; 1691 u64 bytenr;
1673 u64 ptr_gen; 1692 u64 ptr_gen;
1674 struct extent_buffer *next; 1693 struct extent_buffer *next;
@@ -1698,9 +1717,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1698 1717
1699 parent = path->nodes[*level]; 1718 parent = path->nodes[*level];
1700 root_owner = btrfs_header_owner(parent); 1719 root_owner = btrfs_header_owner(parent);
1701 root_gen = btrfs_header_generation(parent);
1702 1720
1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1721 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1722 if (!next)
1723 return -ENOMEM;
1704 1724
1705 if (*level == 1) { 1725 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen); 1726 wc->process_func(root, next, wc, ptr_gen);
@@ -1749,7 +1769,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1749 struct walk_control *wc) 1769 struct walk_control *wc)
1750{ 1770{
1751 u64 root_owner; 1771 u64 root_owner;
1752 u64 root_gen;
1753 int i; 1772 int i;
1754 int slot; 1773 int slot;
1755 int ret; 1774 int ret;
@@ -1757,8 +1776,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1776 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1758 slot = path->slots[i]; 1777 slot = path->slots[i];
1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 1778 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1760 struct extent_buffer *node;
1761 node = path->nodes[i];
1762 path->slots[i]++; 1779 path->slots[i]++;
1763 *level = i; 1780 *level = i;
1764 WARN_ON(*level == 0); 1781 WARN_ON(*level == 0);
@@ -1771,7 +1788,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1771 parent = path->nodes[*level + 1]; 1788 parent = path->nodes[*level + 1];
1772 1789
1773 root_owner = btrfs_header_owner(parent); 1790 root_owner = btrfs_header_owner(parent);
1774 root_gen = btrfs_header_generation(parent);
1775 wc->process_func(root, path->nodes[*level], wc, 1791 wc->process_func(root, path->nodes[*level], wc,
1776 btrfs_header_generation(path->nodes[*level])); 1792 btrfs_header_generation(path->nodes[*level]));
1777 if (wc->free) { 1793 if (wc->free) {
@@ -1815,7 +1831,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1815 int orig_level; 1831 int orig_level;
1816 1832
1817 path = btrfs_alloc_path(); 1833 path = btrfs_alloc_path();
1818 BUG_ON(!path); 1834 if (!path)
1835 return -ENOMEM;
1819 1836
1820 level = btrfs_header_level(log->node); 1837 level = btrfs_header_level(log->node);
1821 orig_level = level; 1838 orig_level = level;
@@ -2045,6 +2062,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2045 wait_log_commit(trans, log_root_tree, 2062 wait_log_commit(trans, log_root_tree,
2046 log_root_tree->log_transid); 2063 log_root_tree->log_transid);
2047 mutex_unlock(&log_root_tree->log_mutex); 2064 mutex_unlock(&log_root_tree->log_mutex);
2065 ret = 0;
2048 goto out; 2066 goto out;
2049 } 2067 }
2050 atomic_set(&log_root_tree->log_commit[index2], 1); 2068 atomic_set(&log_root_tree->log_commit[index2], 1);
@@ -2091,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2091 * the running transaction open, so a full commit can't hop 2109 * the running transaction open, so a full commit can't hop
2092 * in and cause problems either. 2110 * in and cause problems either.
2093 */ 2111 */
2112 btrfs_scrub_pause_super(root);
2094 write_ctree_super(trans, root->fs_info->tree_root, 1); 2113 write_ctree_super(trans, root->fs_info->tree_root, 1);
2114 btrfs_scrub_continue_super(root);
2095 ret = 0; 2115 ret = 0;
2096 2116
2097 mutex_lock(&root->log_mutex); 2117 mutex_lock(&root->log_mutex);
@@ -2109,7 +2129,7 @@ out:
2109 smp_mb(); 2129 smp_mb();
2110 if (waitqueue_active(&root->log_commit_wait[index1])) 2130 if (waitqueue_active(&root->log_commit_wait[index1]))
2111 wake_up(&root->log_commit_wait[index1]); 2131 wake_up(&root->log_commit_wait[index1]);
2112 return 0; 2132 return ret;
2113} 2133}
2114 2134
2115static void free_log_tree(struct btrfs_trans_handle *trans, 2135static void free_log_tree(struct btrfs_trans_handle *trans,
@@ -2195,6 +2215,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2195 int ret; 2215 int ret;
2196 int err = 0; 2216 int err = 0;
2197 int bytes_del = 0; 2217 int bytes_del = 0;
2218 u64 dir_ino = btrfs_ino(dir);
2198 2219
2199 if (BTRFS_I(dir)->logged_trans < trans->transid) 2220 if (BTRFS_I(dir)->logged_trans < trans->transid)
2200 return 0; 2221 return 0;
@@ -2207,7 +2228,12 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2207 2228
2208 log = root->log_root; 2229 log = root->log_root;
2209 path = btrfs_alloc_path(); 2230 path = btrfs_alloc_path();
2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2231 if (!path) {
2232 err = -ENOMEM;
2233 goto out_unlock;
2234 }
2235
2236 di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
2211 name, name_len, -1); 2237 name, name_len, -1);
2212 if (IS_ERR(di)) { 2238 if (IS_ERR(di)) {
2213 err = PTR_ERR(di); 2239 err = PTR_ERR(di);
@@ -2218,8 +2244,8 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2218 bytes_del += name_len; 2244 bytes_del += name_len;
2219 BUG_ON(ret); 2245 BUG_ON(ret);
2220 } 2246 }
2221 btrfs_release_path(log, path); 2247 btrfs_release_path(path);
2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2248 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
2223 index, name, name_len, -1); 2249 index, name, name_len, -1);
2224 if (IS_ERR(di)) { 2250 if (IS_ERR(di)) {
2225 err = PTR_ERR(di); 2251 err = PTR_ERR(di);
@@ -2237,10 +2263,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2237 if (bytes_del) { 2263 if (bytes_del) {
2238 struct btrfs_key key; 2264 struct btrfs_key key;
2239 2265
2240 key.objectid = dir->i_ino; 2266 key.objectid = dir_ino;
2241 key.offset = 0; 2267 key.offset = 0;
2242 key.type = BTRFS_INODE_ITEM_KEY; 2268 key.type = BTRFS_INODE_ITEM_KEY;
2243 btrfs_release_path(log, path); 2269 btrfs_release_path(path);
2244 2270
2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2271 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) { 2272 if (ret < 0) {
@@ -2262,10 +2288,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2262 btrfs_mark_buffer_dirty(path->nodes[0]); 2288 btrfs_mark_buffer_dirty(path->nodes[0]);
2263 } else 2289 } else
2264 ret = 0; 2290 ret = 0;
2265 btrfs_release_path(log, path); 2291 btrfs_release_path(path);
2266 } 2292 }
2267fail: 2293fail:
2268 btrfs_free_path(path); 2294 btrfs_free_path(path);
2295out_unlock:
2269 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2296 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) { 2297 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid; 2298 root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -2273,7 +2300,7 @@ fail:
2273 } 2300 }
2274 btrfs_end_log_trans(root); 2301 btrfs_end_log_trans(root);
2275 2302
2276 return 0; 2303 return err;
2277} 2304}
2278 2305
2279/* see comments for btrfs_del_dir_entries_in_log */ 2306/* see comments for btrfs_del_dir_entries_in_log */
@@ -2295,7 +2322,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2295 log = root->log_root; 2322 log = root->log_root;
2296 mutex_lock(&BTRFS_I(inode)->log_mutex); 2323 mutex_lock(&BTRFS_I(inode)->log_mutex);
2297 2324
2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2325 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
2299 dirid, &index); 2326 dirid, &index);
2300 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2327 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) { 2328 if (ret == -ENOSPC) {
@@ -2336,7 +2363,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2336 struct btrfs_dir_log_item); 2363 struct btrfs_dir_log_item);
2337 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2364 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2338 btrfs_mark_buffer_dirty(path->nodes[0]); 2365 btrfs_mark_buffer_dirty(path->nodes[0]);
2339 btrfs_release_path(log, path); 2366 btrfs_release_path(path);
2340 return 0; 2367 return 0;
2341} 2368}
2342 2369
@@ -2361,13 +2388,14 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2361 int nritems; 2388 int nritems;
2362 u64 first_offset = min_offset; 2389 u64 first_offset = min_offset;
2363 u64 last_offset = (u64)-1; 2390 u64 last_offset = (u64)-1;
2391 u64 ino = btrfs_ino(inode);
2364 2392
2365 log = root->log_root; 2393 log = root->log_root;
2366 max_key.objectid = inode->i_ino; 2394 max_key.objectid = ino;
2367 max_key.offset = (u64)-1; 2395 max_key.offset = (u64)-1;
2368 max_key.type = key_type; 2396 max_key.type = key_type;
2369 2397
2370 min_key.objectid = inode->i_ino; 2398 min_key.objectid = ino;
2371 min_key.type = key_type; 2399 min_key.type = key_type;
2372 min_key.offset = min_offset; 2400 min_key.offset = min_offset;
2373 2401
@@ -2380,18 +2408,17 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2380 * we didn't find anything from this transaction, see if there 2408 * we didn't find anything from this transaction, see if there
2381 * is anything at all 2409 * is anything at all
2382 */ 2410 */
2383 if (ret != 0 || min_key.objectid != inode->i_ino || 2411 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
2384 min_key.type != key_type) { 2412 min_key.objectid = ino;
2385 min_key.objectid = inode->i_ino;
2386 min_key.type = key_type; 2413 min_key.type = key_type;
2387 min_key.offset = (u64)-1; 2414 min_key.offset = (u64)-1;
2388 btrfs_release_path(root, path); 2415 btrfs_release_path(path);
2389 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2416 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2390 if (ret < 0) { 2417 if (ret < 0) {
2391 btrfs_release_path(root, path); 2418 btrfs_release_path(path);
2392 return ret; 2419 return ret;
2393 } 2420 }
2394 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2421 ret = btrfs_previous_item(root, path, ino, key_type);
2395 2422
2396 /* if ret == 0 there are items for this type, 2423 /* if ret == 0 there are items for this type,
2397 * create a range to tell us the last key of this type. 2424 * create a range to tell us the last key of this type.
@@ -2409,7 +2436,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2409 } 2436 }
2410 2437
2411 /* go backward to find any previous key */ 2438 /* go backward to find any previous key */
2412 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2439 ret = btrfs_previous_item(root, path, ino, key_type);
2413 if (ret == 0) { 2440 if (ret == 0) {
2414 struct btrfs_key tmp; 2441 struct btrfs_key tmp;
2415 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2442 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
@@ -2424,7 +2451,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2424 } 2451 }
2425 } 2452 }
2426 } 2453 }
2427 btrfs_release_path(root, path); 2454 btrfs_release_path(path);
2428 2455
2429 /* find the first key from this transaction again */ 2456 /* find the first key from this transaction again */
2430 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2457 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
@@ -2444,8 +2471,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2444 for (i = path->slots[0]; i < nritems; i++) { 2471 for (i = path->slots[0]; i < nritems; i++) {
2445 btrfs_item_key_to_cpu(src, &min_key, i); 2472 btrfs_item_key_to_cpu(src, &min_key, i);
2446 2473
2447 if (min_key.objectid != inode->i_ino || 2474 if (min_key.objectid != ino || min_key.type != key_type)
2448 min_key.type != key_type)
2449 goto done; 2475 goto done;
2450 ret = overwrite_item(trans, log, dst_path, src, i, 2476 ret = overwrite_item(trans, log, dst_path, src, i,
2451 &min_key); 2477 &min_key);
@@ -2466,7 +2492,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2466 goto done; 2492 goto done;
2467 } 2493 }
2468 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2494 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2469 if (tmp.objectid != inode->i_ino || tmp.type != key_type) { 2495 if (tmp.objectid != ino || tmp.type != key_type) {
2470 last_offset = (u64)-1; 2496 last_offset = (u64)-1;
2471 goto done; 2497 goto done;
2472 } 2498 }
@@ -2482,8 +2508,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2482 } 2508 }
2483 } 2509 }
2484done: 2510done:
2485 btrfs_release_path(root, path); 2511 btrfs_release_path(path);
2486 btrfs_release_path(log, dst_path); 2512 btrfs_release_path(dst_path);
2487 2513
2488 if (err == 0) { 2514 if (err == 0) {
2489 *last_offset_ret = last_offset; 2515 *last_offset_ret = last_offset;
@@ -2492,8 +2518,7 @@ done:
2492 * is valid 2518 * is valid
2493 */ 2519 */
2494 ret = insert_dir_log_key(trans, log, path, key_type, 2520 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset, 2521 ino, first_offset, last_offset);
2496 last_offset);
2497 if (ret) 2522 if (ret)
2498 err = ret; 2523 err = ret;
2499 } 2524 }
@@ -2579,10 +2604,11 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2579 break; 2604 break;
2580 2605
2581 ret = btrfs_del_item(trans, log, path); 2606 ret = btrfs_del_item(trans, log, path);
2582 BUG_ON(ret); 2607 if (ret)
2583 btrfs_release_path(log, path); 2608 break;
2609 btrfs_release_path(path);
2584 } 2610 }
2585 btrfs_release_path(log, path); 2611 btrfs_release_path(path);
2586 return ret; 2612 return ret;
2587} 2613}
2588 2614
@@ -2607,6 +2633,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2607 2633
2608 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2634 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2609 nr * sizeof(u32), GFP_NOFS); 2635 nr * sizeof(u32), GFP_NOFS);
2636 if (!ins_data)
2637 return -ENOMEM;
2638
2610 ins_sizes = (u32 *)ins_data; 2639 ins_sizes = (u32 *)ins_data;
2611 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2640 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2612 2641
@@ -2654,6 +2683,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2654 extent = btrfs_item_ptr(src, start_slot + i, 2683 extent = btrfs_item_ptr(src, start_slot + i,
2655 struct btrfs_file_extent_item); 2684 struct btrfs_file_extent_item);
2656 2685
2686 if (btrfs_file_extent_generation(src, extent) < trans->transid)
2687 continue;
2688
2657 found_type = btrfs_file_extent_type(src, extent); 2689 found_type = btrfs_file_extent_type(src, extent);
2658 if (found_type == BTRFS_FILE_EXTENT_REG || 2690 if (found_type == BTRFS_FILE_EXTENT_REG ||
2659 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2691 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -2678,14 +2710,14 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2678 ret = btrfs_lookup_csums_range( 2710 ret = btrfs_lookup_csums_range(
2679 log->fs_info->csum_root, 2711 log->fs_info->csum_root,
2680 ds + cs, ds + cs + cl - 1, 2712 ds + cs, ds + cs + cl - 1,
2681 &ordered_sums); 2713 &ordered_sums, 0);
2682 BUG_ON(ret); 2714 BUG_ON(ret);
2683 } 2715 }
2684 } 2716 }
2685 } 2717 }
2686 2718
2687 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2719 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2688 btrfs_release_path(log, dst_path); 2720 btrfs_release_path(dst_path);
2689 kfree(ins_data); 2721 kfree(ins_data);
2690 2722
2691 /* 2723 /*
@@ -2729,23 +2761,29 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2729 struct btrfs_key max_key; 2761 struct btrfs_key max_key;
2730 struct btrfs_root *log = root->log_root; 2762 struct btrfs_root *log = root->log_root;
2731 struct extent_buffer *src = NULL; 2763 struct extent_buffer *src = NULL;
2732 u32 size;
2733 int err = 0; 2764 int err = 0;
2734 int ret; 2765 int ret;
2735 int nritems; 2766 int nritems;
2736 int ins_start_slot = 0; 2767 int ins_start_slot = 0;
2737 int ins_nr; 2768 int ins_nr;
2769 u64 ino = btrfs_ino(inode);
2738 2770
2739 log = root->log_root; 2771 log = root->log_root;
2740 2772
2741 path = btrfs_alloc_path(); 2773 path = btrfs_alloc_path();
2774 if (!path)
2775 return -ENOMEM;
2742 dst_path = btrfs_alloc_path(); 2776 dst_path = btrfs_alloc_path();
2777 if (!dst_path) {
2778 btrfs_free_path(path);
2779 return -ENOMEM;
2780 }
2743 2781
2744 min_key.objectid = inode->i_ino; 2782 min_key.objectid = ino;
2745 min_key.type = BTRFS_INODE_ITEM_KEY; 2783 min_key.type = BTRFS_INODE_ITEM_KEY;
2746 min_key.offset = 0; 2784 min_key.offset = 0;
2747 2785
2748 max_key.objectid = inode->i_ino; 2786 max_key.objectid = ino;
2749 2787
2750 /* today the code can only do partial logging of directories */ 2788 /* today the code can only do partial logging of directories */
2751 if (!S_ISDIR(inode->i_mode)) 2789 if (!S_ISDIR(inode->i_mode))
@@ -2757,6 +2795,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2757 max_key.type = (u8)-1; 2795 max_key.type = (u8)-1;
2758 max_key.offset = (u64)-1; 2796 max_key.offset = (u64)-1;
2759 2797
2798 ret = btrfs_commit_inode_delayed_items(trans, inode);
2799 if (ret) {
2800 btrfs_free_path(path);
2801 btrfs_free_path(dst_path);
2802 return ret;
2803 }
2804
2760 mutex_lock(&BTRFS_I(inode)->log_mutex); 2805 mutex_lock(&BTRFS_I(inode)->log_mutex);
2761 2806
2762 /* 2807 /*
@@ -2768,8 +2813,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2768 2813
2769 if (inode_only == LOG_INODE_EXISTS) 2814 if (inode_only == LOG_INODE_EXISTS)
2770 max_key_type = BTRFS_XATTR_ITEM_KEY; 2815 max_key_type = BTRFS_XATTR_ITEM_KEY;
2771 ret = drop_objectid_items(trans, log, path, 2816 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
2772 inode->i_ino, max_key_type);
2773 } else { 2817 } else {
2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2818 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2775 } 2819 }
@@ -2787,13 +2831,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2787 break; 2831 break;
2788again: 2832again:
2789 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 2833 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2790 if (min_key.objectid != inode->i_ino) 2834 if (min_key.objectid != ino)
2791 break; 2835 break;
2792 if (min_key.type > max_key.type) 2836 if (min_key.type > max_key.type)
2793 break; 2837 break;
2794 2838
2795 src = path->nodes[0]; 2839 src = path->nodes[0];
2796 size = btrfs_item_size_nr(src, path->slots[0]);
2797 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2840 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2798 ins_nr++; 2841 ins_nr++;
2799 goto next_slot; 2842 goto next_slot;
@@ -2830,7 +2873,7 @@ next_slot:
2830 } 2873 }
2831 ins_nr = 0; 2874 ins_nr = 0;
2832 } 2875 }
2833 btrfs_release_path(root, path); 2876 btrfs_release_path(path);
2834 2877
2835 if (min_key.offset < (u64)-1) 2878 if (min_key.offset < (u64)-1)
2836 min_key.offset++; 2879 min_key.offset++;
@@ -2853,8 +2896,8 @@ next_slot:
2853 } 2896 }
2854 WARN_ON(ins_nr); 2897 WARN_ON(ins_nr);
2855 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2898 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2856 btrfs_release_path(root, path); 2899 btrfs_release_path(path);
2857 btrfs_release_path(log, dst_path); 2900 btrfs_release_path(dst_path);
2858 ret = log_directory_changes(trans, root, inode, path, dst_path); 2901 ret = log_directory_changes(trans, root, inode, path, dst_path);
2859 if (ret) { 2902 if (ret) {
2860 err = ret; 2903 err = ret;
@@ -2884,6 +2927,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2884{ 2927{
2885 int ret = 0; 2928 int ret = 0;
2886 struct btrfs_root *root; 2929 struct btrfs_root *root;
2930 struct dentry *old_parent = NULL;
2887 2931
2888 /* 2932 /*
2889 * for regular files, if its inode is already on disk, we don't 2933 * for regular files, if its inode is already on disk, we don't
@@ -2925,10 +2969,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2925 if (IS_ROOT(parent)) 2969 if (IS_ROOT(parent))
2926 break; 2970 break;
2927 2971
2928 parent = parent->d_parent; 2972 parent = dget_parent(parent);
2973 dput(old_parent);
2974 old_parent = parent;
2929 inode = parent->d_inode; 2975 inode = parent->d_inode;
2930 2976
2931 } 2977 }
2978 dput(old_parent);
2932out: 2979out:
2933 return ret; 2980 return ret;
2934} 2981}
@@ -2960,6 +3007,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2960{ 3007{
2961 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 3008 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2962 struct super_block *sb; 3009 struct super_block *sb;
3010 struct dentry *old_parent = NULL;
2963 int ret = 0; 3011 int ret = 0;
2964 u64 last_committed = root->fs_info->last_trans_committed; 3012 u64 last_committed = root->fs_info->last_trans_committed;
2965 3013
@@ -3031,10 +3079,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3031 if (IS_ROOT(parent)) 3079 if (IS_ROOT(parent))
3032 break; 3080 break;
3033 3081
3034 parent = parent->d_parent; 3082 parent = dget_parent(parent);
3083 dput(old_parent);
3084 old_parent = parent;
3035 } 3085 }
3036 ret = 0; 3086 ret = 0;
3037end_trans: 3087end_trans:
3088 dput(old_parent);
3038 if (ret < 0) { 3089 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC); 3090 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid; 3091 root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -3054,8 +3105,13 @@ end_no_trans:
3054int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 3105int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3055 struct btrfs_root *root, struct dentry *dentry) 3106 struct btrfs_root *root, struct dentry *dentry)
3056{ 3107{
3057 return btrfs_log_inode_parent(trans, root, dentry->d_inode, 3108 struct dentry *parent = dget_parent(dentry);
3058 dentry->d_parent, 0); 3109 int ret;
3110
3111 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3112 dput(parent);
3113
3114 return ret;
3059} 3115}
3060 3116
3061/* 3117/*
@@ -3077,16 +3133,20 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3077 .stage = 0, 3133 .stage = 0,
3078 }; 3134 };
3079 3135
3080 fs_info->log_root_recovering = 1;
3081 path = btrfs_alloc_path(); 3136 path = btrfs_alloc_path();
3082 BUG_ON(!path); 3137 if (!path)
3138 return -ENOMEM;
3139
3140 fs_info->log_root_recovering = 1;
3083 3141
3084 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3142 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3143 BUG_ON(IS_ERR(trans));
3085 3144
3086 wc.trans = trans; 3145 wc.trans = trans;
3087 wc.pin = 1; 3146 wc.pin = 1;
3088 3147
3089 walk_log_tree(trans, log_root_tree, &wc); 3148 ret = walk_log_tree(trans, log_root_tree, &wc);
3149 BUG_ON(ret);
3090 3150
3091again: 3151again:
3092 key.objectid = BTRFS_TREE_LOG_OBJECTID; 3152 key.objectid = BTRFS_TREE_LOG_OBJECTID;
@@ -3104,21 +3164,20 @@ again:
3104 } 3164 }
3105 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3165 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3106 path->slots[0]); 3166 path->slots[0]);
3107 btrfs_release_path(log_root_tree, path); 3167 btrfs_release_path(path);
3108 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 3168 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
3109 break; 3169 break;
3110 3170
3111 log = btrfs_read_fs_root_no_radix(log_root_tree, 3171 log = btrfs_read_fs_root_no_radix(log_root_tree,
3112 &found_key); 3172 &found_key);
3113 BUG_ON(!log); 3173 BUG_ON(IS_ERR(log));
3114
3115 3174
3116 tmp_key.objectid = found_key.offset; 3175 tmp_key.objectid = found_key.offset;
3117 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 3176 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
3118 tmp_key.offset = (u64)-1; 3177 tmp_key.offset = (u64)-1;
3119 3178
3120 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 3179 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
3121 BUG_ON(!wc.replay_dest); 3180 BUG_ON(IS_ERR_OR_NULL(wc.replay_dest));
3122 3181
3123 wc.replay_dest->log_root = log; 3182 wc.replay_dest->log_root = log;
3124 btrfs_record_root_in_trans(trans, wc.replay_dest); 3183 btrfs_record_root_in_trans(trans, wc.replay_dest);
@@ -3140,7 +3199,7 @@ again:
3140 if (found_key.offset == 0) 3199 if (found_key.offset == 0)
3141 break; 3200 break;
3142 } 3201 }
3143 btrfs_release_path(log_root_tree, path); 3202 btrfs_release_path(path);
3144 3203
3145 /* step one is to pin it all, step two is to replay just inodes */ 3204 /* step one is to pin it all, step two is to replay just inodes */
3146 if (wc.pin) { 3205 if (wc.pin) {
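The dget_parent() conversions in the two loops above follow the standard VFS
pattern for walking up a dentry chain: take a reference on each parent before
using it and drop the reference on the previous one, instead of chasing the
unreferenced d_parent pointer. A generic sketch of that loop (not
btrfs-specific):

    struct dentry *parent = dget_parent(dentry);
    struct dentry *old_parent;

    while (!IS_ROOT(parent)) {
            /* ... examine parent->d_inode ... */
            old_parent = parent;
            parent = dget_parent(parent);
            dput(old_parent);
    }
    dput(parent);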
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 3dfae84c8cc8..2270ac58d746 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -38,7 +38,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 38 struct btrfs_root *root,
39 const char *name, int name_len, 39 const char *name, int name_len,
40 struct inode *inode, u64 dirid); 40 struct inode *inode, u64 dirid);
41int btrfs_join_running_log_trans(struct btrfs_root *root);
42int btrfs_end_log_trans(struct btrfs_root *root); 41int btrfs_end_log_trans(struct btrfs_root *root);
43int btrfs_pin_log_trans(struct btrfs_root *root); 42int btrfs_pin_log_trans(struct btrfs_root *root);
44int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 43int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
deleted file mode 100644
index 1ca1952fd917..000000000000
--- a/fs/btrfs/version.sh
+++ /dev/null
@@ -1,43 +0,0 @@
1#!/bin/bash
2#
3# determine-version -- report a useful version for releases
4#
5# Copyright 2008, Aron Griffis <agriffis@n01se.net>
6# Copyright 2008, Oracle
7# Released under the GNU GPLv2
8
9v="v0.16"
10
11which git &> /dev/null
12if [ $? == 0 ]; then
13 git branch >& /dev/null
14 if [ $? == 0 ]; then
15 if head=`git rev-parse --verify HEAD 2>/dev/null`; then
16 if tag=`git describe --tags 2>/dev/null`; then
17 v="$tag"
18 fi
19
20 # Are there uncommitted changes?
21 git update-index --refresh --unmerged > /dev/null
22 if git diff-index --name-only HEAD | \
23 grep -v "^scripts/package" \
24 | read dummy; then
25 v="$v"-dirty
26 fi
27 fi
28 fi
29fi
30
31echo "#ifndef __BUILD_VERSION" > .build-version.h
32echo "#define __BUILD_VERSION" >> .build-version.h
33echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
34echo "#endif" >> .build-version.h
35
36diff -q version.h .build-version.h >& /dev/null
37
38if [ $? == 0 ]; then
39 rm .build-version.h
40 exit 0
41fi
42
43mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd318ff280b2..19450bc53632 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include "compat.h" 27#include "compat.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -32,38 +33,14 @@
32#include "volumes.h" 33#include "volumes.h"
33#include "async-thread.h" 34#include "async-thread.h"
34 35
35struct map_lookup {
36 u64 type;
37 int io_align;
38 int io_width;
39 int stripe_len;
40 int sector_size;
41 int num_stripes;
42 int sub_stripes;
43 struct btrfs_bio_stripe stripes[];
44};
45
46static int init_first_rw_device(struct btrfs_trans_handle *trans, 36static int init_first_rw_device(struct btrfs_trans_handle *trans,
47 struct btrfs_root *root, 37 struct btrfs_root *root,
48 struct btrfs_device *device); 38 struct btrfs_device *device);
49static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 39static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
50 40
51#define map_lookup_size(n) (sizeof(struct map_lookup) + \
52 (sizeof(struct btrfs_bio_stripe) * (n)))
53
54static DEFINE_MUTEX(uuid_mutex); 41static DEFINE_MUTEX(uuid_mutex);
55static LIST_HEAD(fs_uuids); 42static LIST_HEAD(fs_uuids);
56 43
57void btrfs_lock_volumes(void)
58{
59 mutex_lock(&uuid_mutex);
60}
61
62void btrfs_unlock_volumes(void)
63{
64 mutex_unlock(&uuid_mutex);
65}
66
67static void lock_chunks(struct btrfs_root *root) 44static void lock_chunks(struct btrfs_root *root)
68{ 45{
69 mutex_lock(&root->fs_info->chunk_mutex); 46 mutex_lock(&root->fs_info->chunk_mutex);
@@ -161,22 +138,25 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
161 struct bio *cur; 138 struct bio *cur;
162 int again = 0; 139 int again = 0;
163 unsigned long num_run; 140 unsigned long num_run;
164 unsigned long num_sync_run;
165 unsigned long batch_run = 0; 141 unsigned long batch_run = 0;
166 unsigned long limit; 142 unsigned long limit;
167 unsigned long last_waited = 0; 143 unsigned long last_waited = 0;
168 int force_reg = 0; 144 int force_reg = 0;
145 struct blk_plug plug;
146
147 /*
148 * this function runs all the bios we've collected for
149 * a particular device. We don't want to wander off to
150 * another device without first sending all of these down.
151 * So, set up a plug here and finish it off before we return
152 */
153 blk_start_plug(&plug);
169 154
170 bdi = blk_get_backing_dev_info(device->bdev); 155 bdi = blk_get_backing_dev_info(device->bdev);
171 fs_info = device->dev_root->fs_info; 156 fs_info = device->dev_root->fs_info;
172 limit = btrfs_async_submit_limit(fs_info); 157 limit = btrfs_async_submit_limit(fs_info);
173 limit = limit * 2 / 3; 158 limit = limit * 2 / 3;
174 159
175 /* we want to make sure that every time we switch from the sync
176 * list to the normal list, we unplug
177 */
178 num_sync_run = 0;
179
180loop: 160loop:
181 spin_lock(&device->io_lock); 161 spin_lock(&device->io_lock);
182 162
@@ -222,15 +202,6 @@ loop_lock:
222 202
223 spin_unlock(&device->io_lock); 203 spin_unlock(&device->io_lock);
224 204
225 /*
226 * if we're doing the regular priority list, make sure we unplug
227 * for any high prio bios we've sent down
228 */
229 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
230 num_sync_run = 0;
231 blk_run_backing_dev(bdi, NULL);
232 }
233
234 while (pending) { 205 while (pending) {
235 206
236 rmb(); 207 rmb();
@@ -258,19 +229,11 @@ loop_lock:
258 229
259 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 230 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
260 231
261 if (cur->bi_rw & REQ_SYNC)
262 num_sync_run++;
263
264 submit_bio(cur->bi_rw, cur); 232 submit_bio(cur->bi_rw, cur);
265 num_run++; 233 num_run++;
266 batch_run++; 234 batch_run++;
267 if (need_resched()) { 235 if (need_resched())
268 if (num_sync_run) {
269 blk_run_backing_dev(bdi, NULL);
270 num_sync_run = 0;
271 }
272 cond_resched(); 236 cond_resched();
273 }
274 237
275 /* 238 /*
276 * we made progress, there is more work to do and the bdi 239 * we made progress, there is more work to do and the bdi
@@ -303,13 +266,8 @@ loop_lock:
303 * against it before looping 266 * against it before looping
304 */ 267 */
305 last_waited = ioc->last_waited; 268 last_waited = ioc->last_waited;
306 if (need_resched()) { 269 if (need_resched())
307 if (num_sync_run) {
308 blk_run_backing_dev(bdi, NULL);
309 num_sync_run = 0;
310 }
311 cond_resched(); 270 cond_resched();
312 }
313 continue; 271 continue;
314 } 272 }
315 spin_lock(&device->io_lock); 273 spin_lock(&device->io_lock);
@@ -322,22 +280,6 @@ loop_lock:
322 } 280 }
323 } 281 }
324 282
325 if (num_sync_run) {
326 num_sync_run = 0;
327 blk_run_backing_dev(bdi, NULL);
328 }
329 /*
330 * IO has already been through a long path to get here. Checksumming,
331 * async helper threads, perhaps compression. We've done a pretty
332 * good job of collecting a batch of IO and should just unplug
333 * the device right away.
334 *
335 * This will help anyone who is waiting on the IO, they might have
336 * already unplugged, but managed to do so before the bio they
337 * cared about found its way down here.
338 */
339 blk_run_backing_dev(bdi, NULL);
340
341 cond_resched(); 283 cond_resched();
342 if (again) 284 if (again)
343 goto loop; 285 goto loop;
@@ -348,6 +290,7 @@ loop_lock:
348 spin_unlock(&device->io_lock); 290 spin_unlock(&device->io_lock);
349 291
350done: 292done:
293 blk_finish_plug(&plug);
351 return 0; 294 return 0;
352} 295}
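
The hunks above replace btrfs' hand-rolled unplugging with the block layer's on-stack plugging API. A minimal sketch of the pattern under the 2.6.39-era submit_bio(rw, bio) signature (submit_bio_batch is a hypothetical helper, not part of this patch):

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    /* Bios submitted between blk_start_plug() and blk_finish_plug() are
     * queued on the on-stack plug and flushed to the device in one batch,
     * which is why run_scheduled_bios() no longer unplugs by hand. */
    static void submit_bio_batch(struct bio **bios, int nr)
    {
    	struct blk_plug plug;
    	int i;

    	blk_start_plug(&plug);
    	for (i = 0; i < nr; i++)
    		submit_bio(bios[i]->bi_rw, bios[i]);
    	blk_finish_plug(&plug);
    }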
353 296
@@ -398,7 +341,6 @@ static noinline int device_list_add(const char *path,
398 device->work.func = pending_bios_fn; 341 device->work.func = pending_bios_fn;
399 memcpy(device->uuid, disk_super->dev_item.uuid, 342 memcpy(device->uuid, disk_super->dev_item.uuid,
400 BTRFS_UUID_SIZE); 343 BTRFS_UUID_SIZE);
401 device->barriers = 1;
402 spin_lock_init(&device->io_lock); 344 spin_lock_init(&device->io_lock);
403 device->name = kstrdup(path, GFP_NOFS); 345 device->name = kstrdup(path, GFP_NOFS);
404 if (!device->name) { 346 if (!device->name) {
@@ -408,17 +350,21 @@ static noinline int device_list_add(const char *path,
408 INIT_LIST_HEAD(&device->dev_alloc_list); 350 INIT_LIST_HEAD(&device->dev_alloc_list);
409 351
410 mutex_lock(&fs_devices->device_list_mutex); 352 mutex_lock(&fs_devices->device_list_mutex);
411 list_add(&device->dev_list, &fs_devices->devices); 353 list_add_rcu(&device->dev_list, &fs_devices->devices);
412 mutex_unlock(&fs_devices->device_list_mutex); 354 mutex_unlock(&fs_devices->device_list_mutex);
413 355
414 device->fs_devices = fs_devices; 356 device->fs_devices = fs_devices;
415 fs_devices->num_devices++; 357 fs_devices->num_devices++;
416 } else if (strcmp(device->name, path)) { 358 } else if (!device->name || strcmp(device->name, path)) {
417 name = kstrdup(path, GFP_NOFS); 359 name = kstrdup(path, GFP_NOFS);
418 if (!name) 360 if (!name)
419 return -ENOMEM; 361 return -ENOMEM;
420 kfree(device->name); 362 kfree(device->name);
421 device->name = name; 363 device->name = name;
364 if (device->missing) {
365 fs_devices->missing_devices--;
366 device->missing = 0;
367 }
422 } 368 }
423 369
424 if (found_transid > fs_devices->latest_trans) { 370 if (found_transid > fs_devices->latest_trans) {
@@ -447,7 +393,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
447 fs_devices->latest_trans = orig->latest_trans; 393 fs_devices->latest_trans = orig->latest_trans;
448 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 394 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
449 395
450 mutex_lock(&orig->device_list_mutex); 396 /* We hold the volume lock, so it is safe to get the devices. */
451 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 397 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
452 device = kzalloc(sizeof(*device), GFP_NOFS); 398 device = kzalloc(sizeof(*device), GFP_NOFS);
453 if (!device) 399 if (!device)
@@ -462,7 +408,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
462 device->devid = orig_dev->devid; 408 device->devid = orig_dev->devid;
463 device->work.func = pending_bios_fn; 409 device->work.func = pending_bios_fn;
464 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 410 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
465 device->barriers = 1;
466 spin_lock_init(&device->io_lock); 411 spin_lock_init(&device->io_lock);
467 INIT_LIST_HEAD(&device->dev_list); 412 INIT_LIST_HEAD(&device->dev_list);
468 INIT_LIST_HEAD(&device->dev_alloc_list); 413 INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -471,10 +416,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
471 device->fs_devices = fs_devices; 416 device->fs_devices = fs_devices;
472 fs_devices->num_devices++; 417 fs_devices->num_devices++;
473 } 418 }
474 mutex_unlock(&orig->device_list_mutex);
475 return fs_devices; 419 return fs_devices;
476error: 420error:
477 mutex_unlock(&orig->device_list_mutex);
478 free_fs_devices(fs_devices); 421 free_fs_devices(fs_devices);
479 return ERR_PTR(-ENOMEM); 422 return ERR_PTR(-ENOMEM);
480} 423}
@@ -485,13 +428,13 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
485 428
486 mutex_lock(&uuid_mutex); 429 mutex_lock(&uuid_mutex);
487again: 430again:
488 mutex_lock(&fs_devices->device_list_mutex); 431 /* This is the initialization path, so it is safe to release the devices. */
489 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 432 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
490 if (device->in_fs_metadata) 433 if (device->in_fs_metadata)
491 continue; 434 continue;
492 435
493 if (device->bdev) { 436 if (device->bdev) {
494 close_bdev_exclusive(device->bdev, device->mode); 437 blkdev_put(device->bdev, device->mode);
495 device->bdev = NULL; 438 device->bdev = NULL;
496 fs_devices->open_devices--; 439 fs_devices->open_devices--;
497 } 440 }
@@ -505,7 +448,6 @@ again:
505 kfree(device->name); 448 kfree(device->name);
506 kfree(device); 449 kfree(device);
507 } 450 }
508 mutex_unlock(&fs_devices->device_list_mutex);
509 451
510 if (fs_devices->seed) { 452 if (fs_devices->seed) {
511 fs_devices = fs_devices->seed; 453 fs_devices = fs_devices->seed;
@@ -516,6 +458,29 @@ again:
516 return 0; 458 return 0;
517} 459}
518 460
461static void __free_device(struct work_struct *work)
462{
463 struct btrfs_device *device;
464
465 device = container_of(work, struct btrfs_device, rcu_work);
466
467 if (device->bdev)
468 blkdev_put(device->bdev, device->mode);
469
470 kfree(device->name);
471 kfree(device);
472}
473
474static void free_device(struct rcu_head *head)
475{
476 struct btrfs_device *device;
477
478 device = container_of(head, struct btrfs_device, rcu);
479
480 INIT_WORK(&device->rcu_work, __free_device);
481 schedule_work(&device->rcu_work);
482}
483
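These helpers exist because blkdev_put() may sleep while RCU callbacks run in atomic context, so free_device() only bounces the real teardown to a workqueue. What the conversion buys is lock-free readers; a sketch of the reader side (count_devices_rcu is hypothetical, assuming the 2.6.39-era structures):

    #include <linux/rculist.h>

    /* Safe against concurrent removal: writers use list_del_rcu() /
     * list_replace_rcu() and free through call_rcu(), as above. */
    static int count_devices_rcu(struct btrfs_fs_devices *fs_devices)
    {
    	struct btrfs_device *device;
    	int n = 0;

    	rcu_read_lock();
    	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list)
    		n++;
    	rcu_read_unlock();
    	return n;
    }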
519static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 484static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
520{ 485{
521 struct btrfs_device *device; 486 struct btrfs_device *device;
@@ -523,20 +488,32 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
523 if (--fs_devices->opened > 0) 488 if (--fs_devices->opened > 0)
524 return 0; 489 return 0;
525 490
491 mutex_lock(&fs_devices->device_list_mutex);
526 list_for_each_entry(device, &fs_devices->devices, dev_list) { 492 list_for_each_entry(device, &fs_devices->devices, dev_list) {
527 if (device->bdev) { 493 struct btrfs_device *new_device;
528 close_bdev_exclusive(device->bdev, device->mode); 494
495 if (device->bdev)
529 fs_devices->open_devices--; 496 fs_devices->open_devices--;
530 } 497
531 if (device->writeable) { 498 if (device->writeable) {
532 list_del_init(&device->dev_alloc_list); 499 list_del_init(&device->dev_alloc_list);
533 fs_devices->rw_devices--; 500 fs_devices->rw_devices--;
534 } 501 }
535 502
536 device->bdev = NULL; 503 new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
537 device->writeable = 0; 504 BUG_ON(!new_device);
538 device->in_fs_metadata = 0; 505 memcpy(new_device, device, sizeof(*new_device));
506 new_device->name = kstrdup(device->name, GFP_NOFS);
507 BUG_ON(device->name && !new_device->name);
508 new_device->bdev = NULL;
509 new_device->writeable = 0;
510 new_device->in_fs_metadata = 0;
511 list_replace_rcu(&device->dev_list, &new_device->dev_list);
512
513 call_rcu(&device->rcu, free_device);
539 } 514 }
515 mutex_unlock(&fs_devices->device_list_mutex);
516
540 WARN_ON(fs_devices->open_devices); 517 WARN_ON(fs_devices->open_devices);
541 WARN_ON(fs_devices->rw_devices); 518 WARN_ON(fs_devices->rw_devices);
542 fs_devices->opened = 0; 519 fs_devices->opened = 0;
@@ -582,13 +559,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
582 int seeding = 1; 559 int seeding = 1;
583 int ret = 0; 560 int ret = 0;
584 561
562 flags |= FMODE_EXCL;
563
585 list_for_each_entry(device, head, dev_list) { 564 list_for_each_entry(device, head, dev_list) {
586 if (device->bdev) 565 if (device->bdev)
587 continue; 566 continue;
588 if (!device->name) 567 if (!device->name)
589 continue; 568 continue;
590 569
591 bdev = open_bdev_exclusive(device->name, flags, holder); 570 bdev = blkdev_get_by_path(device->name, flags, holder);
592 if (IS_ERR(bdev)) { 571 if (IS_ERR(bdev)) {
593 printk(KERN_INFO "open %s failed\n", device->name); 572 printk(KERN_INFO "open %s failed\n", device->name);
594 goto error; 573 goto error;
@@ -596,8 +575,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
596 set_blocksize(bdev, 4096); 575 set_blocksize(bdev, 4096);
597 576
598 bh = btrfs_read_dev_super(bdev); 577 bh = btrfs_read_dev_super(bdev);
599 if (!bh) 578 if (!bh) {
579 ret = -EINVAL;
600 goto error_close; 580 goto error_close;
581 }
601 582
602 disk_super = (struct btrfs_super_block *)bh->b_data; 583 disk_super = (struct btrfs_super_block *)bh->b_data;
603 devid = btrfs_stack_device_id(&disk_super->dev_item); 584 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -635,12 +616,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
635 list_add(&device->dev_alloc_list, 616 list_add(&device->dev_alloc_list,
636 &fs_devices->alloc_list); 617 &fs_devices->alloc_list);
637 } 618 }
619 brelse(bh);
638 continue; 620 continue;
639 621
640error_brelse: 622error_brelse:
641 brelse(bh); 623 brelse(bh);
642error_close: 624error_close:
643 close_bdev_exclusive(bdev, FMODE_READ); 625 blkdev_put(bdev, flags);
644error: 626error:
645 continue; 627 continue;
646 } 628 }
@@ -686,7 +668,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
686 668
687 mutex_lock(&uuid_mutex); 669 mutex_lock(&uuid_mutex);
688 670
689 bdev = open_bdev_exclusive(path, flags, holder); 671 flags |= FMODE_EXCL;
672 bdev = blkdev_get_by_path(path, flags, holder);
690 673
691 if (IS_ERR(bdev)) { 674 if (IS_ERR(bdev)) {
692 ret = PTR_ERR(bdev); 675 ret = PTR_ERR(bdev);
@@ -698,7 +681,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
698 goto error_close; 681 goto error_close;
699 bh = btrfs_read_dev_super(bdev); 682 bh = btrfs_read_dev_super(bdev);
700 if (!bh) { 683 if (!bh) {
701 ret = -EIO; 684 ret = -EINVAL;
702 goto error_close; 685 goto error_close;
703 } 686 }
704 disk_super = (struct btrfs_super_block *)bh->b_data; 687 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -706,77 +689,178 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
706 transid = btrfs_super_generation(disk_super); 689 transid = btrfs_super_generation(disk_super);
707 if (disk_super->label[0]) 690 if (disk_super->label[0])
708 printk(KERN_INFO "device label %s ", disk_super->label); 691 printk(KERN_INFO "device label %s ", disk_super->label);
709 else { 692 else
710 /* FIXME, make a readl uuid parser */ 693 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
711 printk(KERN_INFO "device fsid %llx-%llx ",
712 *(unsigned long long *)disk_super->fsid,
713 *(unsigned long long *)(disk_super->fsid + 8));
714 }
715 printk(KERN_CONT "devid %llu transid %llu %s\n", 694 printk(KERN_CONT "devid %llu transid %llu %s\n",
716 (unsigned long long)devid, (unsigned long long)transid, path); 695 (unsigned long long)devid, (unsigned long long)transid, path);
717 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 696 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
718 697
719 brelse(bh); 698 brelse(bh);
720error_close: 699error_close:
721 close_bdev_exclusive(bdev, flags); 700 blkdev_put(bdev, flags);
722error: 701error:
723 mutex_unlock(&uuid_mutex); 702 mutex_unlock(&uuid_mutex);
724 return ret; 703 return ret;
725} 704}
726 705
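Throughout this patch, open_bdev_exclusive()/close_bdev_exclusive() are converted to the blkdev_get_by_path()/blkdev_put() API, with exclusivity expressed via FMODE_EXCL. A sketch of the pairing (probe_device is hypothetical; error paths trimmed):

    #include <linux/err.h>
    #include <linux/fs.h>

    /* 'holder' identifies the exclusive owner; the mode passed to
     * blkdev_put() must include the same FMODE_EXCL used at open time. */
    static int probe_device(const char *path, void *holder)
    {
    	struct block_device *bdev;

    	bdev = blkdev_get_by_path(path, FMODE_READ | FMODE_EXCL, holder);
    	if (IS_ERR(bdev))
    		return PTR_ERR(bdev);
    	/* ... read and verify the super block here ... */
    	blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
    	return 0;
    }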
706/* helper to account the used device space in the range */
707int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
708 u64 end, u64 *length)
709{
710 struct btrfs_key key;
711 struct btrfs_root *root = device->dev_root;
712 struct btrfs_dev_extent *dev_extent;
713 struct btrfs_path *path;
714 u64 extent_end;
715 int ret;
716 int slot;
717 struct extent_buffer *l;
718
719 *length = 0;
720
721 if (start >= device->total_bytes)
722 return 0;
723
724 path = btrfs_alloc_path();
725 if (!path)
726 return -ENOMEM;
727 path->reada = 2;
728
729 key.objectid = device->devid;
730 key.offset = start;
731 key.type = BTRFS_DEV_EXTENT_KEY;
732
733 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
734 if (ret < 0)
735 goto out;
736 if (ret > 0) {
737 ret = btrfs_previous_item(root, path, key.objectid, key.type);
738 if (ret < 0)
739 goto out;
740 }
741
742 while (1) {
743 l = path->nodes[0];
744 slot = path->slots[0];
745 if (slot >= btrfs_header_nritems(l)) {
746 ret = btrfs_next_leaf(root, path);
747 if (ret == 0)
748 continue;
749 if (ret < 0)
750 goto out;
751
752 break;
753 }
754 btrfs_item_key_to_cpu(l, &key, slot);
755
756 if (key.objectid < device->devid)
757 goto next;
758
759 if (key.objectid > device->devid)
760 break;
761
762 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
763 goto next;
764
765 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
766 extent_end = key.offset + btrfs_dev_extent_length(l,
767 dev_extent);
768 if (key.offset <= start && extent_end > end) {
769 *length = end - start + 1;
770 break;
771 } else if (key.offset <= start && extent_end > start)
772 *length += extent_end - start;
773 else if (key.offset > start && extent_end <= end)
774 *length += extent_end - key.offset;
775 else if (key.offset > start && key.offset <= end) {
776 *length += end - key.offset + 1;
777 break;
778 } else if (key.offset > end)
779 break;
780
781next:
782 path->slots[0]++;
783 }
784 ret = 0;
785out:
786 btrfs_free_path(path);
787 return ret;
788}
789
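The four branches in the loop above are clipped interval overlap between the inclusive query range [start, end] and each dev extent [key.offset, extent_end). Reduced to a single hypothetical helper for clarity (not part of the patch):

    #include <linux/kernel.h>

    /* Length of the overlap between the inclusive range [start, end]
     * and the half-open extent [ext_start, ext_end); 0 if disjoint. */
    static u64 dev_extent_overlap(u64 start, u64 end,
    			      u64 ext_start, u64 ext_end)
    {
    	u64 lo = max(start, ext_start);
    	u64 hi = min(end + 1, ext_end);

    	return hi > lo ? hi - lo : 0;
    }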
727/* 790/*
791 * find_free_dev_extent - find free space in the specified device
792 * @trans: transaction handle
793 * @device: the device in which we search for free space
794 * @num_bytes: the size of the free space that we need
795 * @start: stores the start of the free space
796 * @len: the size of the free space that we find, or the size of the max
797 * free space if we don't find suitable free space
798 *
728 * this uses a pretty simple search, the expectation is that it is 799 * this uses a pretty simple search, the expectation is that it is
729 * called very infrequently and that a given device has a small number 800 * called very infrequently and that a given device has a small number
730 * of extents 801 * of extents
802 *
803 * @start is used to store the start of the free space if we find it. But if
804 * we don't find suitable free space, it will be used to store the start
805 * position of the max free space.
806 *
807 * @len is used to store the size of the free space that we find. But if we
808 * don't find suitable free space, it is used to store the size of the max
809 * free space.
731 */ 810 */
732int find_free_dev_extent(struct btrfs_trans_handle *trans, 811int find_free_dev_extent(struct btrfs_trans_handle *trans,
733 struct btrfs_device *device, u64 num_bytes, 812 struct btrfs_device *device, u64 num_bytes,
734 u64 *start, u64 *max_avail) 813 u64 *start, u64 *len)
735{ 814{
736 struct btrfs_key key; 815 struct btrfs_key key;
737 struct btrfs_root *root = device->dev_root; 816 struct btrfs_root *root = device->dev_root;
738 struct btrfs_dev_extent *dev_extent = NULL; 817 struct btrfs_dev_extent *dev_extent;
739 struct btrfs_path *path; 818 struct btrfs_path *path;
740 u64 hole_size = 0; 819 u64 hole_size;
741 u64 last_byte = 0; 820 u64 max_hole_start;
742 u64 search_start = 0; 821 u64 max_hole_size;
822 u64 extent_end;
823 u64 search_start;
743 u64 search_end = device->total_bytes; 824 u64 search_end = device->total_bytes;
744 int ret; 825 int ret;
745 int slot = 0; 826 int slot;
746 int start_found;
747 struct extent_buffer *l; 827 struct extent_buffer *l;
748 828
749 path = btrfs_alloc_path();
750 if (!path)
751 return -ENOMEM;
752 path->reada = 2;
753 start_found = 0;
754
755 /* FIXME use last free of some kind */ 829 /* FIXME use last free of some kind */
756 830
757 /* we don't want to overwrite the superblock on the drive, 831 /* we don't want to overwrite the superblock on the drive,
758 * so we make sure to start at an offset of at least 1MB 832 * so we make sure to start at an offset of at least 1MB
759 */ 833 */
760 search_start = max((u64)1024 * 1024, search_start); 834 search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
835
836 max_hole_start = search_start;
837 max_hole_size = 0;
761 838
762 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 839 if (search_start >= search_end) {
763 search_start = max(root->fs_info->alloc_start, search_start); 840 ret = -ENOSPC;
841 goto error;
842 }
843
844 path = btrfs_alloc_path();
845 if (!path) {
846 ret = -ENOMEM;
847 goto error;
848 }
849 path->reada = 2;
764 850
765 key.objectid = device->devid; 851 key.objectid = device->devid;
766 key.offset = search_start; 852 key.offset = search_start;
767 key.type = BTRFS_DEV_EXTENT_KEY; 853 key.type = BTRFS_DEV_EXTENT_KEY;
854
768 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 855 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
769 if (ret < 0) 856 if (ret < 0)
770 goto error; 857 goto out;
771 if (ret > 0) { 858 if (ret > 0) {
772 ret = btrfs_previous_item(root, path, key.objectid, key.type); 859 ret = btrfs_previous_item(root, path, key.objectid, key.type);
773 if (ret < 0) 860 if (ret < 0)
774 goto error; 861 goto out;
775 if (ret > 0)
776 start_found = 1;
777 } 862 }
778 l = path->nodes[0]; 863
779 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
780 while (1) { 864 while (1) {
781 l = path->nodes[0]; 865 l = path->nodes[0];
782 slot = path->slots[0]; 866 slot = path->slots[0];
@@ -785,24 +869,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
785 if (ret == 0) 869 if (ret == 0)
786 continue; 870 continue;
787 if (ret < 0) 871 if (ret < 0)
788 goto error; 872 goto out;
789no_more_items: 873
790 if (!start_found) { 874 break;
791 if (search_start >= search_end) {
792 ret = -ENOSPC;
793 goto error;
794 }
795 *start = search_start;
796 start_found = 1;
797 goto check_pending;
798 }
799 *start = last_byte > search_start ?
800 last_byte : search_start;
801 if (search_end <= *start) {
802 ret = -ENOSPC;
803 goto error;
804 }
805 goto check_pending;
806 } 875 }
807 btrfs_item_key_to_cpu(l, &key, slot); 876 btrfs_item_key_to_cpu(l, &key, slot);
808 877
@@ -810,48 +879,62 @@ no_more_items:
810 goto next; 879 goto next;
811 880
812 if (key.objectid > device->devid) 881 if (key.objectid > device->devid)
813 goto no_more_items; 882 break;
814 883
815 if (key.offset >= search_start && key.offset > last_byte && 884 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
816 start_found) { 885 goto next;
817 if (last_byte < search_start)
818 last_byte = search_start;
819 hole_size = key.offset - last_byte;
820 886
821 if (hole_size > *max_avail) 887 if (key.offset > search_start) {
822 *max_avail = hole_size; 888 hole_size = key.offset - search_start;
889
890 if (hole_size > max_hole_size) {
891 max_hole_start = search_start;
892 max_hole_size = hole_size;
893 }
823 894
824 if (key.offset > last_byte && 895 /*
825 hole_size >= num_bytes) { 896 * If this free space is greater than what we need,
826 *start = last_byte; 897 * it must be the max free space that we have found
827 goto check_pending; 898 * until now, so max_hole_start must point to the start
899 * of this free space and the length of this free space
900 * is stored in max_hole_size. Thus, we return
901 * max_hole_start and max_hole_size and go back to the
902 * caller.
903 */
904 if (hole_size >= num_bytes) {
905 ret = 0;
906 goto out;
828 } 907 }
829 } 908 }
830 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
831 goto next;
832 909
833 start_found = 1;
834 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 910 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
835 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 911 extent_end = key.offset + btrfs_dev_extent_length(l,
912 dev_extent);
913 if (extent_end > search_start)
914 search_start = extent_end;
836next: 915next:
837 path->slots[0]++; 916 path->slots[0]++;
838 cond_resched(); 917 cond_resched();
839 } 918 }
840check_pending:
841 /* we have to make sure we didn't find an extent that has already
842 * been allocated by the map tree or the original allocation
843 */
844 BUG_ON(*start < search_start);
845 919
846 if (*start + num_bytes > search_end) { 920 hole_size = search_end - search_start;
847 ret = -ENOSPC; 921 if (hole_size > max_hole_size) {
848 goto error; 922 max_hole_start = search_start;
923 max_hole_size = hole_size;
849 } 924 }
850 /* check for pending inserts here */
851 ret = 0;
852 925
853error: 926 /* See above. */
927 if (hole_size < num_bytes)
928 ret = -ENOSPC;
929 else
930 ret = 0;
931
932out:
854 btrfs_free_path(path); 933 btrfs_free_path(path);
934error:
935 *start = max_hole_start;
936 if (len)
937 *len = max_hole_size;
855 return ret; 938 return ret;
856} 939}
857 940
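The rewrite gives find_free_dev_extent() a sharper contract: on success, *start points at a hole of at least num_bytes; on -ENOSPC, *start/*len still describe the largest hole found, which the new chunk allocator below relies on. A hedged caller sketch (largest_or_exact_hole is hypothetical):

    /* Returns how many bytes are actually usable at *start. */
    static u64 largest_or_exact_hole(struct btrfs_trans_handle *trans,
    				 struct btrfs_device *device,
    				 u64 num_bytes, u64 *start)
    {
    	u64 len = 0;
    	int ret;

    	ret = find_free_dev_extent(trans, device, num_bytes, start, &len);
    	if (ret == 0)
    		return num_bytes;	/* exact fit found at *start */
    	if (ret == -ENOSPC)
    		return len;		/* fall back to the max hole */
    	return 0;			/* hard error */
    }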
@@ -879,14 +962,14 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
879 if (ret > 0) { 962 if (ret > 0) {
880 ret = btrfs_previous_item(root, path, key.objectid, 963 ret = btrfs_previous_item(root, path, key.objectid,
881 BTRFS_DEV_EXTENT_KEY); 964 BTRFS_DEV_EXTENT_KEY);
882 BUG_ON(ret); 965 if (ret)
966 goto out;
883 leaf = path->nodes[0]; 967 leaf = path->nodes[0];
884 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 968 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
885 extent = btrfs_item_ptr(leaf, path->slots[0], 969 extent = btrfs_item_ptr(leaf, path->slots[0],
886 struct btrfs_dev_extent); 970 struct btrfs_dev_extent);
887 BUG_ON(found_key.offset > start || found_key.offset + 971 BUG_ON(found_key.offset > start || found_key.offset +
888 btrfs_dev_extent_length(leaf, extent) < start); 972 btrfs_dev_extent_length(leaf, extent) < start);
889 ret = 0;
890 } else if (ret == 0) { 973 } else if (ret == 0) {
891 leaf = path->nodes[0]; 974 leaf = path->nodes[0];
892 extent = btrfs_item_ptr(leaf, path->slots[0], 975 extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -897,8 +980,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
897 if (device->bytes_used > 0) 980 if (device->bytes_used > 0)
898 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 981 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
899 ret = btrfs_del_item(trans, root, path); 982 ret = btrfs_del_item(trans, root, path);
900 BUG_ON(ret);
901 983
984out:
902 btrfs_free_path(path); 985 btrfs_free_path(path);
903 return ret; 986 return ret;
904} 987}
@@ -1098,6 +1181,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1098 return -ENOMEM; 1181 return -ENOMEM;
1099 1182
1100 trans = btrfs_start_transaction(root, 0); 1183 trans = btrfs_start_transaction(root, 0);
1184 if (IS_ERR(trans)) {
1185 btrfs_free_path(path);
1186 return PTR_ERR(trans);
1187 }
1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1188 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1102 key.type = BTRFS_DEV_ITEM_KEY; 1189 key.type = BTRFS_DEV_ITEM_KEY;
1103 key.offset = device->devid; 1190 key.offset = device->devid;
@@ -1129,11 +1216,13 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1129 struct block_device *bdev; 1216 struct block_device *bdev;
1130 struct buffer_head *bh = NULL; 1217 struct buffer_head *bh = NULL;
1131 struct btrfs_super_block *disk_super; 1218 struct btrfs_super_block *disk_super;
1219 struct btrfs_fs_devices *cur_devices;
1132 u64 all_avail; 1220 u64 all_avail;
1133 u64 devid; 1221 u64 devid;
1134 u64 num_devices; 1222 u64 num_devices;
1135 u8 *dev_uuid; 1223 u8 *dev_uuid;
1136 int ret = 0; 1224 int ret = 0;
1225 bool clear_super = false;
1137 1226
1138 mutex_lock(&uuid_mutex); 1227 mutex_lock(&uuid_mutex);
1139 mutex_lock(&root->fs_info->volume_mutex); 1228 mutex_lock(&root->fs_info->volume_mutex);
@@ -1164,14 +1253,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1164 1253
1165 device = NULL; 1254 device = NULL;
1166 devices = &root->fs_info->fs_devices->devices; 1255 devices = &root->fs_info->fs_devices->devices;
1167 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1256 /*
1257 * It is safe to read the devices since the volume_mutex
1258 * is held.
1259 */
1168 list_for_each_entry(tmp, devices, dev_list) { 1260 list_for_each_entry(tmp, devices, dev_list) {
1169 if (tmp->in_fs_metadata && !tmp->bdev) { 1261 if (tmp->in_fs_metadata && !tmp->bdev) {
1170 device = tmp; 1262 device = tmp;
1171 break; 1263 break;
1172 } 1264 }
1173 } 1265 }
1174 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1175 bdev = NULL; 1266 bdev = NULL;
1176 bh = NULL; 1267 bh = NULL;
1177 disk_super = NULL; 1268 disk_super = NULL;
@@ -1181,8 +1272,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1181 goto out; 1272 goto out;
1182 } 1273 }
1183 } else { 1274 } else {
1184 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1275 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1185 root->fs_info->bdev_holder); 1276 root->fs_info->bdev_holder);
1186 if (IS_ERR(bdev)) { 1277 if (IS_ERR(bdev)) {
1187 ret = PTR_ERR(bdev); 1278 ret = PTR_ERR(bdev);
1188 goto out; 1279 goto out;
@@ -1191,7 +1282,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1191 set_blocksize(bdev, 4096); 1282 set_blocksize(bdev, 4096);
1192 bh = btrfs_read_dev_super(bdev); 1283 bh = btrfs_read_dev_super(bdev);
1193 if (!bh) { 1284 if (!bh) {
1194 ret = -EIO; 1285 ret = -EINVAL;
1195 goto error_close; 1286 goto error_close;
1196 } 1287 }
1197 disk_super = (struct btrfs_super_block *)bh->b_data; 1288 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1213,31 +1304,39 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1213 } 1304 }
1214 1305
1215 if (device->writeable) { 1306 if (device->writeable) {
1307 lock_chunks(root);
1216 list_del_init(&device->dev_alloc_list); 1308 list_del_init(&device->dev_alloc_list);
1309 unlock_chunks(root);
1217 root->fs_info->fs_devices->rw_devices--; 1310 root->fs_info->fs_devices->rw_devices--;
1311 clear_super = true;
1218 } 1312 }
1219 1313
1220 ret = btrfs_shrink_device(device, 0); 1314 ret = btrfs_shrink_device(device, 0);
1221 if (ret) 1315 if (ret)
1222 goto error_brelse; 1316 goto error_undo;
1223 1317
1224 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1318 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1225 if (ret) 1319 if (ret)
1226 goto error_brelse; 1320 goto error_undo;
1227 1321
1228 device->in_fs_metadata = 0; 1322 device->in_fs_metadata = 0;
1323 btrfs_scrub_cancel_dev(root, device);
1229 1324
1230 /* 1325 /*
1231 * the device list mutex makes sure that we don't change 1326 * the device list mutex makes sure that we don't change
1232 * the device list while someone else is writing out all 1327 * the device list while someone else is writing out all
1233 * the device supers. 1328 * the device supers.
1234 */ 1329 */
1330
1331 cur_devices = device->fs_devices;
1235 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1332 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1236 list_del_init(&device->dev_list); 1333 list_del_rcu(&device->dev_list);
1237 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1238 1334
1239 device->fs_devices->num_devices--; 1335 device->fs_devices->num_devices--;
1240 1336
1337 if (device->missing)
1338 root->fs_info->fs_devices->missing_devices--;
1339
1241 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1340 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1242 struct btrfs_device, dev_list); 1341 struct btrfs_device, dev_list);
1243 if (device->bdev == root->fs_info->sb->s_bdev) 1342 if (device->bdev == root->fs_info->sb->s_bdev)
@@ -1245,34 +1344,36 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1245 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1344 if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1246 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1345 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1247 1346
1248 if (device->bdev) { 1347 if (device->bdev)
1249 close_bdev_exclusive(device->bdev, device->mode);
1250 device->bdev = NULL;
1251 device->fs_devices->open_devices--; 1348 device->fs_devices->open_devices--;
1252 } 1349
1350 call_rcu(&device->rcu, free_device);
1351 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1253 1352
1254 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1353 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1255 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1354 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1256 1355
1257 if (device->fs_devices->open_devices == 0) { 1356 if (cur_devices->open_devices == 0) {
1258 struct btrfs_fs_devices *fs_devices; 1357 struct btrfs_fs_devices *fs_devices;
1259 fs_devices = root->fs_info->fs_devices; 1358 fs_devices = root->fs_info->fs_devices;
1260 while (fs_devices) { 1359 while (fs_devices) {
1261 if (fs_devices->seed == device->fs_devices) 1360 if (fs_devices->seed == cur_devices)
1262 break; 1361 break;
1263 fs_devices = fs_devices->seed; 1362 fs_devices = fs_devices->seed;
1264 } 1363 }
1265 fs_devices->seed = device->fs_devices->seed; 1364 fs_devices->seed = cur_devices->seed;
1266 device->fs_devices->seed = NULL; 1365 cur_devices->seed = NULL;
1267 __btrfs_close_devices(device->fs_devices); 1366 lock_chunks(root);
1268 free_fs_devices(device->fs_devices); 1367 __btrfs_close_devices(cur_devices);
1368 unlock_chunks(root);
1369 free_fs_devices(cur_devices);
1269 } 1370 }
1270 1371
1271 /* 1372 /*
1272 * at this point, the device is zero sized. We want to 1373 * at this point, the device is zero sized. We want to
1273 * remove it from the devices list and zero out the old super 1374 * remove it from the devices list and zero out the old super
1274 */ 1375 */
1275 if (device->writeable) { 1376 if (clear_super) {
1276 /* make sure this device isn't detected as part of 1377 /* make sure this device isn't detected as part of
1277 * the FS anymore 1378 * the FS anymore
1278 */ 1379 */
@@ -1281,19 +1382,26 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1281 sync_dirty_buffer(bh); 1382 sync_dirty_buffer(bh);
1282 } 1383 }
1283 1384
1284 kfree(device->name);
1285 kfree(device);
1286 ret = 0; 1385 ret = 0;
1287 1386
1288error_brelse: 1387error_brelse:
1289 brelse(bh); 1388 brelse(bh);
1290error_close: 1389error_close:
1291 if (bdev) 1390 if (bdev)
1292 close_bdev_exclusive(bdev, FMODE_READ); 1391 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1293out: 1392out:
1294 mutex_unlock(&root->fs_info->volume_mutex); 1393 mutex_unlock(&root->fs_info->volume_mutex);
1295 mutex_unlock(&uuid_mutex); 1394 mutex_unlock(&uuid_mutex);
1296 return ret; 1395 return ret;
1396error_undo:
1397 if (device->writeable) {
1398 lock_chunks(root);
1399 list_add(&device->dev_alloc_list,
1400 &root->fs_info->fs_devices->alloc_list);
1401 unlock_chunks(root);
1402 root->fs_info->fs_devices->rw_devices++;
1403 }
1404 goto error_brelse;
1297} 1405}
1298 1406
1299/* 1407/*
@@ -1330,7 +1438,12 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1330 INIT_LIST_HEAD(&seed_devices->devices); 1438 INIT_LIST_HEAD(&seed_devices->devices);
1331 INIT_LIST_HEAD(&seed_devices->alloc_list); 1439 INIT_LIST_HEAD(&seed_devices->alloc_list);
1332 mutex_init(&seed_devices->device_list_mutex); 1440 mutex_init(&seed_devices->device_list_mutex);
1333 list_splice_init(&fs_devices->devices, &seed_devices->devices); 1441
1442 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1443 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1444 synchronize_rcu);
1445 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1446
1334 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1447 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1335 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1448 list_for_each_entry(device, &seed_devices->devices, dev_list) {
1336 device->fs_devices = seed_devices; 1449 device->fs_devices = seed_devices;
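list_splice_init_rcu() moves every entry from one RCU-protected list onto another and reinitializes the source head, invoking the supplied synchronization callback (here synchronize_rcu) so that readers still walking the old list drain first. A minimal sketch:

    #include <linux/rculist.h>

    /* Splice 'src' onto 'dst' while readers may still traverse 'src'
     * under rcu_read_lock(); safe to reuse 'src' afterwards. */
    static void move_all_rcu(struct list_head *src, struct list_head *dst)
    {
    	list_splice_init_rcu(src, dst, synchronize_rcu);
    }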
@@ -1391,7 +1504,7 @@ next_slot:
1391 goto error; 1504 goto error;
1392 leaf = path->nodes[0]; 1505 leaf = path->nodes[0];
1393 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1506 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1394 btrfs_release_path(root, path); 1507 btrfs_release_path(path);
1395 continue; 1508 continue;
1396 } 1509 }
1397 1510
@@ -1441,7 +1554,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1441 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1554 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1442 return -EINVAL; 1555 return -EINVAL;
1443 1556
1444 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1557 bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1558 root->fs_info->bdev_holder);
1445 if (IS_ERR(bdev)) 1559 if (IS_ERR(bdev))
1446 return PTR_ERR(bdev); 1560 return PTR_ERR(bdev);
1447 1561
@@ -1482,14 +1596,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1482 1596
1483 ret = find_next_devid(root, &device->devid); 1597 ret = find_next_devid(root, &device->devid);
1484 if (ret) { 1598 if (ret) {
1599 kfree(device->name);
1485 kfree(device); 1600 kfree(device);
1486 goto error; 1601 goto error;
1487 } 1602 }
1488 1603
1489 trans = btrfs_start_transaction(root, 0); 1604 trans = btrfs_start_transaction(root, 0);
1605 if (IS_ERR(trans)) {
1606 kfree(device->name);
1607 kfree(device);
1608 ret = PTR_ERR(trans);
1609 goto error;
1610 }
1611
1490 lock_chunks(root); 1612 lock_chunks(root);
1491 1613
1492 device->barriers = 1;
1493 device->writeable = 1; 1614 device->writeable = 1;
1494 device->work.func = pending_bios_fn; 1615 device->work.func = pending_bios_fn;
1495 generate_random_uuid(device->uuid); 1616 generate_random_uuid(device->uuid);
@@ -1503,7 +1624,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1503 device->dev_root = root->fs_info->dev_root; 1624 device->dev_root = root->fs_info->dev_root;
1504 device->bdev = bdev; 1625 device->bdev = bdev;
1505 device->in_fs_metadata = 1; 1626 device->in_fs_metadata = 1;
1506 device->mode = 0; 1627 device->mode = FMODE_EXCL;
1507 set_blocksize(device->bdev, 4096); 1628 set_blocksize(device->bdev, 4096);
1508 1629
1509 if (seeding_dev) { 1630 if (seeding_dev) {
@@ -1519,7 +1640,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1519 * half setup 1640 * half setup
1520 */ 1641 */
1521 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1642 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1522 list_add(&device->dev_list, &root->fs_info->fs_devices->devices); 1643 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1523 list_add(&device->dev_alloc_list, 1644 list_add(&device->dev_alloc_list,
1524 &root->fs_info->fs_devices->alloc_list); 1645 &root->fs_info->fs_devices->alloc_list);
1525 root->fs_info->fs_devices->num_devices++; 1646 root->fs_info->fs_devices->num_devices++;
@@ -1568,7 +1689,7 @@ out:
1568 mutex_unlock(&root->fs_info->volume_mutex); 1689 mutex_unlock(&root->fs_info->volume_mutex);
1569 return ret; 1690 return ret;
1570error: 1691error:
1571 close_bdev_exclusive(bdev, 0); 1692 blkdev_put(bdev, FMODE_EXCL);
1572 if (seeding_dev) { 1693 if (seeding_dev) {
1573 mutex_unlock(&uuid_mutex); 1694 mutex_unlock(&uuid_mutex);
1574 up_write(&sb->s_umount); 1695 up_write(&sb->s_umount);
@@ -1677,10 +1798,9 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1677 BUG_ON(ret); 1798 BUG_ON(ret);
1678 1799
1679 ret = btrfs_del_item(trans, root, path); 1800 ret = btrfs_del_item(trans, root, path);
1680 BUG_ON(ret);
1681 1801
1682 btrfs_free_path(path); 1802 btrfs_free_path(path);
1683 return 0; 1803 return ret;
1684} 1804}
1685 1805
1686static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1806static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
@@ -1755,7 +1875,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1755 return ret; 1875 return ret;
1756 1876
1757 trans = btrfs_start_transaction(root, 0); 1877 trans = btrfs_start_transaction(root, 0);
1758 BUG_ON(!trans); 1878 BUG_ON(IS_ERR(trans));
1759 1879
1760 lock_chunks(root); 1880 lock_chunks(root);
1761 1881
@@ -1786,6 +1906,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1786 1906
1787 BUG_ON(ret); 1907 BUG_ON(ret);
1788 1908
1909 trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1910
1789 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1911 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1790 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1912 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1791 BUG_ON(ret); 1913 BUG_ON(ret);
@@ -1853,7 +1975,7 @@ again:
1853 chunk = btrfs_item_ptr(leaf, path->slots[0], 1975 chunk = btrfs_item_ptr(leaf, path->slots[0],
1854 struct btrfs_chunk); 1976 struct btrfs_chunk);
1855 chunk_type = btrfs_chunk_type(leaf, chunk); 1977 chunk_type = btrfs_chunk_type(leaf, chunk);
1856 btrfs_release_path(chunk_root, path); 1978 btrfs_release_path(path);
1857 1979
1858 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 1980 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1859 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 1981 ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
@@ -1901,7 +2023,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
1901 u64 size_to_free; 2023 u64 size_to_free;
1902 struct btrfs_path *path; 2024 struct btrfs_path *path;
1903 struct btrfs_key key; 2025 struct btrfs_key key;
1904 struct btrfs_chunk *chunk;
1905 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 2026 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1906 struct btrfs_trans_handle *trans; 2027 struct btrfs_trans_handle *trans;
1907 struct btrfs_key found_key; 2028 struct btrfs_key found_key;
@@ -1909,6 +2030,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
1909 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2030 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1910 return -EROFS; 2031 return -EROFS;
1911 2032
2033 if (!capable(CAP_SYS_ADMIN))
2034 return -EPERM;
2035
1912 mutex_lock(&dev_root->fs_info->volume_mutex); 2036 mutex_lock(&dev_root->fs_info->volume_mutex);
1913 dev_root = dev_root->fs_info->dev_root; 2037 dev_root = dev_root->fs_info->dev_root;
1914 2038
@@ -1927,7 +2051,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1927 BUG_ON(ret); 2051 BUG_ON(ret);
1928 2052
1929 trans = btrfs_start_transaction(dev_root, 0); 2053 trans = btrfs_start_transaction(dev_root, 0);
1930 BUG_ON(!trans); 2054 BUG_ON(IS_ERR(trans));
1931 2055
1932 ret = btrfs_grow_device(trans, device, old_size); 2056 ret = btrfs_grow_device(trans, device, old_size);
1933 BUG_ON(ret); 2057 BUG_ON(ret);
@@ -1965,19 +2089,17 @@ int btrfs_balance(struct btrfs_root *dev_root)
1965 if (found_key.objectid != key.objectid) 2089 if (found_key.objectid != key.objectid)
1966 break; 2090 break;
1967 2091
1968 chunk = btrfs_item_ptr(path->nodes[0],
1969 path->slots[0],
1970 struct btrfs_chunk);
1971 /* chunk zero is special */ 2092 /* chunk zero is special */
1972 if (found_key.offset == 0) 2093 if (found_key.offset == 0)
1973 break; 2094 break;
1974 2095
1975 btrfs_release_path(chunk_root, path); 2096 btrfs_release_path(path);
1976 ret = btrfs_relocate_chunk(chunk_root, 2097 ret = btrfs_relocate_chunk(chunk_root,
1977 chunk_root->root_key.objectid, 2098 chunk_root->root_key.objectid,
1978 found_key.objectid, 2099 found_key.objectid,
1979 found_key.offset); 2100 found_key.offset);
1980 BUG_ON(ret && ret != -ENOSPC); 2101 if (ret && ret != -ENOSPC)
2102 goto error;
1981 key.offset = found_key.offset - 1; 2103 key.offset = found_key.offset - 1;
1982 } 2104 }
1983 ret = 0; 2105 ret = 0;
@@ -2044,7 +2166,7 @@ again:
2044 goto done; 2166 goto done;
2045 if (ret) { 2167 if (ret) {
2046 ret = 0; 2168 ret = 0;
2047 btrfs_release_path(root, path); 2169 btrfs_release_path(path);
2048 break; 2170 break;
2049 } 2171 }
2050 2172
@@ -2053,7 +2175,7 @@ again:
2053 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2175 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2054 2176
2055 if (key.objectid != device->devid) { 2177 if (key.objectid != device->devid) {
2056 btrfs_release_path(root, path); 2178 btrfs_release_path(path);
2057 break; 2179 break;
2058 } 2180 }
2059 2181
@@ -2061,14 +2183,14 @@ again:
2061 length = btrfs_dev_extent_length(l, dev_extent); 2183 length = btrfs_dev_extent_length(l, dev_extent);
2062 2184
2063 if (key.offset + length <= new_size) { 2185 if (key.offset + length <= new_size) {
2064 btrfs_release_path(root, path); 2186 btrfs_release_path(path);
2065 break; 2187 break;
2066 } 2188 }
2067 2189
2068 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2190 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2069 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2191 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2070 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2192 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2071 btrfs_release_path(root, path); 2193 btrfs_release_path(path);
2072 2194
2073 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 2195 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2074 chunk_offset); 2196 chunk_offset);
@@ -2096,6 +2218,11 @@ again:
2096 2218
2097 /* Shrinking succeeded, else we would be at "done". */ 2219 /* Shrinking succeeded, else we would be at "done". */
2098 trans = btrfs_start_transaction(root, 0); 2220 trans = btrfs_start_transaction(root, 0);
2221 if (IS_ERR(trans)) {
2222 ret = PTR_ERR(trans);
2223 goto done;
2224 }
2225
2099 lock_chunks(root); 2226 lock_chunks(root);
2100 2227
2101 device->disk_total_bytes = new_size; 2228 device->disk_total_bytes = new_size;
@@ -2139,211 +2266,243 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2139 return 0; 2266 return 0;
2140} 2267}
2141 2268
2142static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, 2269/*
2143 int num_stripes, int sub_stripes) 2270 * sort the devices in descending order by max_avail, total_avail
2271 */
2272static int btrfs_cmp_device_info(const void *a, const void *b)
2144{ 2273{
2145 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) 2274 const struct btrfs_device_info *di_a = a;
2146 return calc_size; 2275 const struct btrfs_device_info *di_b = b;
2147 else if (type & BTRFS_BLOCK_GROUP_RAID10) 2276
2148 return calc_size * (num_stripes / sub_stripes); 2277 if (di_a->max_avail > di_b->max_avail)
2149 else 2278 return -1;
2150 return calc_size * num_stripes; 2279 if (di_a->max_avail < di_b->max_avail)
2280 return 1;
2281 if (di_a->total_avail > di_b->total_avail)
2282 return -1;
2283 if (di_a->total_avail < di_b->total_avail)
2284 return 1;
2285 return 0;
2151} 2286}
2152 2287
2153static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2288static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2154 struct btrfs_root *extent_root, 2289 struct btrfs_root *extent_root,
2155 struct map_lookup **map_ret, 2290 struct map_lookup **map_ret,
2156 u64 *num_bytes, u64 *stripe_size, 2291 u64 *num_bytes_out, u64 *stripe_size_out,
2157 u64 start, u64 type) 2292 u64 start, u64 type)
2158{ 2293{
2159 struct btrfs_fs_info *info = extent_root->fs_info; 2294 struct btrfs_fs_info *info = extent_root->fs_info;
2160 struct btrfs_device *device = NULL;
2161 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2295 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2162 struct list_head *cur; 2296 struct list_head *cur;
2163 struct map_lookup *map = NULL; 2297 struct map_lookup *map = NULL;
2164 struct extent_map_tree *em_tree; 2298 struct extent_map_tree *em_tree;
2165 struct extent_map *em; 2299 struct extent_map *em;
2166 struct list_head private_devs; 2300 struct btrfs_device_info *devices_info = NULL;
2167 int min_stripe_size = 1 * 1024 * 1024; 2301 u64 total_avail;
2168 u64 calc_size = 1024 * 1024 * 1024; 2302 int num_stripes; /* total number of stripes to allocate */
2169 u64 max_chunk_size = calc_size; 2303 int sub_stripes; /* sub_stripes info for map */
2170 u64 min_free; 2304 int dev_stripes; /* stripes per dev */
2171 u64 avail; 2305 int devs_max; /* max devs to use */
2172 u64 max_avail = 0; 2306 int devs_min; /* min devs needed */
2173 u64 dev_offset; 2307 int devs_increment; /* ndevs has to be a multiple of this */
2174 int num_stripes = 1; 2308 int ncopies; /* how many copies of the data we have */
2175 int min_stripes = 1;
2176 int sub_stripes = 0;
2177 int looped = 0;
2178 int ret; 2309 int ret;
2179 int index; 2310 u64 max_stripe_size;
2180 int stripe_len = 64 * 1024; 2311 u64 max_chunk_size;
2312 u64 stripe_size;
2313 u64 num_bytes;
2314 int ndevs;
2315 int i;
2316 int j;
2181 2317
2182 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2318 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2183 (type & BTRFS_BLOCK_GROUP_DUP)) { 2319 (type & BTRFS_BLOCK_GROUP_DUP)) {
2184 WARN_ON(1); 2320 WARN_ON(1);
2185 type &= ~BTRFS_BLOCK_GROUP_DUP; 2321 type &= ~BTRFS_BLOCK_GROUP_DUP;
2186 } 2322 }
2323
2187 if (list_empty(&fs_devices->alloc_list)) 2324 if (list_empty(&fs_devices->alloc_list))
2188 return -ENOSPC; 2325 return -ENOSPC;
2189 2326
2190 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2327 sub_stripes = 1;
2191 num_stripes = fs_devices->rw_devices; 2328 dev_stripes = 1;
2192 min_stripes = 2; 2329 devs_increment = 1;
2193 } 2330 ncopies = 1;
2331 devs_max = 0; /* 0 == as many as possible */
2332 devs_min = 1;
2333
2334 /*
2335 * define the properties of each RAID type.
2336 * FIXME: move this to a global table and use it in all RAID
2337 * calculation code
2338 */
2194 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2339 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2195 num_stripes = 2; 2340 dev_stripes = 2;
2196 min_stripes = 2; 2341 ncopies = 2;
2197 } 2342 devs_max = 1;
2198 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2343 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2199 if (fs_devices->rw_devices < 2) 2344 devs_min = 2;
2200 return -ENOSPC; 2345 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2201 num_stripes = 2; 2346 devs_increment = 2;
2202 min_stripes = 2; 2347 ncopies = 2;
2203 } 2348 devs_max = 2;
2204 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2349 devs_min = 2;
2205 num_stripes = fs_devices->rw_devices; 2350 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2206 if (num_stripes < 4)
2207 return -ENOSPC;
2208 num_stripes &= ~(u32)1;
2209 sub_stripes = 2; 2351 sub_stripes = 2;
2210 min_stripes = 4; 2352 devs_increment = 2;
2353 ncopies = 2;
2354 devs_min = 4;
2355 } else {
2356 devs_max = 1;
2211 } 2357 }
2212 2358
2213 if (type & BTRFS_BLOCK_GROUP_DATA) { 2359 if (type & BTRFS_BLOCK_GROUP_DATA) {
2214 max_chunk_size = 10 * calc_size; 2360 max_stripe_size = 1024 * 1024 * 1024;
2215 min_stripe_size = 64 * 1024 * 1024; 2361 max_chunk_size = 10 * max_stripe_size;
2216 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 2362 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2217 max_chunk_size = 256 * 1024 * 1024; 2363 max_stripe_size = 256 * 1024 * 1024;
2218 min_stripe_size = 32 * 1024 * 1024; 2364 max_chunk_size = max_stripe_size;
2219 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 2365 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2220 calc_size = 8 * 1024 * 1024; 2366 max_stripe_size = 8 * 1024 * 1024;
2221 max_chunk_size = calc_size * 2; 2367 max_chunk_size = 2 * max_stripe_size;
2222 min_stripe_size = 1 * 1024 * 1024; 2368 } else {
2369 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
2370 type);
2371 BUG_ON(1);
2223 } 2372 }
2224 2373
2225 /* we don't want a chunk larger than 10% of writeable space */ 2374 /* we don't want a chunk larger than 10% of writeable space */
2226 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2375 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2227 max_chunk_size); 2376 max_chunk_size);
2228 2377
2229again: 2378 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2230 max_avail = 0; 2379 GFP_NOFS);
2231 if (!map || map->num_stripes != num_stripes) { 2380 if (!devices_info)
2232 kfree(map); 2381 return -ENOMEM;
2233 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2234 if (!map)
2235 return -ENOMEM;
2236 map->num_stripes = num_stripes;
2237 }
2238
2239 if (calc_size * num_stripes > max_chunk_size) {
2240 calc_size = max_chunk_size;
2241 do_div(calc_size, num_stripes);
2242 do_div(calc_size, stripe_len);
2243 calc_size *= stripe_len;
2244 }
2245 2382
2246 /* we don't want tiny stripes */ 2383 cur = fs_devices->alloc_list.next;
2247 if (!looped)
2248 calc_size = max_t(u64, min_stripe_size, calc_size);
2249 2384
2250 /* 2385 /*
2251 * we're about to do_div by the stripe_len so lets make sure 2386 * in the first pass through the devices list, we gather information
2252 * we end up with something bigger than a stripe 2387 * about the available holes on each device.
2253 */ 2388 */
2254 calc_size = max_t(u64, calc_size, stripe_len * 4); 2389 ndevs = 0;
2390 while (cur != &fs_devices->alloc_list) {
2391 struct btrfs_device *device;
2392 u64 max_avail;
2393 u64 dev_offset;
2255 2394
2256 do_div(calc_size, stripe_len); 2395 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2257 calc_size *= stripe_len;
2258 2396
2259 cur = fs_devices->alloc_list.next; 2397 cur = cur->next;
2260 index = 0;
2261 2398
2262 if (type & BTRFS_BLOCK_GROUP_DUP) 2399 if (!device->writeable) {
2263 min_free = calc_size * 2; 2400 printk(KERN_ERR
2264 else 2401 "btrfs: read-only device in alloc_list\n");
2265 min_free = calc_size; 2402 WARN_ON(1);
2403 continue;
2404 }
2266 2405
2267 /* 2406 if (!device->in_fs_metadata)
2268 * we add 1MB because we never use the first 1MB of the device, unless 2407 continue;
2269 * we've looped, then we are likely allocating the maximum amount of
2270 * space left already
2271 */
2272 if (!looped)
2273 min_free += 1024 * 1024;
2274 2408
2275 INIT_LIST_HEAD(&private_devs);
2276 while (index < num_stripes) {
2277 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2278 BUG_ON(!device->writeable);
2279 if (device->total_bytes > device->bytes_used) 2409 if (device->total_bytes > device->bytes_used)
2280 avail = device->total_bytes - device->bytes_used; 2410 total_avail = device->total_bytes - device->bytes_used;
2281 else 2411 else
2282 avail = 0; 2412 total_avail = 0;
2283 cur = cur->next; 2413 /* avail is off by max(alloc_start, 1MB), but that is the same
2414 * for all devices, so it doesn't hurt the sorting later on
2415 */
2284 2416
2285 if (device->in_fs_metadata && avail >= min_free) { 2417 ret = find_free_dev_extent(trans, device,
2286 ret = find_free_dev_extent(trans, device, 2418 max_stripe_size * dev_stripes,
2287 min_free, &dev_offset, 2419 &dev_offset, &max_avail);
2288 &max_avail); 2420 if (ret && ret != -ENOSPC)
2289 if (ret == 0) { 2421 goto error;
2290 list_move_tail(&device->dev_alloc_list, 2422
2291 &private_devs); 2423 if (ret == 0)
2292 map->stripes[index].dev = device; 2424 max_avail = max_stripe_size * dev_stripes;
2293 map->stripes[index].physical = dev_offset; 2425
2294 index++; 2426 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
2295 if (type & BTRFS_BLOCK_GROUP_DUP) { 2427 continue;
2296 map->stripes[index].dev = device; 2428
2297 map->stripes[index].physical = 2429 devices_info[ndevs].dev_offset = dev_offset;
2298 dev_offset + calc_size; 2430 devices_info[ndevs].max_avail = max_avail;
2299 index++; 2431 devices_info[ndevs].total_avail = total_avail;
2300 } 2432 devices_info[ndevs].dev = device;
2301 } 2433 ++ndevs;
2302 } else if (device->in_fs_metadata && avail > max_avail)
2303 max_avail = avail;
2304 if (cur == &fs_devices->alloc_list)
2305 break;
2306 } 2434 }
2307 list_splice(&private_devs, &fs_devices->alloc_list); 2435
2308 if (index < num_stripes) { 2436 /*
2309 if (index >= min_stripes) { 2437 * now sort the devices by hole size / available space
2310 num_stripes = index; 2438 */
2311 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2439 sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
2312 num_stripes /= sub_stripes; 2440 btrfs_cmp_device_info, NULL);
2313 num_stripes *= sub_stripes; 2441
2314 } 2442 /* round down to number of usable stripes */
2315 looped = 1; 2443 ndevs -= ndevs % devs_increment;
2316 goto again; 2444
2317 } 2445 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
2318 if (!looped && max_avail > 0) { 2446 ret = -ENOSPC;
2319 looped = 1; 2447 goto error;
2320 calc_size = max_avail; 2448 }
2321 goto again; 2449
2450 if (devs_max && ndevs > devs_max)
2451 ndevs = devs_max;
2452 /*
2453 * the primary goal is to maximize the number of stripes, so use as many
2454 * devices as possible, even if the stripes are not maximum sized.
2455 */
2456 stripe_size = devices_info[ndevs-1].max_avail;
2457 num_stripes = ndevs * dev_stripes;
2458
2459 if (stripe_size * num_stripes > max_chunk_size * ncopies) {
2460 stripe_size = max_chunk_size * ncopies;
2461 do_div(stripe_size, num_stripes);
2462 }
2463
2464 do_div(stripe_size, dev_stripes);
2465 do_div(stripe_size, BTRFS_STRIPE_LEN);
2466 stripe_size *= BTRFS_STRIPE_LEN;
2467
2468 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2469 if (!map) {
2470 ret = -ENOMEM;
2471 goto error;
2472 }
2473 map->num_stripes = num_stripes;
2474
2475 for (i = 0; i < ndevs; ++i) {
2476 for (j = 0; j < dev_stripes; ++j) {
2477 int s = i * dev_stripes + j;
2478 map->stripes[s].dev = devices_info[i].dev;
2479 map->stripes[s].physical = devices_info[i].dev_offset +
2480 j * stripe_size;
2322 } 2481 }
2323 kfree(map);
2324 return -ENOSPC;
2325 } 2482 }
2326 map->sector_size = extent_root->sectorsize; 2483 map->sector_size = extent_root->sectorsize;
2327 map->stripe_len = stripe_len; 2484 map->stripe_len = BTRFS_STRIPE_LEN;
2328 map->io_align = stripe_len; 2485 map->io_align = BTRFS_STRIPE_LEN;
2329 map->io_width = stripe_len; 2486 map->io_width = BTRFS_STRIPE_LEN;
2330 map->type = type; 2487 map->type = type;
2331 map->num_stripes = num_stripes;
2332 map->sub_stripes = sub_stripes; 2488 map->sub_stripes = sub_stripes;
2333 2489
2334 *map_ret = map; 2490 *map_ret = map;
2335 *stripe_size = calc_size; 2491 num_bytes = stripe_size * (num_stripes / ncopies);
2336 *num_bytes = chunk_bytes_by_type(type, calc_size, 2492
2337 num_stripes, sub_stripes); 2493 *stripe_size_out = stripe_size;
2494 *num_bytes_out = num_bytes;
2338 2495
2339 em = alloc_extent_map(GFP_NOFS); 2496 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
2497
2498 em = alloc_extent_map();
2340 if (!em) { 2499 if (!em) {
2341 kfree(map); 2500 ret = -ENOMEM;
2342 return -ENOMEM; 2501 goto error;
2343 } 2502 }
2344 em->bdev = (struct block_device *)map; 2503 em->bdev = (struct block_device *)map;
2345 em->start = start; 2504 em->start = start;
2346 em->len = *num_bytes; 2505 em->len = num_bytes;
2347 em->block_start = 0; 2506 em->block_start = 0;
2348 em->block_len = em->len; 2507 em->block_len = em->len;
2349 2508
@@ -2356,23 +2515,30 @@ again:
2356 2515
2357 ret = btrfs_make_block_group(trans, extent_root, 0, type, 2516 ret = btrfs_make_block_group(trans, extent_root, 0, type,
2358 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2517 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2359 start, *num_bytes); 2518 start, num_bytes);
2360 BUG_ON(ret); 2519 BUG_ON(ret);
2361 2520
2362 index = 0; 2521 for (i = 0; i < map->num_stripes; ++i) {
2363 while (index < map->num_stripes) { 2522 struct btrfs_device *device;
2364 device = map->stripes[index].dev; 2523 u64 dev_offset;
2365 dev_offset = map->stripes[index].physical; 2524
2525 device = map->stripes[i].dev;
2526 dev_offset = map->stripes[i].physical;
2366 2527
2367 ret = btrfs_alloc_dev_extent(trans, device, 2528 ret = btrfs_alloc_dev_extent(trans, device,
2368 info->chunk_root->root_key.objectid, 2529 info->chunk_root->root_key.objectid,
2369 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2530 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2370 start, dev_offset, calc_size); 2531 start, dev_offset, stripe_size);
2371 BUG_ON(ret); 2532 BUG_ON(ret);
2372 index++;
2373 } 2533 }
2374 2534
2535 kfree(devices_info);
2375 return 0; 2536 return 0;
2537
2538error:
2539 kfree(map);
2540 kfree(devices_info);
2541 return ret;
2376} 2542}
2377 2543
2378static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2544static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
@@ -2438,6 +2604,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2438 item_size); 2604 item_size);
2439 BUG_ON(ret); 2605 BUG_ON(ret);
2440 } 2606 }
2607
2441 kfree(chunk); 2608 kfree(chunk);
2442 return 0; 2609 return 0;
2443} 2610}
@@ -2569,7 +2736,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2569 2736
2570void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 2737void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2571{ 2738{
2572 extent_map_tree_init(&tree->map_tree, GFP_NOFS); 2739 extent_map_tree_init(&tree->map_tree);
2573} 2740}
2574 2741
2575void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 2742void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
@@ -2635,14 +2802,17 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2635static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2802static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2636 u64 logical, u64 *length, 2803 u64 logical, u64 *length,
2637 struct btrfs_multi_bio **multi_ret, 2804 struct btrfs_multi_bio **multi_ret,
2638 int mirror_num, struct page *unplug_page) 2805 int mirror_num)
2639{ 2806{
2640 struct extent_map *em; 2807 struct extent_map *em;
2641 struct map_lookup *map; 2808 struct map_lookup *map;
2642 struct extent_map_tree *em_tree = &map_tree->map_tree; 2809 struct extent_map_tree *em_tree = &map_tree->map_tree;
2643 u64 offset; 2810 u64 offset;
2644 u64 stripe_offset; 2811 u64 stripe_offset;
2812 u64 stripe_end_offset;
2645 u64 stripe_nr; 2813 u64 stripe_nr;
2814 u64 stripe_nr_orig;
2815 u64 stripe_nr_end;
2646 int stripes_allocated = 8; 2816 int stripes_allocated = 8;
2647 int stripes_required = 1; 2817 int stripes_required = 1;
2648 int stripe_index; 2818 int stripe_index;
@@ -2651,7 +2821,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2651 int max_errors = 0; 2821 int max_errors = 0;
2652 struct btrfs_multi_bio *multi = NULL; 2822 struct btrfs_multi_bio *multi = NULL;
2653 2823
2654 if (multi_ret && !(rw & REQ_WRITE)) 2824 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2655 stripes_allocated = 1; 2825 stripes_allocated = 1;
2656again: 2826again:
2657 if (multi_ret) { 2827 if (multi_ret) {
@@ -2667,11 +2837,6 @@ again:
2667 em = lookup_extent_mapping(em_tree, logical, *length); 2837 em = lookup_extent_mapping(em_tree, logical, *length);
2668 read_unlock(&em_tree->lock); 2838 read_unlock(&em_tree->lock);
2669 2839
2670 if (!em && unplug_page) {
2671 kfree(multi);
2672 return 0;
2673 }
2674
2675 if (!em) { 2840 if (!em) {
2676 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2841 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2677 (unsigned long long)logical, 2842 (unsigned long long)logical,
@@ -2697,7 +2862,15 @@ again:
2697 max_errors = 1; 2862 max_errors = 1;
2698 } 2863 }
2699 } 2864 }
2700 if (multi_ret && (rw & REQ_WRITE) && 2865 if (rw & REQ_DISCARD) {
2866 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2867 BTRFS_BLOCK_GROUP_RAID1 |
2868 BTRFS_BLOCK_GROUP_DUP |
2869 BTRFS_BLOCK_GROUP_RAID10)) {
2870 stripes_required = map->num_stripes;
2871 }
2872 }
2873 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2701 stripes_allocated < stripes_required) { 2874 stripes_allocated < stripes_required) {
2702 stripes_allocated = map->num_stripes; 2875 stripes_allocated = map->num_stripes;
2703 free_extent_map(em); 2876 free_extent_map(em);
@@ -2717,23 +2890,37 @@ again:
2717 /* stripe_offset is the offset of this block in its stripe*/ 2890 /* stripe_offset is the offset of this block in its stripe*/
2718 stripe_offset = offset - stripe_offset; 2891 stripe_offset = offset - stripe_offset;
2719 2892
2720 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2893 if (rw & REQ_DISCARD)
2721 BTRFS_BLOCK_GROUP_RAID10 | 2894 *length = min_t(u64, em->len - offset, *length);
2722 BTRFS_BLOCK_GROUP_DUP)) { 2895 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2896 BTRFS_BLOCK_GROUP_RAID1 |
2897 BTRFS_BLOCK_GROUP_RAID10 |
2898 BTRFS_BLOCK_GROUP_DUP)) {
2723 /* we limit the length of each bio to what fits in a stripe */ 2899 /* we limit the length of each bio to what fits in a stripe */
2724 *length = min_t(u64, em->len - offset, 2900 *length = min_t(u64, em->len - offset,
2725 map->stripe_len - stripe_offset); 2901 map->stripe_len - stripe_offset);
2726 } else { 2902 } else {
2727 *length = em->len - offset; 2903 *length = em->len - offset;
2728 } 2904 }
2729 2905
2730 if (!multi_ret && !unplug_page) 2906 if (!multi_ret)
2731 goto out; 2907 goto out;
2732 2908
2733 num_stripes = 1; 2909 num_stripes = 1;
2734 stripe_index = 0; 2910 stripe_index = 0;
2735 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2911 stripe_nr_orig = stripe_nr;
2736 if (unplug_page || (rw & REQ_WRITE)) 2912 stripe_nr_end = (offset + *length + map->stripe_len - 1) &
2913 (~(map->stripe_len - 1));
2914 do_div(stripe_nr_end, map->stripe_len);
2915 stripe_end_offset = stripe_nr_end * map->stripe_len -
2916 (offset + *length);
2917 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2918 if (rw & REQ_DISCARD)
2919 num_stripes = min_t(u64, map->num_stripes,
2920 stripe_nr_end - stripe_nr_orig);
2921 stripe_index = do_div(stripe_nr, map->num_stripes);
2922 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2923 if (rw & (REQ_WRITE | REQ_DISCARD))
2737 num_stripes = map->num_stripes; 2924 num_stripes = map->num_stripes;
2738 else if (mirror_num) 2925 else if (mirror_num)
2739 stripe_index = mirror_num - 1; 2926 stripe_index = mirror_num - 1;
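
Aside: discards are no longer clipped to a single stripe, so the mapping code first computes the span of stripes the byte range touches: stripe_nr_end rounds the end of the range up to a stripe boundary (the add-and-mask is a power-of-two ceiling), and stripe_end_offset records the overshoot. The same arithmetic, with plain 64-bit division standing in for do_div():

#include <stdint.h>

struct stripe_range {
        uint64_t nr;            /* first stripe covered by the range */
        uint64_t nr_end;        /* one past the last stripe covered */
        uint64_t end_offset;    /* unused bytes in the final stripe */
};

static struct stripe_range stripe_span(uint64_t offset, uint64_t length,
                                       uint64_t stripe_len)
{
        struct stripe_range r;

        r.nr = offset / stripe_len;
        /* round the end up to a stripe boundary, as the masking does */
        r.nr_end = (offset + length + stripe_len - 1) / stripe_len;
        /* how far the rounded end overshoots the real end of the range */
        r.end_offset = r.nr_end * stripe_len - (offset + length);
        return r;
}
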
@@ -2744,7 +2931,7 @@ again:
2744 } 2931 }
2745 2932
2746 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2933 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2747 if (rw & REQ_WRITE) 2934 if (rw & (REQ_WRITE | REQ_DISCARD))
2748 num_stripes = map->num_stripes; 2935 num_stripes = map->num_stripes;
2749 else if (mirror_num) 2936 else if (mirror_num)
2750 stripe_index = mirror_num - 1; 2937 stripe_index = mirror_num - 1;
@@ -2755,8 +2942,12 @@ again:
2755 stripe_index = do_div(stripe_nr, factor); 2942 stripe_index = do_div(stripe_nr, factor);
2756 stripe_index *= map->sub_stripes; 2943 stripe_index *= map->sub_stripes;
2757 2944
2758 if (unplug_page || (rw & REQ_WRITE)) 2945 if (rw & REQ_WRITE)
2759 num_stripes = map->sub_stripes; 2946 num_stripes = map->sub_stripes;
2947 else if (rw & REQ_DISCARD)
2948 num_stripes = min_t(u64, map->sub_stripes *
2949 (stripe_nr_end - stripe_nr_orig),
2950 map->num_stripes);
2760 else if (mirror_num) 2951 else if (mirror_num)
2761 stripe_index += mirror_num - 1; 2952 stripe_index += mirror_num - 1;
2762 else { 2953 else {
@@ -2774,24 +2965,101 @@ again:
2774 } 2965 }
2775 BUG_ON(stripe_index >= map->num_stripes); 2966 BUG_ON(stripe_index >= map->num_stripes);
2776 2967
2777 for (i = 0; i < num_stripes; i++) { 2968 if (rw & REQ_DISCARD) {
2778 if (unplug_page) { 2969 for (i = 0; i < num_stripes; i++) {
2779 struct btrfs_device *device;
2780 struct backing_dev_info *bdi;
2781
2782 device = map->stripes[stripe_index].dev;
2783 if (device->bdev) {
2784 bdi = blk_get_backing_dev_info(device->bdev);
2785 if (bdi->unplug_io_fn)
2786 bdi->unplug_io_fn(bdi, unplug_page);
2787 }
2788 } else {
2789 multi->stripes[i].physical = 2970 multi->stripes[i].physical =
2790 map->stripes[stripe_index].physical + 2971 map->stripes[stripe_index].physical +
2791 stripe_offset + stripe_nr * map->stripe_len; 2972 stripe_offset + stripe_nr * map->stripe_len;
2792 multi->stripes[i].dev = map->stripes[stripe_index].dev; 2973 multi->stripes[i].dev = map->stripes[stripe_index].dev;
2974
2975 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2976 u64 stripes;
2977 u32 last_stripe = 0;
2978 int j;
2979
2980 div_u64_rem(stripe_nr_end - 1,
2981 map->num_stripes,
2982 &last_stripe);
2983
2984 for (j = 0; j < map->num_stripes; j++) {
2985 u32 test;
2986
2987 div_u64_rem(stripe_nr_end - 1 - j,
2988 map->num_stripes, &test);
2989 if (test == stripe_index)
2990 break;
2991 }
2992 stripes = stripe_nr_end - 1 - j;
2993 do_div(stripes, map->num_stripes);
2994 multi->stripes[i].length = map->stripe_len *
2995 (stripes - stripe_nr + 1);
2996
2997 if (i == 0) {
2998 multi->stripes[i].length -=
2999 stripe_offset;
3000 stripe_offset = 0;
3001 }
3002 if (stripe_index == last_stripe)
3003 multi->stripes[i].length -=
3004 stripe_end_offset;
3005 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3006 u64 stripes;
3007 int j;
3008 int factor = map->num_stripes /
3009 map->sub_stripes;
3010 u32 last_stripe = 0;
3011
3012 div_u64_rem(stripe_nr_end - 1,
3013 factor, &last_stripe);
3014 last_stripe *= map->sub_stripes;
3015
3016 for (j = 0; j < factor; j++) {
3017 u32 test;
3018
3019 div_u64_rem(stripe_nr_end - 1 - j,
3020 factor, &test);
3021
3022 if (test ==
3023 stripe_index / map->sub_stripes)
3024 break;
3025 }
3026 stripes = stripe_nr_end - 1 - j;
3027 do_div(stripes, factor);
3028 multi->stripes[i].length = map->stripe_len *
3029 (stripes - stripe_nr + 1);
3030
3031 if (i < map->sub_stripes) {
3032 multi->stripes[i].length -=
3033 stripe_offset;
3034 if (i == map->sub_stripes - 1)
3035 stripe_offset = 0;
3036 }
3037 if (stripe_index >= last_stripe &&
3038 stripe_index <= (last_stripe +
3039 map->sub_stripes - 1)) {
3040 multi->stripes[i].length -=
3041 stripe_end_offset;
3042 }
3043 } else
3044 multi->stripes[i].length = *length;
3045
3046 stripe_index++;
3047 if (stripe_index == map->num_stripes) {
3048 /* This could only happen for RAID0/10 */
3049 stripe_index = 0;
3050 stripe_nr++;
3051 }
3052 }
3053 } else {
3054 for (i = 0; i < num_stripes; i++) {
3055 multi->stripes[i].physical =
3056 map->stripes[stripe_index].physical +
3057 stripe_offset +
3058 stripe_nr * map->stripe_len;
3059 multi->stripes[i].dev =
3060 map->stripes[stripe_index].dev;
3061 stripe_index++;
2793 } 3062 }
2794 stripe_index++;
2795 } 3063 }
2796 if (multi_ret) { 3064 if (multi_ret) {
2797 *multi_ret = multi; 3065 *multi_ret = multi;
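
Aside: the per-stripe length trimming above uses div_u64_rem() to locate, without iterating, the last stripe each device owns inside the discard range. A naive loop gives the same answer and may be easier to check against; this models the RAID0 case only and is not a substitute for the kernel code:

#include <stdint.h>

/* bytes of a RAID0 discard spanning stripes [nr, nr_end) that land on
 * device d; head_trim/tail_trim are the partial-stripe offsets at the
 * start and end of the range (stripe_offset / stripe_end_offset) */
static uint64_t raid0_dev_len(uint64_t nr, uint64_t nr_end, int num_stripes,
                              int d, uint64_t stripe_len,
                              uint64_t head_trim, uint64_t tail_trim)
{
        uint64_t owned = 0;

        for (uint64_t s = nr; s < nr_end; s++)
                if ((int)(s % num_stripes) == d)
                        owned++;
        if (!owned)
                return 0;

        uint64_t len = owned * stripe_len;
        if ((int)(nr % num_stripes) == d)
                len -= head_trim;       /* range starts mid-stripe here */
        if ((int)((nr_end - 1) % num_stripes) == d)
                len -= tail_trim;       /* range ends mid-stripe here */
        return len;
}
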
@@ -2808,7 +3076,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2808 struct btrfs_multi_bio **multi_ret, int mirror_num) 3076 struct btrfs_multi_bio **multi_ret, int mirror_num)
2809{ 3077{
2810 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3078 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2811 mirror_num, NULL); 3079 mirror_num);
2812} 3080}
2813 3081
2814int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3082int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -2876,14 +3144,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2876 return 0; 3144 return 0;
2877} 3145}
2878 3146
2879int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2880 u64 logical, struct page *page)
2881{
2882 u64 length = PAGE_CACHE_SIZE;
2883 return __btrfs_map_block(map_tree, READ, logical, &length,
2884 NULL, 0, page);
2885}
2886
2887static void end_bio_multi_stripe(struct bio *bio, int err) 3147static void end_bio_multi_stripe(struct bio *bio, int err)
2888{ 3148{
2889 struct btrfs_multi_bio *multi = bio->bi_private; 3149 struct btrfs_multi_bio *multi = bio->bi_private;
@@ -3034,8 +3294,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3034 } 3294 }
3035 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3295 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
3036 dev = multi->stripes[dev_nr].dev; 3296 dev = multi->stripes[dev_nr].dev;
3037 BUG_ON(rw == WRITE && !dev->writeable); 3297 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3038 if (dev && dev->bdev) {
3039 bio->bi_bdev = dev->bdev; 3298 bio->bi_bdev = dev->bdev;
3040 if (async_submit) 3299 if (async_submit)
3041 schedule_bio(root, dev, rw, bio); 3300 schedule_bio(root, dev, rw, bio);
@@ -3084,12 +3343,13 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3084 return NULL; 3343 return NULL;
3085 list_add(&device->dev_list, 3344 list_add(&device->dev_list,
3086 &fs_devices->devices); 3345 &fs_devices->devices);
3087 device->barriers = 1;
3088 device->dev_root = root->fs_info->dev_root; 3346 device->dev_root = root->fs_info->dev_root;
3089 device->devid = devid; 3347 device->devid = devid;
3090 device->work.func = pending_bios_fn; 3348 device->work.func = pending_bios_fn;
3091 device->fs_devices = fs_devices; 3349 device->fs_devices = fs_devices;
3350 device->missing = 1;
3092 fs_devices->num_devices++; 3351 fs_devices->num_devices++;
3352 fs_devices->missing_devices++;
3093 spin_lock_init(&device->io_lock); 3353 spin_lock_init(&device->io_lock);
3094 INIT_LIST_HEAD(&device->dev_alloc_list); 3354 INIT_LIST_HEAD(&device->dev_alloc_list);
3095 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3355 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3126,7 +3386,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3126 free_extent_map(em); 3386 free_extent_map(em);
3127 } 3387 }
3128 3388
3129 em = alloc_extent_map(GFP_NOFS); 3389 em = alloc_extent_map();
3130 if (!em) 3390 if (!em)
3131 return -ENOMEM; 3391 return -ENOMEM;
3132 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3392 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3287,6 +3547,15 @@ static int read_one_dev(struct btrfs_root *root,
3287 device = add_missing_dev(root, devid, dev_uuid); 3547 device = add_missing_dev(root, devid, dev_uuid);
3288 if (!device) 3548 if (!device)
3289 return -ENOMEM; 3549 return -ENOMEM;
3550 } else if (!device->missing) {
3551 /*
3552 * this happens when a device that was properly setup
3553 * in the device info lists suddenly goes bad.
3554 * device->bdev is NULL, and so we have to set
3555 * device->missing to one here
3556 */
3557 root->fs_info->fs_devices->missing_devices++;
3558 device->missing = 1;
3290 } 3559 }
3291 } 3560 }
3292 3561
@@ -3306,15 +3575,6 @@ static int read_one_dev(struct btrfs_root *root,
3306 return ret; 3575 return ret;
3307} 3576}
3308 3577
3309int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3310{
3311 struct btrfs_dev_item *dev_item;
3312
3313 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3314 dev_item);
3315 return read_one_dev(root, buf, dev_item);
3316}
3317
3318int btrfs_read_sys_array(struct btrfs_root *root) 3578int btrfs_read_sys_array(struct btrfs_root *root)
3319{ 3579{
3320 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3580 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
@@ -3431,7 +3691,7 @@ again:
3431 } 3691 }
3432 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3692 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3433 key.objectid = 0; 3693 key.objectid = 0;
3434 btrfs_release_path(root, path); 3694 btrfs_release_path(path);
3435 goto again; 3695 goto again;
3436 } 3696 }
3437 ret = 0; 3697 ret = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2ea..7c12d61ae7ae 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
20#define __BTRFS_VOLUMES_ 20#define __BTRFS_VOLUMES_
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h>
23#include "async-thread.h" 24#include "async-thread.h"
24 25
26#define BTRFS_STRIPE_LEN (64 * 1024)
27
25struct buffer_head; 28struct buffer_head;
26struct btrfs_pending_bios { 29struct btrfs_pending_bios {
27 struct bio *head; 30 struct bio *head;
@@ -42,15 +45,15 @@ struct btrfs_device {
42 int running_pending; 45 int running_pending;
43 u64 generation; 46 u64 generation;
44 47
45 int barriers;
46 int writeable; 48 int writeable;
47 int in_fs_metadata; 49 int in_fs_metadata;
50 int missing;
48 51
49 spinlock_t io_lock; 52 spinlock_t io_lock;
50 53
51 struct block_device *bdev; 54 struct block_device *bdev;
52 55
53 /* the mode sent to open_bdev_exclusive */ 56 /* the mode sent to blkdev_get */
54 fmode_t mode; 57 fmode_t mode;
55 58
56 char *name; 59 char *name;
@@ -82,7 +85,12 @@ struct btrfs_device {
82 /* physical drive uuid (or lvm uuid) */ 85 /* physical drive uuid (or lvm uuid) */
83 u8 uuid[BTRFS_UUID_SIZE]; 86 u8 uuid[BTRFS_UUID_SIZE];
84 87
88 /* per-device scrub information */
89 struct scrub_dev *scrub_device;
90
85 struct btrfs_work work; 91 struct btrfs_work work;
92 struct rcu_head rcu;
93 struct work_struct rcu_work;
86}; 94};
87 95
88struct btrfs_fs_devices { 96struct btrfs_fs_devices {
@@ -94,6 +102,7 @@ struct btrfs_fs_devices {
94 u64 num_devices; 102 u64 num_devices;
95 u64 open_devices; 103 u64 open_devices;
96 u64 rw_devices; 104 u64 rw_devices;
105 u64 missing_devices;
97 u64 total_rw_bytes; 106 u64 total_rw_bytes;
98 struct block_device *latest_bdev; 107 struct block_device *latest_bdev;
99 108
@@ -122,6 +131,7 @@ struct btrfs_fs_devices {
122struct btrfs_bio_stripe { 131struct btrfs_bio_stripe {
123 struct btrfs_device *dev; 132 struct btrfs_device *dev;
124 u64 physical; 133 u64 physical;
134 u64 length; /* only used for discard mappings */
125}; 135};
126 136
127struct btrfs_multi_bio { 137struct btrfs_multi_bio {
@@ -135,6 +145,30 @@ struct btrfs_multi_bio {
135 struct btrfs_bio_stripe stripes[]; 145 struct btrfs_bio_stripe stripes[];
136}; 146};
137 147
148struct btrfs_device_info {
149 struct btrfs_device *dev;
150 u64 dev_offset;
151 u64 max_avail;
152 u64 total_avail;
153};
154
155struct map_lookup {
156 u64 type;
157 int io_align;
158 int io_width;
159 int stripe_len;
160 int sector_size;
161 int num_stripes;
162 int sub_stripes;
163 struct btrfs_bio_stripe stripes[];
164};
165
166#define map_lookup_size(n) (sizeof(struct map_lookup) + \
167 (sizeof(struct btrfs_bio_stripe) * (n)))
168
169int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
170 u64 end, u64 *length);
171
138#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 172#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
139 (sizeof(struct btrfs_bio_stripe) * (n))) 173 (sizeof(struct btrfs_bio_stripe) * (n)))
140 174
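
Aside: map_lookup moves into the header alongside map_lookup_size(), the usual flexible-array sizing macro: header plus n trailing btrfs_bio_stripe entries in one allocation. The pattern in miniature, with a trimmed stand-in struct and userspace malloc() in place of kmalloc():

#include <stdlib.h>

struct stripe { unsigned long long physical; };

struct lookup {
        int num_stripes;
        struct stripe stripes[];        /* flexible array member */
};

#define LOOKUP_SIZE(n) (sizeof(struct lookup) + sizeof(struct stripe) * (n))

static struct lookup *alloc_lookup(int n)
{
        struct lookup *m = malloc(LOOKUP_SIZE(n));

        if (m)
                m->num_stripes = n;     /* header and array in one block */
        return m;
}
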
@@ -156,7 +190,6 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
156void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); 190void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
157int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 191int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
158 int mirror_num, int async_submit); 192 int mirror_num, int async_submit);
159int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
160int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 193int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
161 fmode_t flags, void *holder); 194 fmode_t flags, void *holder);
162int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 195int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
@@ -169,8 +202,6 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
169int btrfs_rm_device(struct btrfs_root *root, char *device_path); 202int btrfs_rm_device(struct btrfs_root *root, char *device_path);
170int btrfs_cleanup_fs_uuids(void); 203int btrfs_cleanup_fs_uuids(void);
171int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); 204int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
172int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
173 u64 logical, struct page *page);
174int btrfs_grow_device(struct btrfs_trans_handle *trans, 205int btrfs_grow_device(struct btrfs_trans_handle *trans,
175 struct btrfs_device *device, u64 new_size); 206 struct btrfs_device *device, u64 new_size);
176struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 207struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
@@ -178,8 +209,6 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
178int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 209int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
179int btrfs_init_new_device(struct btrfs_root *root, char *path); 210int btrfs_init_new_device(struct btrfs_root *root, char *path);
180int btrfs_balance(struct btrfs_root *dev_root); 211int btrfs_balance(struct btrfs_root *dev_root);
181void btrfs_unlock_volumes(void);
182void btrfs_lock_volumes(void);
183int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 212int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
184int find_free_dev_extent(struct btrfs_trans_handle *trans, 213int find_free_dev_extent(struct btrfs_trans_handle *trans,
185 struct btrfs_device *device, u64 num_bytes, 214 struct btrfs_device *device, u64 num_bytes,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 88ecbb215878..5366fe452ab0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -44,7 +44,7 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
44 return -ENOMEM; 44 return -ENOMEM;
45 45
46 /* lookup the xattr by name */ 46 /* lookup the xattr by name */
47 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, 47 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name,
48 strlen(name), 0); 48 strlen(name), 0);
49 if (!di) { 49 if (!di) {
50 ret = -ENODATA; 50 ret = -ENODATA;
@@ -103,7 +103,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
103 return -ENOMEM; 103 return -ENOMEM;
104 104
105 /* first lets see if we already have this xattr */ 105 /* first lets see if we already have this xattr */
106 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, 106 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
107 strlen(name), -1); 107 strlen(name), -1);
108 if (IS_ERR(di)) { 108 if (IS_ERR(di)) {
109 ret = PTR_ERR(di); 109 ret = PTR_ERR(di);
@@ -120,13 +120,13 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
120 120
121 ret = btrfs_delete_one_dir_name(trans, root, path, di); 121 ret = btrfs_delete_one_dir_name(trans, root, path, di);
122 BUG_ON(ret); 122 BUG_ON(ret);
123 btrfs_release_path(root, path); 123 btrfs_release_path(path);
124 124
125 /* if we don't have a value then we are removing the xattr */ 125 /* if we don't have a value then we are removing the xattr */
126 if (!value) 126 if (!value)
127 goto out; 127 goto out;
128 } else { 128 } else {
129 btrfs_release_path(root, path); 129 btrfs_release_path(path);
130 130
131 if (flags & XATTR_REPLACE) { 131 if (flags & XATTR_REPLACE) {
132 /* we couldn't find the attr to replace */ 132 /* we couldn't find the attr to replace */
@@ -136,7 +136,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
136 } 136 }
137 137
138 /* ok we have to create a completely new xattr */ 138 /* ok we have to create a completely new xattr */
139 ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino, 139 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
140 name, name_len, value, size); 140 name, name_len, value, size);
141 BUG_ON(ret); 141 BUG_ON(ret);
142out: 142out:
@@ -158,8 +158,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
158 if (IS_ERR(trans)) 158 if (IS_ERR(trans))
159 return PTR_ERR(trans); 159 return PTR_ERR(trans);
160 160
161 btrfs_set_trans_block_group(trans, inode);
162
163 ret = do_setxattr(trans, inode, name, value, size, flags); 161 ret = do_setxattr(trans, inode, name, value, size, flags);
164 if (ret) 162 if (ret)
165 goto out; 163 goto out;
@@ -178,21 +176,19 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
178 struct inode *inode = dentry->d_inode; 176 struct inode *inode = dentry->d_inode;
179 struct btrfs_root *root = BTRFS_I(inode)->root; 177 struct btrfs_root *root = BTRFS_I(inode)->root;
180 struct btrfs_path *path; 178 struct btrfs_path *path;
181 struct btrfs_item *item;
182 struct extent_buffer *leaf; 179 struct extent_buffer *leaf;
183 struct btrfs_dir_item *di; 180 struct btrfs_dir_item *di;
184 int ret = 0, slot, advance; 181 int ret = 0, slot;
185 size_t total_size = 0, size_left = size; 182 size_t total_size = 0, size_left = size;
186 unsigned long name_ptr; 183 unsigned long name_ptr;
187 size_t name_len; 184 size_t name_len;
188 u32 nritems;
189 185
190 /* 186 /*
191 * ok we want all objects associated with this id. 187 * ok we want all objects associated with this id.
192 * NOTE: we set key.offset = 0; because we want to start with the 188 * NOTE: we set key.offset = 0; because we want to start with the
193 * first xattr that we find and walk forward 189 * first xattr that we find and walk forward
194 */ 190 */
195 key.objectid = inode->i_ino; 191 key.objectid = btrfs_ino(inode);
196 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 192 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
197 key.offset = 0; 193 key.offset = 0;
198 194
@@ -205,36 +201,25 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
205 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 201 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
206 if (ret < 0) 202 if (ret < 0)
207 goto err; 203 goto err;
208 advance = 0; 204
209 while (1) { 205 while (1) {
210 leaf = path->nodes[0]; 206 leaf = path->nodes[0];
211 nritems = btrfs_header_nritems(leaf);
212 slot = path->slots[0]; 207 slot = path->slots[0];
213 208
214 /* this is where we start walking through the path */ 209 /* this is where we start walking through the path */
215 if (advance || slot >= nritems) { 210 if (slot >= btrfs_header_nritems(leaf)) {
216 /* 211 /*
217 * if we've reached the last slot in this leaf we need 212 * if we've reached the last slot in this leaf we need
218 * to go to the next leaf and reset everything 213 * to go to the next leaf and reset everything
219 */ 214 */
220 if (slot >= nritems-1) { 215 ret = btrfs_next_leaf(root, path);
221 ret = btrfs_next_leaf(root, path); 216 if (ret < 0)
222 if (ret) 217 goto err;
223 break; 218 else if (ret > 0)
224 leaf = path->nodes[0]; 219 break;
225 nritems = btrfs_header_nritems(leaf); 220 continue;
226 slot = path->slots[0];
227 } else {
228 /*
229 * just walking through the slots on this leaf
230 */
231 slot++;
232 path->slots[0]++;
233 }
234 } 221 }
235 advance = 1;
236 222
237 item = btrfs_item_nr(leaf, slot);
238 btrfs_item_key_to_cpu(leaf, &found_key, slot); 223 btrfs_item_key_to_cpu(leaf, &found_key, slot);
239 224
240 /* check to make sure this item is what we want */ 225 /* check to make sure this item is what we want */
@@ -244,13 +229,15 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
244 break; 229 break;
245 230
246 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 231 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
232 if (verify_dir_item(root, leaf, di))
233 continue;
247 234
248 name_len = btrfs_dir_name_len(leaf, di); 235 name_len = btrfs_dir_name_len(leaf, di);
249 total_size += name_len + 1; 236 total_size += name_len + 1;
250 237
251 /* we are just looking for how big our buffer needs to be */ 238 /* we are just looking for how big our buffer needs to be */
252 if (!size) 239 if (!size)
253 continue; 240 goto next;
254 241
255 if (!buffer || (name_len + 1) > size_left) { 242 if (!buffer || (name_len + 1) > size_left) {
256 ret = -ERANGE; 243 ret = -ERANGE;
@@ -263,6 +250,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
263 250
264 size_left -= name_len + 1; 251 size_left -= name_len + 1;
265 buffer += name_len + 1; 252 buffer += name_len + 1;
253next:
254 path->slots[0]++;
266 } 255 }
267 ret = total_size; 256 ret = total_size;
268 257
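
Aside: the rewritten listxattr loop drops the advance/nritems bookkeeping in favor of the standard btrfs cursor idiom: consume the current slot, bump path->slots[0] at the 'next:' label, and call btrfs_next_leaf() only when the slot runs off the leaf. A toy, self-contained model of that control flow (the two-leaf array is invented for illustration):

#include <stdio.h>

/* toy model: two "leaves" of items; next_leaf() advances to the next one */
static int leaves[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };
static int cur_leaf, slot, nritems = 3;

static int next_leaf(void)
{
        if (++cur_leaf >= 2)
                return 1;               /* >0 means: no more leaves */
        slot = 0;
        return 0;
}

int main(void)
{
        while (1) {
                if (slot >= nritems) {
                        int ret = next_leaf();
                        if (ret > 0)
                                break;  /* walked past the last leaf */
                        continue;       /* re-check the fresh leaf */
                }
                printf("%d\n", leaves[cur_leaf][slot]);
                slot++;                 /* the 'next:' step */
        }
        return 0;
}
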
@@ -318,6 +307,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
318int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, 307int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
319 size_t size, int flags) 308 size_t size, int flags)
320{ 309{
310 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
311
312 /*
313 * The permission on security.* and system.* is not checked
314 * in permission().
315 */
316 if (btrfs_root_readonly(root))
317 return -EROFS;
318
321 /* 319 /*
322 * If this is a request for a synthetic attribute in the system.* 320 * If this is a request for a synthetic attribute in the system.*
323 * namespace use the generic infrastructure to resolve a handler 321 * namespace use the generic infrastructure to resolve a handler
@@ -338,6 +336,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
338 336
339int btrfs_removexattr(struct dentry *dentry, const char *name) 337int btrfs_removexattr(struct dentry *dentry, const char *name)
340{ 338{
339 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
340
341 /*
342 * The permission on security.* and system.* is not checked
343 * in permission().
344 */
345 if (btrfs_root_readonly(root))
346 return -EROFS;
347
341 /* 348 /*
342 * If this is a request for a synthetic attribute in the system.* 349 * If this is a request for a synthetic attribute in the system.*
343 * namespace use the generic infrastructure to resolve a handler 350 * namespace use the generic infrastructure to resolve a handler
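
Aside: both xattr write paths now carry the same guard, because security.* and system.* writes bypass permission() and a read-only root must be rejected explicitly with -EROFS before any handler runs. Schematically (a condensed paraphrase of the two hunks above, not a literal kernel function):

static int xattr_write_allowed(struct dentry *dentry)
{
        struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;

        if (btrfs_root_readonly(root))
                return -EROFS;  /* refuse xattr writes on a read-only root */
        /* ... fall through to the namespace-specific handlers ... */
        return 0;
}
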
@@ -354,7 +361,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
354} 361}
355 362
356int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 363int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
357 struct inode *inode, struct inode *dir) 364 struct inode *inode, struct inode *dir,
365 const struct qstr *qstr)
358{ 366{
359 int err; 367 int err;
360 size_t len; 368 size_t len;
@@ -362,7 +370,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
362 char *suffix; 370 char *suffix;
363 char *name; 371 char *name;
364 372
365 err = security_inode_init_security(inode, dir, &suffix, &value, &len); 373 err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
374 &len);
366 if (err) { 375 if (err) {
367 if (err == -EOPNOTSUPP) 376 if (err == -EOPNOTSUPP)
368 return 0; 377 return 0;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 7a43fd640bbb..b3cc8039134b 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
40 struct inode *inode, struct inode *dir); 40 struct inode *inode, struct inode *dir,
41 const struct qstr *qstr);
41 42
42#endif /* __XATTR__ */ 43#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 3e2b90eaa239..faccd47c6c46 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
32#include <linux/bio.h> 32#include <linux/bio.h>
33#include "compression.h" 33#include "compression.h"
34 34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace { 35struct workspace {
45 z_stream inf_strm; 36 z_stream inf_strm;
46 z_stream def_strm; 37 z_stream def_strm;
@@ -48,169 +39,63 @@ struct workspace {
48 struct list_head list; 39 struct list_head list;
49}; 40};
50 41
51static LIST_HEAD(idle_workspace); 42static void zlib_free_workspace(struct list_head *ws)
52static DEFINE_SPINLOCK(workspace_lock);
53static unsigned long num_workspace;
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56
57/*
58 * this finds an available zlib workspace or allocates a new one
59 * NULL or an ERR_PTR is returned if things go bad.
60 */
61static struct workspace *find_zlib_workspace(void)
62{ 43{
63 struct workspace *workspace; 44 struct workspace *workspace = list_entry(ws, struct workspace, list);
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) {
90 ret = -ENOMEM;
91 goto fail;
92 }
93
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) {
106 ret = -ENOMEM;
107 goto fail_kmalloc;
108 }
109 return workspace;
110 45
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace); 46 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace); 47 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf); 48 kfree(workspace->buf);
141 kfree(workspace); 49 kfree(workspace);
142
143 atomic_dec(&alloc_workspace);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147} 50}
148 51
149/* 52static struct list_head *zlib_alloc_workspace(void)
150 * cleanup function for module exit
151 */
152static void free_workspaces(void)
153{ 53{
154 struct workspace *workspace; 54 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) { 55
156 workspace = list_entry(idle_workspace.next, struct workspace, 56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
157 list); 57 if (!workspace)
158 list_del(&workspace->list); 58 return ERR_PTR(-ENOMEM);
159 vfree(workspace->def_strm.workspace); 59
160 vfree(workspace->inf_strm.workspace); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
161 kfree(workspace->buf); 61 MAX_WBITS, MAX_MEM_LEVEL));
162 kfree(workspace); 62 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
163 atomic_dec(&alloc_workspace); 63 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
164 } 64 if (!workspace->def_strm.workspace ||
65 !workspace->inf_strm.workspace || !workspace->buf)
66 goto fail;
67
68 INIT_LIST_HEAD(&workspace->list);
69
70 return &workspace->list;
71fail:
72 zlib_free_workspace(&workspace->list);
73 return ERR_PTR(-ENOMEM);
165} 74}
166 75
167/* 76static int zlib_compress_pages(struct list_head *ws,
168 * given an address space and start/len, compress the bytes. 77 struct address_space *mapping,
169 * 78 u64 start, unsigned long len,
170 * pages are allocated to hold the compressed result and stored 79 struct page **pages,
171 * in 'pages' 80 unsigned long nr_dest_pages,
172 * 81 unsigned long *out_pages,
173 * out_pages is used to return the number of pages allocated. There 82 unsigned long *total_in,
174 * may be pages allocated even if we return an error 83 unsigned long *total_out,
175 * 84 unsigned long max_out)
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller then len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{ 85{
86 struct workspace *workspace = list_entry(ws, struct workspace, list);
195 int ret; 87 int ret;
196 struct workspace *workspace;
197 char *data_in; 88 char *data_in;
198 char *cpage_out; 89 char *cpage_out;
199 int nr_pages = 0; 90 int nr_pages = 0;
200 struct page *in_page = NULL; 91 struct page *in_page = NULL;
201 struct page *out_page = NULL; 92 struct page *out_page = NULL;
202 int out_written = 0;
203 int in_read = 0;
204 unsigned long bytes_left; 93 unsigned long bytes_left;
205 94
206 *out_pages = 0; 95 *out_pages = 0;
207 *total_out = 0; 96 *total_out = 0;
208 *total_in = 0; 97 *total_in = 0;
209 98
210 workspace = find_zlib_workspace();
211 if (IS_ERR(workspace))
212 return -1;
213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
215 printk(KERN_WARNING "deflateInit failed\n"); 100 printk(KERN_WARNING "deflateInit failed\n");
216 ret = -1; 101 ret = -1;
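
Aside: zlib_alloc_workspace() gets away with one combined NULL check and a single fail: path because zlib_free_workspace() tolerates a half-initialized workspace, vfree() and kfree() both being no-ops on NULL. The same pattern as a userspace sketch, where free(NULL) behaves identically:

#include <stdlib.h>

struct ws { void *a, *b, *c; };

static void ws_free(struct ws *w)
{
        if (!w)
                return;
        free(w->a);             /* free(NULL) is a no-op, like kfree/vfree */
        free(w->b);
        free(w->c);
        free(w);
}

static struct ws *ws_alloc(void)
{
        struct ws *w = calloc(1, sizeof(*w));

        if (!w)
                return NULL;
        w->a = malloc(64);
        w->b = malloc(64);
        w->c = malloc(64);
        if (!w->a || !w->b || !w->c)
                goto fail;      /* one check, one unwind path */
        return w;

fail:
        ws_free(w);
        return NULL;
}

int main(void)
{
        struct ws *w = ws_alloc();

        ws_free(w);
        return 0;
}
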
@@ -224,6 +109,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
224 data_in = kmap(in_page); 109 data_in = kmap(in_page);
225 110
226 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 111 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
112 if (out_page == NULL) {
113 ret = -1;
114 goto out;
115 }
227 cpage_out = kmap(out_page); 116 cpage_out = kmap(out_page);
228 pages[0] = out_page; 117 pages[0] = out_page;
229 nr_pages = 1; 118 nr_pages = 1;
@@ -233,9 +122,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
233 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 122 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
234 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); 123 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
235 124
236 out_written = 0;
237 in_read = 0;
238
239 while (workspace->def_strm.total_in < len) { 125 while (workspace->def_strm.total_in < len) {
240 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
241 if (ret != Z_OK) { 127 if (ret != Z_OK) {
@@ -265,6 +151,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
265 goto out; 151 goto out;
266 } 152 }
267 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 153 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
154 if (out_page == NULL) {
155 ret = -1;
156 goto out;
157 }
268 cpage_out = kmap(out_page); 158 cpage_out = kmap(out_page);
269 pages[nr_pages] = out_page; 159 pages[nr_pages] = out_page;
270 nr_pages++; 160 nr_pages++;
@@ -319,55 +209,26 @@ out:
319 kunmap(in_page); 209 kunmap(in_page);
320 page_cache_release(in_page); 210 page_cache_release(in_page);
321 } 211 }
322 free_workspace(workspace);
323 return ret; 212 return ret;
324} 213}
325 214
326/* 215static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
327 * pages_in is an array of pages with compressed data. 216 u64 disk_start,
328 * 217 struct bio_vec *bvec,
329 * disk_start is the starting logical offset of this array in the file 218 int vcnt,
330 * 219 size_t srclen)
331 * bvec is a bio_vec of pages from the file that we want to decompress into
332 *
333 * vcnt is the count of pages in the biovec
334 *
335 * srclen is the number of bytes in pages_in
336 *
337 * The basic idea is that we have a bio that was created by readpages.
338 * The pages in the bio are for the uncompressed data, and they may not
339 * be contiguous. They all correspond to the range of bytes covered by
340 * the compressed extent.
341 */
342int btrfs_zlib_decompress_biovec(struct page **pages_in,
343 u64 disk_start,
344 struct bio_vec *bvec,
345 int vcnt,
346 size_t srclen)
347{ 220{
348 int ret = 0; 221 struct workspace *workspace = list_entry(ws, struct workspace, list);
222 int ret = 0, ret2;
349 int wbits = MAX_WBITS; 223 int wbits = MAX_WBITS;
350 struct workspace *workspace;
351 char *data_in; 224 char *data_in;
352 size_t total_out = 0; 225 size_t total_out = 0;
353 unsigned long page_bytes_left;
354 unsigned long page_in_index = 0; 226 unsigned long page_in_index = 0;
355 unsigned long page_out_index = 0; 227 unsigned long page_out_index = 0;
356 struct page *page_out;
357 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 228 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
358 PAGE_CACHE_SIZE; 229 PAGE_CACHE_SIZE;
359 unsigned long buf_start; 230 unsigned long buf_start;
360 unsigned long buf_offset;
361 unsigned long bytes;
362 unsigned long working_bytes;
363 unsigned long pg_offset; 231 unsigned long pg_offset;
364 unsigned long start_byte;
365 unsigned long current_buf_start;
366 char *kaddr;
367
368 workspace = find_zlib_workspace();
369 if (IS_ERR(workspace))
370 return -ENOMEM;
371 232
372 data_in = kmap(pages_in[page_in_index]); 233 data_in = kmap(pages_in[page_in_index]);
373 workspace->inf_strm.next_in = data_in; 234 workspace->inf_strm.next_in = data_in;
@@ -377,8 +238,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
377 workspace->inf_strm.total_out = 0; 238 workspace->inf_strm.total_out = 0;
378 workspace->inf_strm.next_out = workspace->buf; 239 workspace->inf_strm.next_out = workspace->buf;
379 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 240 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
380 page_out = bvec[page_out_index].bv_page;
381 page_bytes_left = PAGE_CACHE_SIZE;
382 pg_offset = 0; 241 pg_offset = 0;
383 242
384 /* If it's deflate, and it's got no preset dictionary, then 243 /* If it's deflate, and it's got no preset dictionary, then
@@ -394,107 +253,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
394 253
395 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
396 printk(KERN_WARNING "inflateInit failed\n"); 255 printk(KERN_WARNING "inflateInit failed\n");
397 ret = -1; 256 return -1;
398 goto out;
399 } 257 }
400 while (workspace->inf_strm.total_in < srclen) { 258 while (workspace->inf_strm.total_in < srclen) {
401 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 259 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
402 if (ret != Z_OK && ret != Z_STREAM_END) 260 if (ret != Z_OK && ret != Z_STREAM_END)
403 break; 261 break;
404 /*
405 * buf start is the byte offset we're of the start of
406 * our workspace buffer
407 */
408 buf_start = total_out;
409 262
410 /* total_out is the last byte of the workspace buffer */ 263 buf_start = total_out;
411 total_out = workspace->inf_strm.total_out; 264 total_out = workspace->inf_strm.total_out;
412 265
413 working_bytes = total_out - buf_start; 266 /* we didn't make progress in this inflate call, we're done */
414 267 if (buf_start == total_out)
415 /*
416 * start byte is the first byte of the page we're currently
417 * copying into relative to the start of the compressed data.
418 */
419 start_byte = page_offset(page_out) - disk_start;
420
421 if (working_bytes == 0) {
422 /* we didn't make progress in this inflate
423 * call, we're done
424 */
425 if (ret != Z_STREAM_END)
426 ret = -1;
427 break; 268 break;
428 }
429 269
430 /* we haven't yet hit data corresponding to this page */ 270 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
431 if (total_out <= start_byte) 271 total_out, disk_start,
432 goto next; 272 bvec, vcnt,
433 273 &page_out_index, &pg_offset);
434 /* 274 if (ret2 == 0) {
435 * the start of the data we care about is offset into 275 ret = 0;
436 * the middle of our working buffer 276 goto done;
437 */
438 if (total_out > start_byte && buf_start < start_byte) {
439 buf_offset = start_byte - buf_start;
440 working_bytes -= buf_offset;
441 } else {
442 buf_offset = 0;
443 }
444 current_buf_start = buf_start;
445
446 /* copy bytes from the working buffer into the pages */
447 while (working_bytes > 0) {
448 bytes = min(PAGE_CACHE_SIZE - pg_offset,
449 PAGE_CACHE_SIZE - buf_offset);
450 bytes = min(bytes, working_bytes);
451 kaddr = kmap_atomic(page_out, KM_USER0);
452 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
453 bytes);
454 kunmap_atomic(kaddr, KM_USER0);
455 flush_dcache_page(page_out);
456
457 pg_offset += bytes;
458 page_bytes_left -= bytes;
459 buf_offset += bytes;
460 working_bytes -= bytes;
461 current_buf_start += bytes;
462
463 /* check if we need to pick another page */
464 if (page_bytes_left == 0) {
465 page_out_index++;
466 if (page_out_index >= vcnt) {
467 ret = 0;
468 goto done;
469 }
470
471 page_out = bvec[page_out_index].bv_page;
472 pg_offset = 0;
473 page_bytes_left = PAGE_CACHE_SIZE;
474 start_byte = page_offset(page_out) - disk_start;
475
476 /*
477 * make sure our new page is covered by this
478 * working buffer
479 */
480 if (total_out <= start_byte)
481 goto next;
482
483 /* the next page in the biovec might not
484 * be adjacent to the last page, but it
485 * might still be found inside this working
486 * buffer. bump our offset pointer
487 */
488 if (total_out > start_byte &&
489 current_buf_start < start_byte) {
490 buf_offset = start_byte - buf_start;
491 working_bytes = total_out - start_byte;
492 current_buf_start = buf_start +
493 buf_offset;
494 }
495 }
496 } 277 }
497next: 278
498 workspace->inf_strm.next_out = workspace->buf; 279 workspace->inf_strm.next_out = workspace->buf;
499 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 280 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
500 281
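
Aside: the hand-rolled copy loop is gone; btrfs_decompress_buf2page() now maps each inflated window [buf_start, total_out) onto the destination pages so lzo.c can share it. What the helper has to do, modeled against one flat destination buffer instead of a bio_vec (an illustration of the overlap math only; the real helper also advances the page and offset cursors):

#include <string.h>

/*
 * Copy the freshly inflated window [buf_start, total_out) of the file
 * into a destination covering [dest_start, dest_start + dest_len).
 * Returns 0 once the destination is full, 1 if more input is needed.
 */
static int buf_to_dest(const char *buf, unsigned long buf_start,
                       unsigned long total_out, unsigned long dest_start,
                       char *dest, unsigned long dest_len)
{
        unsigned long lo, hi;

        if (total_out <= dest_start)
                return 1;               /* window is entirely before dest */
        lo = buf_start > dest_start ? buf_start : dest_start;
        hi = total_out < dest_start + dest_len ?
             total_out : dest_start + dest_len;
        if (lo < hi)
                memcpy(dest + (lo - dest_start),
                       buf + (lo - buf_start), hi - lo);
        return hi == dest_start + dest_len ? 0 : 1;
}

int main(void)
{
        char dest[8] = { 0 };

        /* inflate produced file bytes 4..12; dest covers bytes 0..8 */
        buf_to_dest("ABCDEFGH", 4, 12, 0, dest, sizeof(dest));
        return 0;
}
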
@@ -521,35 +302,21 @@ done:
521 zlib_inflateEnd(&workspace->inf_strm); 302 zlib_inflateEnd(&workspace->inf_strm);
522 if (data_in) 303 if (data_in)
523 kunmap(pages_in[page_in_index]); 304 kunmap(pages_in[page_in_index]);
524out:
525 free_workspace(workspace);
526 return ret; 305 return ret;
527} 306}
528 307
529/* 308static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
530 * a less complex decompression routine. Our compressed data fits in a 309 struct page *dest_page,
531 * single page, and we want to read a single page out of it. 310 unsigned long start_byte,
532 * start_byte tells us the offset into the compressed data we're interested in 311 size_t srclen, size_t destlen)
533 */
534int btrfs_zlib_decompress(unsigned char *data_in,
535 struct page *dest_page,
536 unsigned long start_byte,
537 size_t srclen, size_t destlen)
538{ 312{
313 struct workspace *workspace = list_entry(ws, struct workspace, list);
539 int ret = 0; 314 int ret = 0;
540 int wbits = MAX_WBITS; 315 int wbits = MAX_WBITS;
541 struct workspace *workspace;
542 unsigned long bytes_left = destlen; 316 unsigned long bytes_left = destlen;
543 unsigned long total_out = 0; 317 unsigned long total_out = 0;
544 char *kaddr; 318 char *kaddr;
545 319
546 if (destlen > PAGE_CACHE_SIZE)
547 return -ENOMEM;
548
549 workspace = find_zlib_workspace();
550 if (IS_ERR(workspace))
551 return -ENOMEM;
552
553 workspace->inf_strm.next_in = data_in; 320 workspace->inf_strm.next_in = data_in;
554 workspace->inf_strm.avail_in = srclen; 321 workspace->inf_strm.avail_in = srclen;
555 workspace->inf_strm.total_in = 0; 322 workspace->inf_strm.total_in = 0;
@@ -570,8 +337,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
570 337
571 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
572 printk(KERN_WARNING "inflateInit failed\n"); 339 printk(KERN_WARNING "inflateInit failed\n");
573 ret = -1; 340 return -1;
574 goto out;
575 } 341 }
576 342
577 while (bytes_left > 0) { 343 while (bytes_left > 0) {
@@ -621,12 +387,13 @@ next:
621 ret = 0; 387 ret = 0;
622 388
623 zlib_inflateEnd(&workspace->inf_strm); 389 zlib_inflateEnd(&workspace->inf_strm);
624out:
625 free_workspace(workspace);
626 return ret; 390 return ret;
627} 391}
628 392
629void btrfs_zlib_exit(void) 393struct btrfs_compress_op btrfs_zlib_compress = {
630{ 394 .alloc_workspace = zlib_alloc_workspace,
631 free_workspaces(); 395 .free_workspace = zlib_free_workspace,
632} 396 .compress_pages = zlib_compress_pages,
397 .decompress_biovec = zlib_decompress_biovec,
398 .decompress = zlib_decompress,
399};
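
Aside: with the per-file workspace cache gone, compression.c owns workspace pooling and reaches zlib only through this ops table. A schematic of the dispatch pattern it enables; the caller below is invented for illustration and is not btrfs code:

#include <linux/err.h>
#include <linux/list.h>

struct compress_op {
        struct list_head *(*alloc_workspace)(void);
        void (*free_workspace)(struct list_head *ws);
};

/* run one operation with a freshly allocated workspace */
static int with_workspace(const struct compress_op *op,
                          int (*fn)(struct list_head *ws))
{
        struct list_head *ws = op->alloc_workspace();
        int ret;

        if (IS_ERR(ws))
                return PTR_ERR(ws);
        ret = fn(ws);           /* compress/decompress using this workspace */
        op->free_workspace(ws);
        return ret;
}
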