aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2011-11-06 23:03:41 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2011-11-06 23:03:41 -0500
commit6a6662ced4153f6dbcfc40d7225c3cc45416039c (patch)
tree77ad5d577333f02cd854e44827a407dd0388d4eb /fs/btrfs
parent32aaeffbd4a7457bf2f7448b33b5946ff2a960eb (diff)
parent7c7e82a77fe3d89ae50824aa7c897454675eb4c4 (diff)
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mason/linux-btrfs: (114 commits) Btrfs: check for a null fs root when writing to the backup root log Btrfs: fix race during transaction joins Btrfs: fix a potential btrfs_bio leak on scrub fixups Btrfs: rename btrfs_bio multi -> bbio for consistency Btrfs: stop leaking btrfs_bios on readahead Btrfs: stop the readahead threads on failed mount Btrfs: fix extent_buffer leak in the metadata IO error handling Btrfs: fix the new inspection ioctls for 32 bit compat Btrfs: fix delayed insertion reservation Btrfs: ClearPageError during writepage and clean_tree_block Btrfs: be smarter about committing the transaction in reserve_metadata_bytes Btrfs: make a delayed_block_rsv for the delayed item insertion Btrfs: add a log of past tree roots btrfs: separate superblock items out of fs_info Btrfs: use the global reserve when truncating the free space cache inode Btrfs: release metadata from global reserve if we have to fallback for unlink Btrfs: make sure to flush queued bios if write_cache_pages waits Btrfs: fix extent pinning bugs in the tree log Btrfs: make sure btrfs_remove_free_space doesn't leak EAGAIN Btrfs: don't wait as long for more batches during SSD log commit ...
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/acl.c17
-rw-r--r--fs/btrfs/backref.c776
-rw-r--r--fs/btrfs/backref.h62
-rw-r--r--fs/btrfs/btrfs_inode.h17
-rw-r--r--fs/btrfs/compression.c3
-rw-r--r--fs/btrfs/ctree.c10
-rw-r--r--fs/btrfs/ctree.h198
-rw-r--r--fs/btrfs/delayed-inode.c50
-rw-r--r--fs/btrfs/disk-io.c441
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c848
-rw-r--r--fs/btrfs/extent_io.c614
-rw-r--r--fs/btrfs/extent_io.h23
-rw-r--r--fs/btrfs/file-item.c17
-rw-r--r--fs/btrfs/file.c25
-rw-r--r--fs/btrfs/free-space-cache.c926
-rw-r--r--fs/btrfs/inode-map.c6
-rw-r--r--fs/btrfs/inode.c457
-rw-r--r--fs/btrfs/ioctl.c227
-rw-r--r--fs/btrfs/ioctl.h29
-rw-r--r--fs/btrfs/print-tree.c8
-rw-r--r--fs/btrfs/reada.c951
-rw-r--r--fs/btrfs/relocation.c24
-rw-r--r--fs/btrfs/scrub.c591
-rw-r--r--fs/btrfs/super.c298
-rw-r--r--fs/btrfs/transaction.c146
-rw-r--r--fs/btrfs/tree-log.c19
-rw-r--r--fs/btrfs/volumes.c207
-rw-r--r--fs/btrfs/volumes.h18
-rw-r--r--fs/btrfs/xattr.c11
31 files changed, 5423 insertions, 1603 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21f..c0ddfd29c5e5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o
11 12
12btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a11..89b156d85d63 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
59 if (!value) 59 if (!value)
60 return ERR_PTR(-ENOMEM); 60 return ERR_PTR(-ENOMEM);
61 size = __btrfs_getxattr(inode, name, value, size); 61 size = __btrfs_getxattr(inode, name, value, size);
62 if (size > 0) { 62 }
63 acl = posix_acl_from_xattr(value, size); 63 if (size > 0) {
64 if (IS_ERR(acl)) { 64 acl = posix_acl_from_xattr(value, size);
65 kfree(value);
66 return acl;
67 }
68 set_cached_acl(inode, type, acl);
69 }
70 kfree(value);
71 } else if (size == -ENOENT || size == -ENODATA || size == 0) { 65 } else if (size == -ENOENT || size == -ENODATA || size == 0) {
72 /* FIXME, who returns -ENOENT? I think nobody */ 66 /* FIXME, who returns -ENOENT? I think nobody */
73 acl = NULL; 67 acl = NULL;
74 set_cached_acl(inode, type, acl);
75 } else { 68 } else {
76 acl = ERR_PTR(-EIO); 69 acl = ERR_PTR(-EIO);
77 } 70 }
71 kfree(value);
72
73 if (!IS_ERR(acl))
74 set_cached_acl(inode, type, acl);
78 75
79 return acl; 76 return acl;
80} 77}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 000000000000..8855aad3929c
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,776 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "backref.h"
22
/*
 * one queued data reference. entries are allocated by __data_list_add() and
 * later handed to the iterate callback as (inum, offset, root) before being
 * freed.
 */
23struct __data_ref {
24 struct list_head list; /* links the entry into the caller's list */
25 u64 inum; /* inode number passed to the callback */
26 u64 root; /* root id passed to the callback */
27 u64 extent_data_item_offset; /* added to extent_offset for the callback */
28};
29
/*
 * one queued shared data reference. entries are allocated by
 * __shared_list_add() and consumed by iterate_extent_inodes().
 */
30struct __shared_ref {
31 struct list_head list; /* links the entry into the caller's list */
32 u64 disk_byte; /* passed as "logical" to __iter_shared_inline_ref() */
33};
34
35static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path,
37 struct btrfs_key *found_key)
38{
39 int ret;
40 struct btrfs_key key;
41 struct extent_buffer *eb;
42
43 key.type = key_type;
44 key.objectid = inum;
45 key.offset = ioff;
46
47 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
48 if (ret < 0)
49 return ret;
50
51 eb = path->nodes[0];
52 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
53 ret = btrfs_next_leaf(fs_root, path);
54 if (ret)
55 return ret;
56 eb = path->nodes[0];
57 }
58
59 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
60 if (found_key->type != key.type || found_key->objectid != key.objectid)
61 return 1;
62
63 return 0;
64}
65
66/*
67 * this makes the path point to (inum INODE_ITEM ioff)
68 */
69int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
70 struct btrfs_path *path)
71{
72 struct btrfs_key key;
73 return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
74 &key);
75}
76
77static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
78 struct btrfs_path *path,
79 struct btrfs_key *found_key)
80{
81 return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
82 found_key);
83}
84
85/*
86 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
87 * of the path are separated by '/' and the path is guaranteed to be
88 * 0-terminated. the path is only given within the current file system.
89 * Therefore, it never starts with a '/'. the caller is responsible to provide
90 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
91 * the start point of the resulting string is returned. this pointer is within
92 * dest, normally.
93 * in case the path buffer would overflow, the pointer is decremented further
94 * as if output was written to the buffer, though no more output is actually
95 * generated. that way, the caller can determine how much space would be
96 * required for the path to fit into the buffer. in that case, the returned
97 * value will be smaller than dest. callers must check this!
98 */
99static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
100 struct btrfs_inode_ref *iref,
101 struct extent_buffer *eb_in, u64 parent,
102 char *dest, u32 size)
103{
104 u32 len;
105 int slot;
106 u64 next_inum;
107 int ret;
108 s64 bytes_left = size - 1;
109 struct extent_buffer *eb = eb_in;
110 struct btrfs_key found_key;
111
112 if (bytes_left >= 0)
113 dest[bytes_left] = '\0';
114
115 while (1) {
116 len = btrfs_inode_ref_name_len(eb, iref);
117 bytes_left -= len;
118 if (bytes_left >= 0)
119 read_extent_buffer(eb, dest + bytes_left,
120 (unsigned long)(iref + 1), len);
121 if (eb != eb_in)
122 free_extent_buffer(eb);
123 ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
124 if (ret)
125 break;
126 next_inum = found_key.offset;
127
128 /* regular exit ahead */
129 if (parent == next_inum)
130 break;
131
132 slot = path->slots[0];
133 eb = path->nodes[0];
134 /* make sure we can use eb after releasing the path */
135 if (eb != eb_in)
136 atomic_inc(&eb->refs);
137 btrfs_release_path(path);
138
139 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
140 parent = next_inum;
141 --bytes_left;
142 if (bytes_left >= 0)
143 dest[bytes_left] = '/';
144 }
145
146 btrfs_release_path(path);
147
148 if (ret)
149 return ERR_PTR(ret);
150
151 return dest + bytes_left;
152}
153
154/*
155 * this makes the path point to (logical EXTENT_ITEM *)
156 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
157 * tree blocks and <0 on error.
158 */
159int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
160 struct btrfs_path *path, struct btrfs_key *found_key)
161{
162 int ret;
163 u64 flags;
164 u32 item_size;
165 struct extent_buffer *eb;
166 struct btrfs_extent_item *ei;
167 struct btrfs_key key;
168
169 key.type = BTRFS_EXTENT_ITEM_KEY;
170 key.objectid = logical;
171 key.offset = (u64)-1;
172
173 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
174 if (ret < 0)
175 return ret;
176 ret = btrfs_previous_item(fs_info->extent_root, path,
177 0, BTRFS_EXTENT_ITEM_KEY);
178 if (ret < 0)
179 return ret;
180
181 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
182 if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
183 found_key->objectid > logical ||
184 found_key->objectid + found_key->offset <= logical)
185 return -ENOENT;
186
187 eb = path->nodes[0];
188 item_size = btrfs_item_size_nr(eb, path->slots[0]);
189 BUG_ON(item_size < sizeof(*ei));
190
191 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
192 flags = btrfs_extent_flags(eb, ei);
193
194 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
195 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
196 if (flags & BTRFS_EXTENT_FLAG_DATA)
197 return BTRFS_EXTENT_FLAG_DATA;
198
199 return -EIO;
200}
201
202/*
203 * helper function to iterate extent inline refs. ptr must point to a 0 value
204 * for the first call and may be modified. it is used to track state.
205 * if more refs exist, 0 is returned and the next call to
206 * __get_extent_inline_ref must pass the modified ptr parameter to get the
207 * next ref. after the last ref was processed, 1 is returned.
208 * returns <0 on error
209 */
210static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
211 struct btrfs_extent_item *ei, u32 item_size,
212 struct btrfs_extent_inline_ref **out_eiref,
213 int *out_type)
214{
215 unsigned long end;
216 u64 flags;
217 struct btrfs_tree_block_info *info;
218
219 if (!*ptr) {
220 /* first call */
221 flags = btrfs_extent_flags(eb, ei);
222 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
223 info = (struct btrfs_tree_block_info *)(ei + 1);
224 *out_eiref =
225 (struct btrfs_extent_inline_ref *)(info + 1);
226 } else {
227 *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
228 }
229 *ptr = (unsigned long)*out_eiref;
230 if ((void *)*ptr >= (void *)ei + item_size)
231 return -ENOENT;
232 }
233
234 end = (unsigned long)ei + item_size;
235 *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
236 *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
237
238 *ptr += btrfs_extent_inline_ref_size(*out_type);
239 WARN_ON(*ptr > end);
240 if (*ptr == end)
241 return 1; /* last */
242
243 return 0;
244}
245
246/*
247 * reads the tree block backref for an extent. tree level and root are returned
248 * through out_level and out_root. ptr must point to a 0 value for the first
249 * call and may be modified (see __get_extent_inline_ref comment).
250 * returns 0 if data was provided, 1 if there was no more data to provide or
251 * <0 on error.
252 */
253int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
254 struct btrfs_extent_item *ei, u32 item_size,
255 u64 *out_root, u8 *out_level)
256{
257 int ret;
258 int type;
259 struct btrfs_tree_block_info *info;
260 struct btrfs_extent_inline_ref *eiref;
261
262 if (*ptr == (unsigned long)-1)
263 return 1;
264
265 while (1) {
266 ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
267 &eiref, &type);
268 if (ret < 0)
269 return ret;
270
271 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
272 type == BTRFS_SHARED_BLOCK_REF_KEY)
273 break;
274
275 if (ret == 1)
276 return 1;
277 }
278
279 /* we can treat both ref types equally here */
280 info = (struct btrfs_tree_block_info *)(ei + 1);
281 *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
282 *out_level = btrfs_tree_block_level(eb, info);
283
284 if (ret == 1)
285 *ptr = (unsigned long)-1;
286
287 return 0;
288}
289
290static int __data_list_add(struct list_head *head, u64 inum,
291 u64 extent_data_item_offset, u64 root)
292{
293 struct __data_ref *ref;
294
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{
413 u64 disk_byte;
414 struct btrfs_key key;
415 struct btrfs_file_extent_item *fi;
416 struct extent_buffer *eb;
417 int slot;
418 int nritems;
419 int ret;
420 int found = 0;
421
422 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0);
424 if (!eb)
425 return -EIO;
426
427 /*
428 * from the shared data ref, we only have the leaf but we need
429 * the key. thus, we must look into all items and see that we
430 * find one (some) with a reference to our extent item.
431 */
432 nritems = btrfs_header_nritems(eb);
433 for (slot = 0; slot < nritems; ++slot) {
434 btrfs_item_key_to_cpu(eb, &key, slot);
435 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) {
439 free_extent_buffer(eb);
440 return -EIO;
441 }
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) {
444 if (found)
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459
460 if (!found) {
461 printk(KERN_ERR "btrfs: failed to follow shared data backref "
462 "to parent %llu\n", logical);
463 WARN_ON(1);
464 ret = -EIO;
465 }
466
467 free_extent_buffer(eb);
468 return ret;
469}
470
471/*
472 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops.
476 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path,
479 u64 extent_item_objectid,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx)
482{
483 unsigned long ptr = 0;
484 int last;
485 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d;
497 struct __shared_ref *ref_s;
498
499 eb = path->nodes[0];
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
501 item_size = btrfs_item_size_nr(eb, path->slots[0]);
502
503 /* first we iterate the inline refs, ... */
504 do {
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
506 &eiref, &type);
507 if (last == -ENOENT) {
508 ret = 0;
509 break;
510 }
511 if (last < 0) {
512 ret = last;
513 break;
514 }
515
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524
525 /* ... then we proceed to in-tree references and ... */
526 while (!ret) {
527 ++path->slots[0];
528 if (path->slots[0] > btrfs_header_nritems(eb)) {
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
541 dref = btrfs_item_ptr(eb, path->slots[0],
542 struct btrfs_extent_data_ref);
543 ret = __data_list_add_eb(&data_refs, eb, dref);
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
545 ret = __shared_list_add(&shared_refs, key.offset);
546 }
547 }
548
549 btrfs_release_path(path);
550
551 /*
552 * ... only at the very end we can process the refs we found. this is
553 * because the iterator function we call is allowed to make tree lookups
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret;
582}
583
584int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
585 struct btrfs_path *path,
586 iterate_extent_inodes_t *iterate, void *ctx)
587{
588 int ret;
589 u64 offset;
590 struct btrfs_key found_key;
591
592 ret = extent_from_logical(fs_info, logical, path,
593 &found_key);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL;
596 if (ret < 0)
597 return ret;
598
599 offset = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx);
602
603 return ret;
604}
605
606static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
607 struct btrfs_path *path,
608 iterate_irefs_t *iterate, void *ctx)
609{
610 int ret;
611 int slot;
612 u32 cur;
613 u32 len;
614 u32 name_len;
615 u64 parent = 0;
616 int found = 0;
617 struct extent_buffer *eb;
618 struct btrfs_item *item;
619 struct btrfs_inode_ref *iref;
620 struct btrfs_key found_key;
621
622 while (1) {
623 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
624 &found_key);
625 if (ret < 0)
626 break;
627 if (ret) {
628 ret = found ? 0 : -ENOENT;
629 break;
630 }
631 ++found;
632
633 parent = found_key.offset;
634 slot = path->slots[0];
635 eb = path->nodes[0];
636 /* make sure we can use eb after releasing the path */
637 atomic_inc(&eb->refs);
638 btrfs_release_path(path);
639
640 item = btrfs_item_nr(eb, slot);
641 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
642
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */
646 ret = iterate(parent, iref, eb, ctx);
647 if (ret) {
648 free_extent_buffer(eb);
649 break;
650 }
651 len = sizeof(*iref) + name_len;
652 iref = (struct btrfs_inode_ref *)((char *)iref + len);
653 }
654 free_extent_buffer(eb);
655 }
656
657 btrfs_release_path(path);
658
659 return ret;
660}
661
662/*
663 * returns 0 if the path could be dumped (probably truncated)
664 * returns <0 in case of an error
665 */
666static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
667 struct extent_buffer *eb, void *ctx)
668{
669 struct inode_fs_paths *ipath = ctx;
670 char *fspath;
671 char *fspath_min;
672 int i = ipath->fspath->elem_cnt;
673 const int s_ptr = sizeof(char *);
674 u32 bytes_left;
675
676 bytes_left = ipath->fspath->bytes_left > s_ptr ?
677 ipath->fspath->bytes_left - s_ptr : 0;
678
679 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
680 fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
681 inum, fspath_min, bytes_left);
682 if (IS_ERR(fspath))
683 return PTR_ERR(fspath);
684
685 if (fspath > fspath_min) {
686 ipath->fspath->val[i] = (u64)fspath;
687 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else {
690 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0;
693 }
694
695 return 0;
696}
697
698/*
699 * this dumps all file system paths to the inode into the ipath struct, provided
700 * is has been created large enough. each path is zero-terminated and accessed
701 * from ipath->fspath->val[i].
702 * when it returns, there are ipath->fspath->elem_cnt number of paths available
703 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
704 * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
705 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
706 * have been needed to return all paths.
707 */
708int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
709{
710 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
711 inode_to_path, ipath);
712}
713
714/*
715 * allocates space to return multiple file system paths for an inode.
716 * total_bytes to allocate are passed, note that space usable for actual path
717 * information will be total_bytes - sizeof(struct inode_fs_paths).
718 * the returned pointer must be freed with free_ipath() in the end.
719 */
720struct btrfs_data_container *init_data_container(u32 total_bytes)
721{
722 struct btrfs_data_container *data;
723 size_t alloc_bytes;
724
725 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
726 data = kmalloc(alloc_bytes, GFP_NOFS);
727 if (!data)
728 return ERR_PTR(-ENOMEM);
729
730 if (total_bytes >= sizeof(*data)) {
731 data->bytes_left = total_bytes - sizeof(*data);
732 data->bytes_missing = 0;
733 } else {
734 data->bytes_missing = sizeof(*data) - total_bytes;
735 data->bytes_left = 0;
736 }
737
738 data->elem_cnt = 0;
739 data->elem_missed = 0;
740
741 return data;
742}
743
744/*
745 * allocates space to return multiple file system paths for an inode.
746 * total_bytes to allocate are passed, note that space usable for actual path
747 * information will be total_bytes - sizeof(struct inode_fs_paths).
748 * the returned pointer must be freed with free_ipath() in the end.
749 */
750struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
751 struct btrfs_path *path)
752{
753 struct inode_fs_paths *ifp;
754 struct btrfs_data_container *fspath;
755
756 fspath = init_data_container(total_bytes);
757 if (IS_ERR(fspath))
758 return (void *)fspath;
759
760 ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
761 if (!ifp) {
762 kfree(fspath);
763 return ERR_PTR(-ENOMEM);
764 }
765
766 ifp->btrfs_path = path;
767 ifp->fspath = fspath;
768 ifp->fs_root = fs_root;
769
770 return ifp;
771}
772
773void free_ipath(struct inode_fs_paths *ipath)
774{
775 kfree(ipath);
776}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 000000000000..92618837cb8f
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_BACKREF__
20#define __BTRFS_BACKREF__
21
22#include "ioctl.h"
23
/*
 * state for resolving an inode into its file system paths; set up by
 * init_ipath() and filled by paths_from_inode().
 */
24struct inode_fs_paths {
25 struct btrfs_path *btrfs_path; /* path used for the tree lookups */
26 struct btrfs_root *fs_root; /* root the inode refs are searched in */
27 struct btrfs_data_container *fspath; /* receives the resulting paths */
28};
29
/*
 * callback of iterate_extent_inodes() and iterate_inodes_from_logical(),
 * called once per reference found. a non-zero return value stops the
 * iteration.
 */
30typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
31 void *ctx);
/*
 * callback invoked for every inode ref of an inode; iref is located inside
 * eb. ctx is passed through from the caller of the iteration.
 */
32typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
33 struct extent_buffer *eb, void *ctx);
34
35int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36 struct btrfs_path *path);
37
38int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39 struct btrfs_path *path, struct btrfs_key *found_key);
40
41int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
42 struct btrfs_extent_item *ei, u32 item_size,
43 u64 *out_root, u8 *out_level);
44
45int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
46 struct btrfs_path *path,
47 u64 extent_item_objectid,
48 u64 extent_offset,
49 iterate_extent_inodes_t *iterate, void *ctx);
50
51int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
52 struct btrfs_path *path,
53 iterate_extent_inodes_t *iterate, void *ctx);
54
55int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56
57struct btrfs_data_container *init_data_container(u32 total_bytes);
58struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
59 struct btrfs_path *path);
60void free_ipath(struct inode_fs_paths *ipath);
61
62#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd6..5a5d325a3935 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
103 */ 103 */
104 u64 delalloc_bytes; 104 u64 delalloc_bytes;
105 105
106 /* total number of bytes that may be used for this inode for
107 * delalloc
108 */
109 u64 reserved_bytes;
110
111 /* 106 /*
112 * the size of the file stored in the metadata on disk. data=ordered 107 * the size of the file stored in the metadata on disk. data=ordered
113 * means the in-memory i_size might be larger than the size on disk 108 * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
115 */ 110 */
116 u64 disk_i_size; 111 u64 disk_i_size;
117 112
118 /* flags field from the on disk inode */
119 u32 flags;
120
121 /* 113 /*
122 * if this is a directory then index_cnt is the counter for the index 114 * if this is a directory then index_cnt is the counter for the index
123 * number for new files that are created 115 * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
132 u64 last_unlink_trans; 124 u64 last_unlink_trans;
133 125
134 /* 126 /*
127 * Number of bytes outstanding that are going to need csums. This is
128 * used in ENOSPC accounting.
129 */
130 u64 csum_bytes;
131
132 /* flags field from the on disk inode */
133 u32 flags;
134
135 /*
135 * Counters to keep track of the number of extent item's we may use due 136 * Counters to keep track of the number of extent item's we may use due
136 * to delalloc and such. outstanding_extents is the number of extent 137 * to delalloc and such. outstanding_extents is the number of extent
137 * items we think we'll end up using, and reserved_extents is the number 138 * items we think we'll end up using, and reserved_extents is the number
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f1734..14f1c5a0b2d2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
85static inline int compressed_bio_size(struct btrfs_root *root, 85static inline int compressed_bio_size(struct btrfs_root *root,
86 unsigned long disk_size) 86 unsigned long disk_size)
87{ 87{
88 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 88 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
89
89 return sizeof(struct compressed_bio) + 90 return sizeof(struct compressed_bio) +
90 ((disk_size + root->sectorsize - 1) / root->sectorsize) * 91 ((disk_size + root->sectorsize - 1) / root->sectorsize) *
91 csum_size; 92 csum_size;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8d..0fe615e4ea38 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -902,9 +902,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
902 902
903 orig_ptr = btrfs_node_blockptr(mid, orig_slot); 903 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
904 904
905 if (level < BTRFS_MAX_LEVEL - 1) 905 if (level < BTRFS_MAX_LEVEL - 1) {
906 parent = path->nodes[level + 1]; 906 parent = path->nodes[level + 1];
907 pslot = path->slots[level + 1]; 907 pslot = path->slots[level + 1];
908 }
908 909
909 /* 910 /*
910 * deal with the case where there is only one pointer in the root 911 * deal with the case where there is only one pointer in the root
@@ -1107,9 +1108,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1107 mid = path->nodes[level]; 1108 mid = path->nodes[level];
1108 WARN_ON(btrfs_header_generation(mid) != trans->transid); 1109 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1109 1110
1110 if (level < BTRFS_MAX_LEVEL - 1) 1111 if (level < BTRFS_MAX_LEVEL - 1) {
1111 parent = path->nodes[level + 1]; 1112 parent = path->nodes[level + 1];
1112 pslot = path->slots[level + 1]; 1113 pslot = path->slots[level + 1];
1114 }
1113 1115
1114 if (!parent) 1116 if (!parent)
1115 return 1; 1117 return 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..b9ba59ff9292 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
30#include <linux/kobject.h> 30#include <linux/kobject.h>
31#include <trace/events/btrfs.h> 31#include <trace/events/btrfs.h>
32#include <asm/kmap_types.h> 32#include <asm/kmap_types.h>
33#include <linux/pagemap.h>
33#include "extent_io.h" 34#include "extent_io.h"
34#include "extent_map.h" 35#include "extent_map.h"
35#include "async-thread.h" 36#include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
360#define BTRFS_LABEL_SIZE 256 361#define BTRFS_LABEL_SIZE 256
361 362
362/* 363/*
364 * just in case we somehow lose the roots and are not able to mount,
365 * we store an array of the roots from previous transactions
366 * in the super.
367 */
368#define BTRFS_NUM_BACKUP_ROOTS 4
369struct btrfs_root_backup {
370 __le64 tree_root;
371 __le64 tree_root_gen;
372
373 __le64 chunk_root;
374 __le64 chunk_root_gen;
375
376 __le64 extent_root;
377 __le64 extent_root_gen;
378
379 __le64 fs_root;
380 __le64 fs_root_gen;
381
382 __le64 dev_root;
383 __le64 dev_root_gen;
384
385 __le64 csum_root;
386 __le64 csum_root_gen;
387
388 __le64 total_bytes;
389 __le64 bytes_used;
390 __le64 num_devices;
391 /* future */
392 __le64 unsed_64[4];
393
394 u8 tree_root_level;
395 u8 chunk_root_level;
396 u8 extent_root_level;
397 u8 fs_root_level;
398 u8 dev_root_level;
399 u8 csum_root_level;
400 /* future and to align */
401 u8 unused_8[10];
402} __attribute__ ((__packed__));
403
404/*
363 * the super block basically lists the main trees of the FS 405 * the super block basically lists the main trees of the FS
364 * it currently lacks any block count etc etc 406 * it currently lacks any block count etc etc
365 */ 407 */
@@ -405,6 +447,7 @@ struct btrfs_super_block {
405 /* future expansion */ 447 /* future expansion */
406 __le64 reserved[31]; 448 __le64 reserved[31];
407 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; 449 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
450 struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
408} __attribute__ ((__packed__)); 451} __attribute__ ((__packed__));
409 452
410/* 453/*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
772struct btrfs_block_rsv { 815struct btrfs_block_rsv {
773 u64 size; 816 u64 size;
774 u64 reserved; 817 u64 reserved;
775 u64 freed[2];
776 struct btrfs_space_info *space_info; 818 struct btrfs_space_info *space_info;
777 struct list_head list;
778 spinlock_t lock; 819 spinlock_t lock;
779 atomic_t usage;
780 unsigned int priority:8;
781 unsigned int durable:1;
782 unsigned int refill_used:1;
783 unsigned int full:1; 820 unsigned int full:1;
784}; 821};
785 822
@@ -840,10 +877,10 @@ struct btrfs_block_group_cache {
840 spinlock_t lock; 877 spinlock_t lock;
841 u64 pinned; 878 u64 pinned;
842 u64 reserved; 879 u64 reserved;
843 u64 reserved_pinned;
844 u64 bytes_super; 880 u64 bytes_super;
845 u64 flags; 881 u64 flags;
846 u64 sectorsize; 882 u64 sectorsize;
883 u64 cache_generation;
847 unsigned int ro:1; 884 unsigned int ro:1;
848 unsigned int dirty:1; 885 unsigned int dirty:1;
849 unsigned int iref:1; 886 unsigned int iref:1;
@@ -899,6 +936,10 @@ struct btrfs_fs_info {
899 spinlock_t block_group_cache_lock; 936 spinlock_t block_group_cache_lock;
900 struct rb_root block_group_cache_tree; 937 struct rb_root block_group_cache_tree;
901 938
939 /* keep track of unallocated space */
940 spinlock_t free_chunk_lock;
941 u64 free_chunk_space;
942
902 struct extent_io_tree freed_extents[2]; 943 struct extent_io_tree freed_extents[2];
903 struct extent_io_tree *pinned_extents; 944 struct extent_io_tree *pinned_extents;
904 945
@@ -916,14 +957,11 @@ struct btrfs_fs_info {
916 struct btrfs_block_rsv trans_block_rsv; 957 struct btrfs_block_rsv trans_block_rsv;
917 /* block reservation for chunk tree */ 958 /* block reservation for chunk tree */
918 struct btrfs_block_rsv chunk_block_rsv; 959 struct btrfs_block_rsv chunk_block_rsv;
960 /* block reservation for delayed operations */
961 struct btrfs_block_rsv delayed_block_rsv;
919 962
920 struct btrfs_block_rsv empty_block_rsv; 963 struct btrfs_block_rsv empty_block_rsv;
921 964
922 /* list of block reservations that cross multiple transactions */
923 struct list_head durable_block_rsv_list;
924
925 struct mutex durable_block_rsv_mutex;
926
927 u64 generation; 965 u64 generation;
928 u64 last_trans_committed; 966 u64 last_trans_committed;
929 967
@@ -942,8 +980,8 @@ struct btrfs_fs_info {
942 wait_queue_head_t transaction_blocked_wait; 980 wait_queue_head_t transaction_blocked_wait;
943 wait_queue_head_t async_submit_wait; 981 wait_queue_head_t async_submit_wait;
944 982
945 struct btrfs_super_block super_copy; 983 struct btrfs_super_block *super_copy;
946 struct btrfs_super_block super_for_commit; 984 struct btrfs_super_block *super_for_commit;
947 struct block_device *__bdev; 985 struct block_device *__bdev;
948 struct super_block *sb; 986 struct super_block *sb;
949 struct inode *btree_inode; 987 struct inode *btree_inode;
@@ -1036,6 +1074,7 @@ struct btrfs_fs_info {
1036 struct btrfs_workers endio_freespace_worker; 1074 struct btrfs_workers endio_freespace_worker;
1037 struct btrfs_workers submit_workers; 1075 struct btrfs_workers submit_workers;
1038 struct btrfs_workers caching_workers; 1076 struct btrfs_workers caching_workers;
1077 struct btrfs_workers readahead_workers;
1039 1078
1040 /* 1079 /*
1041 * fixup workers take dirty pages that didn't properly go through 1080 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1158,13 @@ struct btrfs_fs_info {
1119 u64 fs_state; 1158 u64 fs_state;
1120 1159
1121 struct btrfs_delayed_root *delayed_root; 1160 struct btrfs_delayed_root *delayed_root;
1161
1162 /* readahead tree */
1163 spinlock_t reada_lock;
1164 struct radix_tree_root reada_tree;
1165
1166 /* next backup root to be overwritten */
1167 int backup_root_index;
1122}; 1168};
1123 1169
1124/* 1170/*
@@ -1363,6 +1409,7 @@ struct btrfs_ioctl_defrag_range_args {
1363#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) 1409#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1364#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1410#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1365#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 1411#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1412#define BTRFS_MOUNT_RECOVERY (1 << 18)
1366 1413
1367#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1414#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1368#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1415#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1978,6 +2025,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
1978 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; 2025 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
1979} 2026}
1980 2027
2028/* struct btrfs_root_backup */
2029BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
2030 tree_root, 64);
2031BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
2032 tree_root_gen, 64);
2033BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
2034 tree_root_level, 8);
2035
2036BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
2037 chunk_root, 64);
2038BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
2039 chunk_root_gen, 64);
2040BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
2041 chunk_root_level, 8);
2042
2043BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
2044 extent_root, 64);
2045BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
2046 extent_root_gen, 64);
2047BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
2048 extent_root_level, 8);
2049
2050BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
2051 fs_root, 64);
2052BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
2053 fs_root_gen, 64);
2054BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
2055 fs_root_level, 8);
2056
2057BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
2058 dev_root, 64);
2059BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
2060 dev_root_gen, 64);
2061BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
2062 dev_root_level, 8);
2063
2064BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
2065 csum_root, 64);
2066BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
2067 csum_root_gen, 64);
2068BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
2069 csum_root_level, 8);
2070BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
2071 total_bytes, 64);
2072BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
2073 bytes_used, 64);
2074BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
2075 num_devices, 64);
2076
1981/* struct btrfs_super_block */ 2077/* struct btrfs_super_block */
1982 2078
1983BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 2079BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2129,6 +2225,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2129 (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); 2225 (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
2130} 2226}
2131 2227
2228static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
2229{
2230 return mapping_gfp_mask(mapping) & ~__GFP_FS;
2231}
2232
2132/* extent-tree.c */ 2233/* extent-tree.c */
2133static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 2234static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2134 unsigned num_items) 2235 unsigned num_items)
@@ -2137,6 +2238,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2137 3 * num_items; 2238 3 * num_items;
2138} 2239}
2139 2240
2241/*
2242 * Doing a truncate won't result in new nodes or leaves, just what we need for
2243 * COW.
2244 */
2245static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
2246 unsigned num_items)
2247{
2248 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2249 num_items;
2250}
2251
2140void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 2252void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
2141int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2253int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2142 struct btrfs_root *root, unsigned long count); 2254 struct btrfs_root *root, unsigned long count);
@@ -2146,6 +2258,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2146 u64 num_bytes, u64 *refs, u64 *flags); 2258 u64 num_bytes, u64 *refs, u64 *flags);
2147int btrfs_pin_extent(struct btrfs_root *root, 2259int btrfs_pin_extent(struct btrfs_root *root,
2148 u64 bytenr, u64 num, int reserved); 2260 u64 bytenr, u64 num, int reserved);
2261int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2262 struct btrfs_root *root,
2263 u64 bytenr, u64 num_bytes);
2149int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2264int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2150 struct btrfs_root *root, 2265 struct btrfs_root *root,
2151 u64 objectid, u64 offset, u64 bytenr); 2266 u64 objectid, u64 offset, u64 bytenr);
@@ -2196,8 +2311,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2196 u64 root_objectid, u64 owner, u64 offset); 2311 u64 root_objectid, u64 owner, u64 offset);
2197 2312
2198int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2313int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2199int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 2314int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
2200 u64 num_bytes, int reserve, int sinfo); 2315 u64 start, u64 len);
2201int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 2316int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2202 struct btrfs_root *root); 2317 struct btrfs_root *root);
2203int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2318int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2355,23 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2240struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2355struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2241void btrfs_free_block_rsv(struct btrfs_root *root, 2356void btrfs_free_block_rsv(struct btrfs_root *root,
2242 struct btrfs_block_rsv *rsv); 2357 struct btrfs_block_rsv *rsv);
2243void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, 2358int btrfs_block_rsv_add(struct btrfs_root *root,
2244 struct btrfs_block_rsv *rsv);
2245int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2246 struct btrfs_root *root,
2247 struct btrfs_block_rsv *block_rsv, 2359 struct btrfs_block_rsv *block_rsv,
2248 u64 num_bytes); 2360 u64 num_bytes);
2249int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 2361int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2250 struct btrfs_root *root, 2362 struct btrfs_block_rsv *block_rsv,
2363 u64 num_bytes);
2364int btrfs_block_rsv_check(struct btrfs_root *root,
2365 struct btrfs_block_rsv *block_rsv, int min_factor);
2366int btrfs_block_rsv_refill(struct btrfs_root *root,
2251 struct btrfs_block_rsv *block_rsv, 2367 struct btrfs_block_rsv *block_rsv,
2252 u64 min_reserved, int min_factor); 2368 u64 min_reserved);
2253int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 2369int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2254 struct btrfs_block_rsv *dst_rsv, 2370 struct btrfs_block_rsv *dst_rsv,
2255 u64 num_bytes); 2371 u64 num_bytes);
2256void btrfs_block_rsv_release(struct btrfs_root *root, 2372void btrfs_block_rsv_release(struct btrfs_root *root,
2257 struct btrfs_block_rsv *block_rsv, 2373 struct btrfs_block_rsv *block_rsv,
2258 u64 num_bytes); 2374 u64 num_bytes);
2259int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2260 struct btrfs_root *root,
2261 struct btrfs_block_rsv *rsv);
2262int btrfs_set_block_group_ro(struct btrfs_root *root, 2375int btrfs_set_block_group_ro(struct btrfs_root *root,
2263 struct btrfs_block_group_cache *cache); 2376 struct btrfs_block_group_cache *cache);
2264int btrfs_set_block_group_rw(struct btrfs_root *root, 2377int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2379,6 +2492,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2379 smp_mb(); 2492 smp_mb();
2380 return fs_info->closing; 2493 return fs_info->closing;
2381} 2494}
2495static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2496{
2497 kfree(fs_info->delayed_root);
2498 kfree(fs_info->extent_root);
2499 kfree(fs_info->tree_root);
2500 kfree(fs_info->chunk_root);
2501 kfree(fs_info->dev_root);
2502 kfree(fs_info->csum_root);
2503 kfree(fs_info->super_copy);
2504 kfree(fs_info->super_for_commit);
2505 kfree(fs_info);
2506}
2382 2507
2383/* root-item.c */ 2508/* root-item.c */
2384int btrfs_find_root_ref(struct btrfs_root *tree_root, 2509int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2579,11 +2704,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2579int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2704int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2580int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2705int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2581int btrfs_orphan_cleanup(struct btrfs_root *root); 2706int btrfs_orphan_cleanup(struct btrfs_root *root);
2582void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2583 struct btrfs_pending_snapshot *pending,
2584 u64 *bytes_to_reserve);
2585void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2586 struct btrfs_pending_snapshot *pending);
2587void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2707void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2588 struct btrfs_root *root); 2708 struct btrfs_root *root);
2589int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); 2709int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2697,4 +2817,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
2697int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 2817int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2698 struct btrfs_scrub_progress *progress); 2818 struct btrfs_scrub_progress *progress);
2699 2819
2820/* reada.c */
2821struct reada_control {
2822 struct btrfs_root *root; /* tree to prefetch */
2823 struct btrfs_key key_start;
2824 struct btrfs_key key_end; /* exclusive */
2825 atomic_t elems;
2826 struct kref refcnt;
2827 wait_queue_head_t wait;
2828};
2829struct reada_control *btrfs_reada_add(struct btrfs_root *root,
2830 struct btrfs_key *start, struct btrfs_key *end);
2831int btrfs_reada_wait(void *handle);
2832void btrfs_reada_detach(void *handle);
2833int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
2834 u64 start, int err);
2835
2700#endif 2836#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index ae4d9cd10961..3a1b939c9ae2 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
591 return 0; 591 return 0;
592 592
593 src_rsv = trans->block_rsv; 593 src_rsv = trans->block_rsv;
594 dst_rsv = &root->fs_info->global_block_rsv; 594 dst_rsv = &root->fs_info->delayed_block_rsv;
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
609 if (!item->bytes_reserved) 609 if (!item->bytes_reserved)
610 return; 610 return;
611 611
612 rsv = &root->fs_info->global_block_rsv; 612 rsv = &root->fs_info->delayed_block_rsv;
613 btrfs_block_rsv_release(root, rsv, 613 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 614 item->bytes_reserved);
615} 615}
@@ -624,13 +624,36 @@ static int btrfs_delayed_inode_reserve_metadata(
624 u64 num_bytes; 624 u64 num_bytes;
625 int ret; 625 int ret;
626 626
627 if (!trans->bytes_reserved)
628 return 0;
629
630 src_rsv = trans->block_rsv; 627 src_rsv = trans->block_rsv;
631 dst_rsv = &root->fs_info->global_block_rsv; 628 dst_rsv = &root->fs_info->delayed_block_rsv;
632 629
633 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 630 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
631
632 /*
633 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
634 * which doesn't reserve space for speed. This is a problem since we
635 * still need to reserve space for this update, so try to reserve the
636 * space.
637 *
638 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
639 * we're accounted for.
640 */
641 if (!trans->bytes_reserved &&
642 src_rsv != &root->fs_info->delalloc_block_rsv) {
643 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
644 /*
645 * Since we're under a transaction reserve_metadata_bytes could
646 * try to commit the transaction which will make it return
647 * EAGAIN to make us stop the transaction we have, so return
648 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
649 */
650 if (ret == -EAGAIN)
651 ret = -ENOSPC;
652 if (!ret)
653 node->bytes_reserved = num_bytes;
654 return ret;
655 }
656
634 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 657 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
635 if (!ret) 658 if (!ret)
636 node->bytes_reserved = num_bytes; 659 node->bytes_reserved = num_bytes;
@@ -646,7 +669,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
646 if (!node->bytes_reserved) 669 if (!node->bytes_reserved)
647 return; 670 return;
648 671
649 rsv = &root->fs_info->global_block_rsv; 672 rsv = &root->fs_info->delayed_block_rsv;
650 btrfs_block_rsv_release(root, rsv, 673 btrfs_block_rsv_release(root, rsv,
651 node->bytes_reserved); 674 node->bytes_reserved);
652 node->bytes_reserved = 0; 675 node->bytes_reserved = 0;
@@ -1026,7 +1049,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1026 path->leave_spinning = 1; 1049 path->leave_spinning = 1;
1027 1050
1028 block_rsv = trans->block_rsv; 1051 block_rsv = trans->block_rsv;
1029 trans->block_rsv = &root->fs_info->global_block_rsv; 1052 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1030 1053
1031 delayed_root = btrfs_get_delayed_root(root); 1054 delayed_root = btrfs_get_delayed_root(root);
1032 1055
@@ -1069,7 +1092,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1069 path->leave_spinning = 1; 1092 path->leave_spinning = 1;
1070 1093
1071 block_rsv = trans->block_rsv; 1094 block_rsv = trans->block_rsv;
1072 trans->block_rsv = &node->root->fs_info->global_block_rsv; 1095 trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
1073 1096
1074 ret = btrfs_insert_delayed_items(trans, path, node->root, node); 1097 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1075 if (!ret) 1098 if (!ret)
@@ -1149,7 +1172,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1149 goto free_path; 1172 goto free_path;
1150 1173
1151 block_rsv = trans->block_rsv; 1174 block_rsv = trans->block_rsv;
1152 trans->block_rsv = &root->fs_info->global_block_rsv; 1175 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1153 1176
1154 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); 1177 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
1155 if (!ret) 1178 if (!ret)
@@ -1686,11 +1709,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1686 } 1709 }
1687 1710
1688 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); 1711 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
1689 /* 1712 if (ret)
1690 * we must reserve enough space when we start a new transaction, 1713 goto release_node;
1691 * so reserving metadata failure is impossible
1692 */
1693 BUG_ON(ret);
1694 1714
1695 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1715 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1696 delayed_node->inode_dirty = 1; 1716 delayed_node->inode_dirty = 1;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07ea91879a91..102c176fc29c 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, 256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
257 int verify) 257 int verify)
258{ 258{
259 u16 csum_size = 259 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
260 btrfs_super_csum_size(&root->fs_info->super_copy);
261 char *result = NULL; 260 char *result = NULL;
262 unsigned long len; 261 unsigned long len;
263 unsigned long cur_len; 262 unsigned long cur_len;
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
367 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); 366 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
368 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 367 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
369 while (1) { 368 while (1) {
370 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 369 ret = read_extent_buffer_pages(io_tree, eb, start,
370 WAIT_COMPLETE,
371 btree_get_extent, mirror_num); 371 btree_get_extent, mirror_num);
372 if (!ret && 372 if (!ret &&
373 !verify_parent_transid(io_tree, eb, parent_transid)) 373 !verify_parent_transid(io_tree, eb, parent_transid))
@@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
609 end = eb->start + end - 1; 609 end = eb->start + end - 1;
610err: 610err:
611 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
612 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
613 btree_readahead_hook(root, eb, eb->start, ret);
614 }
615
611 free_extent_buffer(eb); 616 free_extent_buffer(eb);
612out: 617out:
613 return ret; 618 return ret;
614} 619}
615 620
621static int btree_io_failed_hook(struct bio *failed_bio,
622 struct page *page, u64 start, u64 end,
623 u64 mirror_num, struct extent_state *state)
624{
625 struct extent_io_tree *tree;
626 unsigned long len;
627 struct extent_buffer *eb;
628 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
629
630 tree = &BTRFS_I(page->mapping->host)->io_tree;
631 if (page->private == EXTENT_PAGE_PRIVATE)
632 goto out;
633 if (!page->private)
634 goto out;
635
636 len = page->private >> 2;
637 WARN_ON(len == 0);
638
639 eb = alloc_extent_buffer(tree, start, len, page);
640 if (eb == NULL)
641 goto out;
642
643 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
644 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
645 btree_readahead_hook(root, eb, eb->start, -EIO);
646 }
647 free_extent_buffer(eb);
648
649out:
650 return -EIO; /* we fixed nothing */
651}
652
616static void end_workqueue_bio(struct bio *bio, int err) 653static void end_workqueue_bio(struct bio *bio, int err)
617{ 654{
618 struct end_io_wq *end_io_wq = bio->bi_private; 655 struct end_io_wq *end_io_wq = bio->bi_private;
@@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page)
908{ 945{
909 struct extent_io_tree *tree; 946 struct extent_io_tree *tree;
910 tree = &BTRFS_I(page->mapping->host)->io_tree; 947 tree = &BTRFS_I(page->mapping->host)->io_tree;
911 return extent_read_full_page(tree, page, btree_get_extent); 948 return extent_read_full_page(tree, page, btree_get_extent, 0);
912} 949}
913 950
914static int btree_releasepage(struct page *page, gfp_t gfp_flags) 951static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
974 if (!buf) 1011 if (!buf)
975 return 0; 1012 return 0;
976 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1013 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
977 buf, 0, 0, btree_get_extent, 0); 1014 buf, 0, WAIT_NONE, btree_get_extent, 0);
978 free_extent_buffer(buf); 1015 free_extent_buffer(buf);
979 return ret; 1016 return ret;
980} 1017}
981 1018
1019int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1020 int mirror_num, struct extent_buffer **eb)
1021{
1022 struct extent_buffer *buf = NULL;
1023 struct inode *btree_inode = root->fs_info->btree_inode;
1024 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1025 int ret;
1026
1027 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1028 if (!buf)
1029 return 0;
1030
1031 set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1032
1033 ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1034 btree_get_extent, mirror_num);
1035 if (ret) {
1036 free_extent_buffer(buf);
1037 return ret;
1038 }
1039
1040 if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1041 free_extent_buffer(buf);
1042 return -EIO;
1043 } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
1044 *eb = buf;
1045 } else {
1046 free_extent_buffer(buf);
1047 }
1048 return 0;
1049}
1050
982struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1051struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
983 u64 bytenr, u32 blocksize) 1052 u64 bytenr, u32 blocksize)
984{ 1053{
@@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1135 1204
1136 generation = btrfs_root_generation(&root->root_item); 1205 generation = btrfs_root_generation(&root->root_item);
1137 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1206 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1207 root->commit_root = NULL;
1138 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1208 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1139 blocksize, generation); 1209 blocksize, generation);
1140 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { 1210 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1141 free_extent_buffer(root->node); 1211 free_extent_buffer(root->node);
1212 root->node = NULL;
1142 return -EIO; 1213 return -EIO;
1143 } 1214 }
1144 root->commit_root = btrfs_root_node(root); 1215 root->commit_root = btrfs_root_node(root);
@@ -1577,6 +1648,235 @@ sleep:
1577 return 0; 1648 return 0;
1578} 1649}
1579 1650
1651/*
1652 * this will find the highest generation in the array of
1653 * root backups. The index of the highest array is returned,
1654 * or -1 if we can't find anything.
1655 *
1656 * We check to make sure the array is valid by comparing the
1657 * generation of the latest root in the array with the generation
1658 * in the super block. If they don't match we pitch it.
1659 */
1660static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1661{
1662 u64 cur;
1663 int newest_index = -1;
1664 struct btrfs_root_backup *root_backup;
1665 int i;
1666
1667 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1668 root_backup = info->super_copy->super_roots + i;
1669 cur = btrfs_backup_tree_root_gen(root_backup);
1670 if (cur == newest_gen)
1671 newest_index = i;
1672 }
1673
1674 /* check to see if we actually wrapped around */
1675 if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1676 root_backup = info->super_copy->super_roots;
1677 cur = btrfs_backup_tree_root_gen(root_backup);
1678 if (cur == newest_gen)
1679 newest_index = 0;
1680 }
1681 return newest_index;
1682}
1683
1684
1685/*
1686 * find the oldest backup so we know where to store new entries
1687 * in the backup array. This will set the backup_root_index
1688 * field in the fs_info struct
1689 */
1690static void find_oldest_super_backup(struct btrfs_fs_info *info,
1691 u64 newest_gen)
1692{
1693 int newest_index = -1;
1694
1695 newest_index = find_newest_super_backup(info, newest_gen);
1696 /* if there was garbage in there, just move along */
1697 if (newest_index == -1) {
1698 info->backup_root_index = 0;
1699 } else {
1700 info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1701 }
1702}
1703
1704/*
1705 * copy all the root pointers into the super backup array.
1706 * this will bump the backup pointer by one when it is
1707 * done
1708 */
1709static void backup_super_roots(struct btrfs_fs_info *info)
1710{
1711 int next_backup;
1712 struct btrfs_root_backup *root_backup;
1713 int last_backup;
1714
1715 next_backup = info->backup_root_index;
1716 last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
1717 BTRFS_NUM_BACKUP_ROOTS;
1718
1719 /*
1720 * just overwrite the last backup if we're at the same generation
1721 * this happens only at umount
1722 */
1723 root_backup = info->super_for_commit->super_roots + last_backup;
1724 if (btrfs_backup_tree_root_gen(root_backup) ==
1725 btrfs_header_generation(info->tree_root->node))
1726 next_backup = last_backup;
1727
1728 root_backup = info->super_for_commit->super_roots + next_backup;
1729
1730 /*
1731 * make sure all of our padding and empty slots get zero filled
1732 * regardless of which ones we use today
1733 */
1734 memset(root_backup, 0, sizeof(*root_backup));
1735
1736 info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1737
1738 btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1739 btrfs_set_backup_tree_root_gen(root_backup,
1740 btrfs_header_generation(info->tree_root->node));
1741
1742 btrfs_set_backup_tree_root_level(root_backup,
1743 btrfs_header_level(info->tree_root->node));
1744
1745 btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1746 btrfs_set_backup_chunk_root_gen(root_backup,
1747 btrfs_header_generation(info->chunk_root->node));
1748 btrfs_set_backup_chunk_root_level(root_backup,
1749 btrfs_header_level(info->chunk_root->node));
1750
1751 btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
1752 btrfs_set_backup_extent_root_gen(root_backup,
1753 btrfs_header_generation(info->extent_root->node));
1754 btrfs_set_backup_extent_root_level(root_backup,
1755 btrfs_header_level(info->extent_root->node));
1756
1757 /*
1758 * we might commit during log recovery, which happens before we set
1759 * the fs_root. Make sure it is valid before we fill it in.
1760 */
1761 if (info->fs_root && info->fs_root->node) {
1762 btrfs_set_backup_fs_root(root_backup,
1763 info->fs_root->node->start);
1764 btrfs_set_backup_fs_root_gen(root_backup,
1765 btrfs_header_generation(info->fs_root->node));
1766 btrfs_set_backup_fs_root_level(root_backup,
1767 btrfs_header_level(info->fs_root->node));
1768 }
1769
1770 btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1771 btrfs_set_backup_dev_root_gen(root_backup,
1772 btrfs_header_generation(info->dev_root->node));
1773 btrfs_set_backup_dev_root_level(root_backup,
1774 btrfs_header_level(info->dev_root->node));
1775
1776 btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
1777 btrfs_set_backup_csum_root_gen(root_backup,
1778 btrfs_header_generation(info->csum_root->node));
1779 btrfs_set_backup_csum_root_level(root_backup,
1780 btrfs_header_level(info->csum_root->node));
1781
1782 btrfs_set_backup_total_bytes(root_backup,
1783 btrfs_super_total_bytes(info->super_copy));
1784 btrfs_set_backup_bytes_used(root_backup,
1785 btrfs_super_bytes_used(info->super_copy));
1786 btrfs_set_backup_num_devices(root_backup,
1787 btrfs_super_num_devices(info->super_copy));
1788
1789 /*
1790 * if we don't copy this out to the super_copy, it won't get remembered
1791 * for the next commit
1792 */
1793 memcpy(&info->super_copy->super_roots,
1794 &info->super_for_commit->super_roots,
1795 sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1796}
1797
1798/*
1799 * this copies info out of the root backup array and back into
1800 * the in-memory super block. It is meant to help iterate through
1801 * the array, so you send it the number of backups you've already
1802 * tried and the last backup index you used.
1803 *
1804 * this returns -1 when it has tried all the backups
1805 */
1806static noinline int next_root_backup(struct btrfs_fs_info *info,
1807 struct btrfs_super_block *super,
1808 int *num_backups_tried, int *backup_index)
1809{
1810 struct btrfs_root_backup *root_backup;
1811 int newest = *backup_index;
1812
1813 if (*num_backups_tried == 0) {
1814 u64 gen = btrfs_super_generation(super);
1815
1816 newest = find_newest_super_backup(info, gen);
1817 if (newest == -1)
1818 return -1;
1819
1820 *backup_index = newest;
1821 *num_backups_tried = 1;
1822 } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
1823 /* we've tried all the backups, all done */
1824 return -1;
1825 } else {
1826 /* jump to the next oldest backup */
1827 newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
1828 BTRFS_NUM_BACKUP_ROOTS;
1829 *backup_index = newest;
1830 *num_backups_tried += 1;
1831 }
1832 root_backup = super->super_roots + newest;
1833
1834 btrfs_set_super_generation(super,
1835 btrfs_backup_tree_root_gen(root_backup));
1836 btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1837 btrfs_set_super_root_level(super,
1838 btrfs_backup_tree_root_level(root_backup));
1839 btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1840
1841 /*
1842 * fixme: the total bytes and num_devices need to match or we should
1843 * need a fsck
1844 */
1845 btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1846 btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1847 return 0;
1848}
1849
1850/* helper to cleanup tree roots */
1851static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1852{
1853 free_extent_buffer(info->tree_root->node);
1854 free_extent_buffer(info->tree_root->commit_root);
1855 free_extent_buffer(info->dev_root->node);
1856 free_extent_buffer(info->dev_root->commit_root);
1857 free_extent_buffer(info->extent_root->node);
1858 free_extent_buffer(info->extent_root->commit_root);
1859 free_extent_buffer(info->csum_root->node);
1860 free_extent_buffer(info->csum_root->commit_root);
1861
1862 info->tree_root->node = NULL;
1863 info->tree_root->commit_root = NULL;
1864 info->dev_root->node = NULL;
1865 info->dev_root->commit_root = NULL;
1866 info->extent_root->node = NULL;
1867 info->extent_root->commit_root = NULL;
1868 info->csum_root->node = NULL;
1869 info->csum_root->commit_root = NULL;
1870
1871 if (chunk_root) {
1872 free_extent_buffer(info->chunk_root->node);
1873 free_extent_buffer(info->chunk_root->commit_root);
1874 info->chunk_root->node = NULL;
1875 info->chunk_root->commit_root = NULL;
1876 }
1877}
1878
1879
1580struct btrfs_root *open_ctree(struct super_block *sb, 1880struct btrfs_root *open_ctree(struct super_block *sb,
1581 struct btrfs_fs_devices *fs_devices, 1881 struct btrfs_fs_devices *fs_devices,
1582 char *options) 1882 char *options)
@@ -1604,6 +1904,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1604 1904
1605 int ret; 1905 int ret;
1606 int err = -EINVAL; 1906 int err = -EINVAL;
1907 int num_backups_tried = 0;
1908 int backup_index = 0;
1607 1909
1608 struct btrfs_super_block *disk_super; 1910 struct btrfs_super_block *disk_super;
1609 1911
@@ -1648,6 +1950,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1648 spin_lock_init(&fs_info->fs_roots_radix_lock); 1950 spin_lock_init(&fs_info->fs_roots_radix_lock);
1649 spin_lock_init(&fs_info->delayed_iput_lock); 1951 spin_lock_init(&fs_info->delayed_iput_lock);
1650 spin_lock_init(&fs_info->defrag_inodes_lock); 1952 spin_lock_init(&fs_info->defrag_inodes_lock);
1953 spin_lock_init(&fs_info->free_chunk_lock);
1651 mutex_init(&fs_info->reloc_mutex); 1954 mutex_init(&fs_info->reloc_mutex);
1652 1955
1653 init_completion(&fs_info->kobj_unregister); 1956 init_completion(&fs_info->kobj_unregister);
@@ -1665,8 +1968,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1665 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 1968 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1666 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 1969 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1667 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 1970 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1668 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); 1971 btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
1669 mutex_init(&fs_info->durable_block_rsv_mutex);
1670 atomic_set(&fs_info->nr_async_submits, 0); 1972 atomic_set(&fs_info->nr_async_submits, 0);
1671 atomic_set(&fs_info->async_delalloc_pages, 0); 1973 atomic_set(&fs_info->async_delalloc_pages, 0);
1672 atomic_set(&fs_info->async_submit_draining, 0); 1974 atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1979,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1677 fs_info->metadata_ratio = 0; 1979 fs_info->metadata_ratio = 0;
1678 fs_info->defrag_inodes = RB_ROOT; 1980 fs_info->defrag_inodes = RB_ROOT;
1679 fs_info->trans_no_join = 0; 1981 fs_info->trans_no_join = 0;
1982 fs_info->free_chunk_space = 0;
1983
1984 /* readahead state */
1985 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1986 spin_lock_init(&fs_info->reada_lock);
1680 1987
1681 fs_info->thread_pool_size = min_t(unsigned long, 1988 fs_info->thread_pool_size = min_t(unsigned long,
1682 num_online_cpus() + 2, 8); 1989 num_online_cpus() + 2, 8);
@@ -1766,14 +2073,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1766 goto fail_alloc; 2073 goto fail_alloc;
1767 } 2074 }
1768 2075
1769 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 2076 memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
1770 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 2077 memcpy(fs_info->super_for_commit, fs_info->super_copy,
1771 sizeof(fs_info->super_for_commit)); 2078 sizeof(*fs_info->super_for_commit));
1772 brelse(bh); 2079 brelse(bh);
1773 2080
1774 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); 2081 memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
1775 2082
1776 disk_super = &fs_info->super_copy; 2083 disk_super = fs_info->super_copy;
1777 if (!btrfs_super_root(disk_super)) 2084 if (!btrfs_super_root(disk_super))
1778 goto fail_alloc; 2085 goto fail_alloc;
1779 2086
@@ -1783,6 +2090,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1783 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2090 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1784 2091
1785 /* 2092 /*
2093 * run through our array of backup supers and setup
2094 * our ring pointer to the oldest one
2095 */
2096 generation = btrfs_super_generation(disk_super);
2097 find_oldest_super_backup(fs_info, generation);
2098
2099 /*
1786 * In the long term, we'll store the compression type in the super 2100 * In the long term, we'll store the compression type in the super
1787 * block, and it'll be used for per file compression control. 2101 * block, and it'll be used for per file compression control.
1788 */ 2102 */
@@ -1870,6 +2184,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1870 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", 2184 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1871 fs_info->thread_pool_size, 2185 fs_info->thread_pool_size,
1872 &fs_info->generic_worker); 2186 &fs_info->generic_worker);
2187 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2188 fs_info->thread_pool_size,
2189 &fs_info->generic_worker);
1873 2190
1874 /* 2191 /*
1875 * endios are largely parallel and should have a very 2192 * endios are largely parallel and should have a very
@@ -1880,6 +2197,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1880 2197
1881 fs_info->endio_write_workers.idle_thresh = 2; 2198 fs_info->endio_write_workers.idle_thresh = 2;
1882 fs_info->endio_meta_write_workers.idle_thresh = 2; 2199 fs_info->endio_meta_write_workers.idle_thresh = 2;
2200 fs_info->readahead_workers.idle_thresh = 2;
1883 2201
1884 btrfs_start_workers(&fs_info->workers, 1); 2202 btrfs_start_workers(&fs_info->workers, 1);
1885 btrfs_start_workers(&fs_info->generic_worker, 1); 2203 btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1893,6 +2211,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 2211 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1894 btrfs_start_workers(&fs_info->delayed_workers, 1); 2212 btrfs_start_workers(&fs_info->delayed_workers, 1);
1895 btrfs_start_workers(&fs_info->caching_workers, 1); 2213 btrfs_start_workers(&fs_info->caching_workers, 1);
2214 btrfs_start_workers(&fs_info->readahead_workers, 1);
1896 2215
1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 2216 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 2217 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1939,7 +2258,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1939 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2258 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1940 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", 2259 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1941 sb->s_id); 2260 sb->s_id);
1942 goto fail_chunk_root; 2261 goto fail_tree_roots;
1943 } 2262 }
1944 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 2263 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1945 chunk_root->commit_root = btrfs_root_node(chunk_root); 2264 chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1954,11 +2273,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1954 if (ret) { 2273 if (ret) {
1955 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2274 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1956 sb->s_id); 2275 sb->s_id);
1957 goto fail_chunk_root; 2276 goto fail_tree_roots;
1958 } 2277 }
1959 2278
1960 btrfs_close_extra_devices(fs_devices); 2279 btrfs_close_extra_devices(fs_devices);
1961 2280
2281retry_root_backup:
1962 blocksize = btrfs_level_size(tree_root, 2282 blocksize = btrfs_level_size(tree_root,
1963 btrfs_super_root_level(disk_super)); 2283 btrfs_super_root_level(disk_super));
1964 generation = btrfs_super_generation(disk_super); 2284 generation = btrfs_super_generation(disk_super);
@@ -1966,32 +2286,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1966 tree_root->node = read_tree_block(tree_root, 2286 tree_root->node = read_tree_block(tree_root,
1967 btrfs_super_root(disk_super), 2287 btrfs_super_root(disk_super),
1968 blocksize, generation); 2288 blocksize, generation);
1969 if (!tree_root->node) 2289 if (!tree_root->node ||
1970 goto fail_chunk_root; 2290 !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1971 if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1972 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", 2291 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1973 sb->s_id); 2292 sb->s_id);
1974 goto fail_tree_root; 2293
2294 goto recovery_tree_root;
1975 } 2295 }
2296
1976 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2297 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1977 tree_root->commit_root = btrfs_root_node(tree_root); 2298 tree_root->commit_root = btrfs_root_node(tree_root);
1978 2299
1979 ret = find_and_setup_root(tree_root, fs_info, 2300 ret = find_and_setup_root(tree_root, fs_info,
1980 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2301 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1981 if (ret) 2302 if (ret)
1982 goto fail_tree_root; 2303 goto recovery_tree_root;
1983 extent_root->track_dirty = 1; 2304 extent_root->track_dirty = 1;
1984 2305
1985 ret = find_and_setup_root(tree_root, fs_info, 2306 ret = find_and_setup_root(tree_root, fs_info,
1986 BTRFS_DEV_TREE_OBJECTID, dev_root); 2307 BTRFS_DEV_TREE_OBJECTID, dev_root);
1987 if (ret) 2308 if (ret)
1988 goto fail_extent_root; 2309 goto recovery_tree_root;
1989 dev_root->track_dirty = 1; 2310 dev_root->track_dirty = 1;
1990 2311
1991 ret = find_and_setup_root(tree_root, fs_info, 2312 ret = find_and_setup_root(tree_root, fs_info,
1992 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2313 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1993 if (ret) 2314 if (ret)
1994 goto fail_dev_root; 2315 goto recovery_tree_root;
1995 2316
1996 csum_root->track_dirty = 1; 2317 csum_root->track_dirty = 1;
1997 2318
@@ -2124,22 +2445,13 @@ fail_cleaner:
2124 2445
2125fail_block_groups: 2446fail_block_groups:
2126 btrfs_free_block_groups(fs_info); 2447 btrfs_free_block_groups(fs_info);
2127 free_extent_buffer(csum_root->node); 2448
2128 free_extent_buffer(csum_root->commit_root); 2449fail_tree_roots:
2129fail_dev_root: 2450 free_root_pointers(fs_info, 1);
2130 free_extent_buffer(dev_root->node); 2451
2131 free_extent_buffer(dev_root->commit_root);
2132fail_extent_root:
2133 free_extent_buffer(extent_root->node);
2134 free_extent_buffer(extent_root->commit_root);
2135fail_tree_root:
2136 free_extent_buffer(tree_root->node);
2137 free_extent_buffer(tree_root->commit_root);
2138fail_chunk_root:
2139 free_extent_buffer(chunk_root->node);
2140 free_extent_buffer(chunk_root->commit_root);
2141fail_sb_buffer: 2452fail_sb_buffer:
2142 btrfs_stop_workers(&fs_info->generic_worker); 2453 btrfs_stop_workers(&fs_info->generic_worker);
2454 btrfs_stop_workers(&fs_info->readahead_workers);
2143 btrfs_stop_workers(&fs_info->fixup_workers); 2455 btrfs_stop_workers(&fs_info->fixup_workers);
2144 btrfs_stop_workers(&fs_info->delalloc_workers); 2456 btrfs_stop_workers(&fs_info->delalloc_workers);
2145 btrfs_stop_workers(&fs_info->workers); 2457 btrfs_stop_workers(&fs_info->workers);
@@ -2152,7 +2464,6 @@ fail_sb_buffer:
2152 btrfs_stop_workers(&fs_info->delayed_workers); 2464 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers); 2465 btrfs_stop_workers(&fs_info->caching_workers);
2154fail_alloc: 2466fail_alloc:
2155 kfree(fs_info->delayed_root);
2156fail_iput: 2467fail_iput:
2157 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2468 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2158 iput(fs_info->btree_inode); 2469 iput(fs_info->btree_inode);
@@ -2164,13 +2475,27 @@ fail_bdi:
2164fail_srcu: 2475fail_srcu:
2165 cleanup_srcu_struct(&fs_info->subvol_srcu); 2476 cleanup_srcu_struct(&fs_info->subvol_srcu);
2166fail: 2477fail:
2167 kfree(extent_root); 2478 free_fs_info(fs_info);
2168 kfree(tree_root);
2169 kfree(fs_info);
2170 kfree(chunk_root);
2171 kfree(dev_root);
2172 kfree(csum_root);
2173 return ERR_PTR(err); 2479 return ERR_PTR(err);
2480
2481recovery_tree_root:
2482
2483 if (!btrfs_test_opt(tree_root, RECOVERY))
2484 goto fail_tree_roots;
2485
2486 free_root_pointers(fs_info, 0);
2487
2488 /* don't use the log in recovery mode, it won't be valid */
2489 btrfs_set_super_log_root(disk_super, 0);
2490
2491 /* we can't trust the free space cache either */
2492 btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2493
2494 ret = next_root_backup(fs_info, fs_info->super_copy,
2495 &num_backups_tried, &backup_index);
2496 if (ret == -1)
2497 goto fail_block_groups;
2498 goto retry_root_backup;
2174} 2499}
2175 2500
2176static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2501static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2338,10 +2663,11 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2338 int total_errors = 0; 2663 int total_errors = 0;
2339 u64 flags; 2664 u64 flags;
2340 2665
2341 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 2666 max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
2342 do_barriers = !btrfs_test_opt(root, NOBARRIER); 2667 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2668 backup_super_roots(root->fs_info);
2343 2669
2344 sb = &root->fs_info->super_for_commit; 2670 sb = root->fs_info->super_for_commit;
2345 dev_item = &sb->dev_item; 2671 dev_item = &sb->dev_item;
2346 2672
2347 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2673 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
@@ -2545,8 +2871,6 @@ int close_ctree(struct btrfs_root *root)
2545 /* clear out the rbtree of defraggable inodes */ 2871 /* clear out the rbtree of defraggable inodes */
2546 btrfs_run_defrag_inodes(root->fs_info); 2872 btrfs_run_defrag_inodes(root->fs_info);
2547 2873
2548 btrfs_put_block_group_cache(fs_info);
2549
2550 /* 2874 /*
2551 * Here come 2 situations when btrfs is broken to flip readonly: 2875 * Here come 2 situations when btrfs is broken to flip readonly:
2552 * 2876 *
@@ -2572,6 +2896,8 @@ int close_ctree(struct btrfs_root *root)
2572 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2896 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2573 } 2897 }
2574 2898
2899 btrfs_put_block_group_cache(fs_info);
2900
2575 kthread_stop(root->fs_info->transaction_kthread); 2901 kthread_stop(root->fs_info->transaction_kthread);
2576 kthread_stop(root->fs_info->cleaner_kthread); 2902 kthread_stop(root->fs_info->cleaner_kthread);
2577 2903
@@ -2603,7 +2929,6 @@ int close_ctree(struct btrfs_root *root)
2603 del_fs_roots(fs_info); 2929 del_fs_roots(fs_info);
2604 2930
2605 iput(fs_info->btree_inode); 2931 iput(fs_info->btree_inode);
2606 kfree(fs_info->delayed_root);
2607 2932
2608 btrfs_stop_workers(&fs_info->generic_worker); 2933 btrfs_stop_workers(&fs_info->generic_worker);
2609 btrfs_stop_workers(&fs_info->fixup_workers); 2934 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2617,6 +2942,7 @@ int close_ctree(struct btrfs_root *root)
2617 btrfs_stop_workers(&fs_info->submit_workers); 2942 btrfs_stop_workers(&fs_info->submit_workers);
2618 btrfs_stop_workers(&fs_info->delayed_workers); 2943 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers); 2944 btrfs_stop_workers(&fs_info->caching_workers);
2945 btrfs_stop_workers(&fs_info->readahead_workers);
2620 2946
2621 btrfs_close_devices(fs_info->fs_devices); 2947 btrfs_close_devices(fs_info->fs_devices);
2622 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2948 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2624,12 +2950,7 @@ int close_ctree(struct btrfs_root *root)
2624 bdi_destroy(&fs_info->bdi); 2950 bdi_destroy(&fs_info->bdi);
2625 cleanup_srcu_struct(&fs_info->subvol_srcu); 2951 cleanup_srcu_struct(&fs_info->subvol_srcu);
2626 2952
2627 kfree(fs_info->extent_root); 2953 free_fs_info(fs_info);
2628 kfree(fs_info->tree_root);
2629 kfree(fs_info->chunk_root);
2630 kfree(fs_info->dev_root);
2631 kfree(fs_info->csum_root);
2632 kfree(fs_info);
2633 2954
2634 return 0; 2955 return 0;
2635} 2956}
@@ -2735,7 +3056,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2735 return ret; 3056 return ret;
2736} 3057}
2737 3058
2738int btree_lock_page_hook(struct page *page) 3059static int btree_lock_page_hook(struct page *page, void *data,
3060 void (*flush_fn)(void *))
2739{ 3061{
2740 struct inode *inode = page->mapping->host; 3062 struct inode *inode = page->mapping->host;
2741 struct btrfs_root *root = BTRFS_I(inode)->root; 3063 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +3074,10 @@ int btree_lock_page_hook(struct page *page)
2752 if (!eb) 3074 if (!eb)
2753 goto out; 3075 goto out;
2754 3076
2755 btrfs_tree_lock(eb); 3077 if (!btrfs_try_tree_write_lock(eb)) {
3078 flush_fn(data);
3079 btrfs_tree_lock(eb);
3080 }
2756 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3081 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2757 3082
2758 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3083 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +3092,10 @@ int btree_lock_page_hook(struct page *page)
2767 btrfs_tree_unlock(eb); 3092 btrfs_tree_unlock(eb);
2768 free_extent_buffer(eb); 3093 free_extent_buffer(eb);
2769out: 3094out:
2770 lock_page(page); 3095 if (!trylock_page(page)) {
3096 flush_fn(data);
3097 lock_page(page);
3098 }
2771 return 0; 3099 return 0;
2772} 3100}
2773 3101
@@ -3123,6 +3451,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3123static struct extent_io_ops btree_extent_io_ops = { 3451static struct extent_io_ops btree_extent_io_ops = {
3124 .write_cache_pages_lock_hook = btree_lock_page_hook, 3452 .write_cache_pages_lock_hook = btree_lock_page_hook,
3125 .readpage_end_io_hook = btree_readpage_end_io_hook, 3453 .readpage_end_io_hook = btree_readpage_end_io_hook,
3454 .readpage_io_failed_hook = btree_io_failed_hook,
3126 .submit_bio_hook = btree_submit_bio_hook, 3455 .submit_bio_hook = btree_submit_bio_hook,
3127 /* note we're sharing with inode.c for the merge bio hook */ 3456 /* note we're sharing with inode.c for the merge bio hook */
3128 .merge_bio_hook = btrfs_merge_bio_hook, 3457 .merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..c99d0a8f13fa 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid); 40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid); 42 u64 parent_transid);
43int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
44 int mirror_num, struct extent_buffer **eb);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 45struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info); 85 struct btrfs_fs_info *fs_info);
84int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
85 struct btrfs_root *root); 87 struct btrfs_root *root);
86int btree_lock_page_hook(struct page *page);
87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_init_lockdep(void); 90void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c9ee0e18bbdc..9879bd474632 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h>
26#include "compat.h" 27#include "compat.h"
27#include "hash.h" 28#include "hash.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
52 CHUNK_ALLOC_LIMITED = 2, 53 CHUNK_ALLOC_LIMITED = 2,
53}; 54};
54 55
56/*
57 * Control how reservations are dealt with.
58 *
59 * RESERVE_FREE - freeing a reservation.
60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61 * ENOSPC accounting
62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63 * bytes_may_use as the ENOSPC accounting is done elsewhere
64 */
65enum {
66 RESERVE_FREE = 0,
67 RESERVE_ALLOC = 1,
68 RESERVE_ALLOC_NO_ACCOUNT = 2,
69};
70
55static int update_block_group(struct btrfs_trans_handle *trans, 71static int update_block_group(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root, 72 struct btrfs_root *root,
57 u64 bytenr, u64 num_bytes, int alloc); 73 u64 bytenr, u64 num_bytes, int alloc);
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
81 struct btrfs_key *key); 97 struct btrfs_key *key);
82static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
83 int dump_block_groups); 99 int dump_block_groups);
100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 u64 num_bytes, int reserve);
84 102
85static noinline int 103static noinline int
86block_group_cache_done(struct btrfs_block_group_cache *cache) 104block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104 if (atomic_dec_and_test(&cache->count)) { 122 if (atomic_dec_and_test(&cache->count)) {
105 WARN_ON(cache->pinned > 0); 123 WARN_ON(cache->pinned > 0);
106 WARN_ON(cache->reserved > 0); 124 WARN_ON(cache->reserved > 0);
107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl); 125 kfree(cache->free_space_ctl);
109 kfree(cache); 126 kfree(cache);
110 } 127 }
@@ -465,7 +482,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
465 * we likely hold important locks. 482 * we likely hold important locks.
466 */ 483 */
467 if (trans && (!trans->transaction->in_commit) && 484 if (trans && (!trans->transaction->in_commit) &&
468 (root && root != root->fs_info->tree_root)) { 485 (root && root != root->fs_info->tree_root) &&
486 btrfs_test_opt(root, SPACE_CACHE)) {
469 spin_lock(&cache->lock); 487 spin_lock(&cache->lock);
470 if (cache->cached != BTRFS_CACHE_NO) { 488 if (cache->cached != BTRFS_CACHE_NO) {
471 spin_unlock(&cache->lock); 489 spin_unlock(&cache->lock);
@@ -1770,18 +1788,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1770{ 1788{
1771 int ret; 1789 int ret;
1772 u64 discarded_bytes = 0; 1790 u64 discarded_bytes = 0;
1773 struct btrfs_multi_bio *multi = NULL; 1791 struct btrfs_bio *bbio = NULL;
1774 1792
1775 1793
1776 /* Tell the block device(s) that the sectors can be discarded */ 1794 /* Tell the block device(s) that the sectors can be discarded */
1777 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1795 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1778 bytenr, &num_bytes, &multi, 0); 1796 bytenr, &num_bytes, &bbio, 0);
1779 if (!ret) { 1797 if (!ret) {
1780 struct btrfs_bio_stripe *stripe = multi->stripes; 1798 struct btrfs_bio_stripe *stripe = bbio->stripes;
1781 int i; 1799 int i;
1782 1800
1783 1801
1784 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1802 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard) 1803 if (!stripe->dev->can_discard)
1786 continue; 1804 continue;
1787 1805
@@ -1800,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1800 */ 1818 */
1801 ret = 0; 1819 ret = 0;
1802 } 1820 }
1803 kfree(multi); 1821 kfree(bbio);
1804 } 1822 }
1805 1823
1806 if (actual_bytes) 1824 if (actual_bytes)
@@ -2700,6 +2718,13 @@ again:
2700 goto again; 2718 goto again;
2701 } 2719 }
2702 2720
2721 /* We've already setup this transaction, go ahead and exit */
2722 if (block_group->cache_generation == trans->transid &&
2723 i_size_read(inode)) {
2724 dcs = BTRFS_DC_SETUP;
2725 goto out_put;
2726 }
2727
2703 /* 2728 /*
2704 * We want to set the generation to 0, that way if anything goes wrong 2729 * We want to set the generation to 0, that way if anything goes wrong
2705 * from here on out we know not to trust this cache when we load up next 2730 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2774,15 @@ again:
2749 if (!ret) 2774 if (!ret)
2750 dcs = BTRFS_DC_SETUP; 2775 dcs = BTRFS_DC_SETUP;
2751 btrfs_free_reserved_data_space(inode, num_pages); 2776 btrfs_free_reserved_data_space(inode, num_pages);
2777
2752out_put: 2778out_put:
2753 iput(inode); 2779 iput(inode);
2754out_free: 2780out_free:
2755 btrfs_release_path(path); 2781 btrfs_release_path(path);
2756out: 2782out:
2757 spin_lock(&block_group->lock); 2783 spin_lock(&block_group->lock);
2784 if (!ret)
2785 block_group->cache_generation = trans->transid;
2758 block_group->disk_cache_state = dcs; 2786 block_group->disk_cache_state = dcs;
2759 spin_unlock(&block_group->lock); 2787 spin_unlock(&block_group->lock);
2760 2788
@@ -3122,16 +3150,13 @@ commit_trans:
3122 return -ENOSPC; 3150 return -ENOSPC;
3123 } 3151 }
3124 data_sinfo->bytes_may_use += bytes; 3152 data_sinfo->bytes_may_use += bytes;
3125 BTRFS_I(inode)->reserved_bytes += bytes;
3126 spin_unlock(&data_sinfo->lock); 3153 spin_unlock(&data_sinfo->lock);
3127 3154
3128 return 0; 3155 return 0;
3129} 3156}
3130 3157
3131/* 3158/*
3132 * called when we are clearing an delalloc extent from the 3159 * Called if we need to clear a data reservation for this inode.
3133 * inode's io_tree or there was an error for whatever reason
3134 * after calling btrfs_check_data_free_space
3135 */ 3160 */
3136void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3161void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3137{ 3162{
@@ -3144,7 +3169,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3144 data_sinfo = BTRFS_I(inode)->space_info; 3169 data_sinfo = BTRFS_I(inode)->space_info;
3145 spin_lock(&data_sinfo->lock); 3170 spin_lock(&data_sinfo->lock);
3146 data_sinfo->bytes_may_use -= bytes; 3171 data_sinfo->bytes_may_use -= bytes;
3147 BTRFS_I(inode)->reserved_bytes -= bytes;
3148 spin_unlock(&data_sinfo->lock); 3172 spin_unlock(&data_sinfo->lock);
3149} 3173}
3150 3174
@@ -3165,6 +3189,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3165 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3189 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3166 int force) 3190 int force)
3167{ 3191{
3192 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3168 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3193 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3169 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3194 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3170 u64 thresh; 3195 u64 thresh;
@@ -3173,11 +3198,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
3173 return 1; 3198 return 1;
3174 3199
3175 /* 3200 /*
3201 * We need to take into account the global rsv because for all intents
3202 * and purposes it's used space. Don't worry about locking the
3203 * global_rsv, it doesn't change except when the transaction commits.
3204 */
3205 num_allocated += global_rsv->size;
3206
3207 /*
3176 * in limited mode, we want to have some free space up to 3208 * in limited mode, we want to have some free space up to
3177 * about 1% of the FS size. 3209 * about 1% of the FS size.
3178 */ 3210 */
3179 if (force == CHUNK_ALLOC_LIMITED) { 3211 if (force == CHUNK_ALLOC_LIMITED) {
3180 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3212 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3181 thresh = max_t(u64, 64 * 1024 * 1024, 3213 thresh = max_t(u64, 64 * 1024 * 1024,
3182 div_factor_fine(thresh, 1)); 3214 div_factor_fine(thresh, 1));
3183 3215
@@ -3199,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3199 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3231 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3200 return 0; 3232 return 0;
3201 3233
3202 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3234 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3203 3235
3204 /* 256MB or 5% of the FS */ 3236 /* 256MB or 5% of the FS */
3205 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3237 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
@@ -3302,24 +3334,26 @@ out:
3302/* 3334/*
3303 * shrink metadata reservation for delalloc 3335 * shrink metadata reservation for delalloc
3304 */ 3336 */
3305static int shrink_delalloc(struct btrfs_trans_handle *trans, 3337static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3306 struct btrfs_root *root, u64 to_reclaim, int sync) 3338 bool wait_ordered)
3307{ 3339{
3308 struct btrfs_block_rsv *block_rsv; 3340 struct btrfs_block_rsv *block_rsv;
3309 struct btrfs_space_info *space_info; 3341 struct btrfs_space_info *space_info;
3342 struct btrfs_trans_handle *trans;
3310 u64 reserved; 3343 u64 reserved;
3311 u64 max_reclaim; 3344 u64 max_reclaim;
3312 u64 reclaimed = 0; 3345 u64 reclaimed = 0;
3313 long time_left; 3346 long time_left;
3314 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3347 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3315 int loops = 0; 3348 int loops = 0;
3316 unsigned long progress; 3349 unsigned long progress;
3317 3350
3351 trans = (struct btrfs_trans_handle *)current->journal_info;
3318 block_rsv = &root->fs_info->delalloc_block_rsv; 3352 block_rsv = &root->fs_info->delalloc_block_rsv;
3319 space_info = block_rsv->space_info; 3353 space_info = block_rsv->space_info;
3320 3354
3321 smp_mb(); 3355 smp_mb();
3322 reserved = space_info->bytes_reserved; 3356 reserved = space_info->bytes_may_use;
3323 progress = space_info->reservation_progress; 3357 progress = space_info->reservation_progress;
3324 3358
3325 if (reserved == 0) 3359 if (reserved == 0)
@@ -3334,7 +3368,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334 } 3368 }
3335 3369
3336 max_reclaim = min(reserved, to_reclaim); 3370 max_reclaim = min(reserved, to_reclaim);
3337 3371 nr_pages = max_t(unsigned long, nr_pages,
3372 max_reclaim >> PAGE_CACHE_SHIFT);
3338 while (loops < 1024) { 3373 while (loops < 1024) {
3339 /* have the flusher threads jump in and do some IO */ 3374 /* have the flusher threads jump in and do some IO */
3340 smp_mb(); 3375 smp_mb();
@@ -3344,9 +3379,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3344 WB_REASON_FS_FREE_SPACE); 3379 WB_REASON_FS_FREE_SPACE);
3345 3380
3346 spin_lock(&space_info->lock); 3381 spin_lock(&space_info->lock);
3347 if (reserved > space_info->bytes_reserved) 3382 if (reserved > space_info->bytes_may_use)
3348 reclaimed += reserved - space_info->bytes_reserved; 3383 reclaimed += reserved - space_info->bytes_may_use;
3349 reserved = space_info->bytes_reserved; 3384 reserved = space_info->bytes_may_use;
3350 spin_unlock(&space_info->lock); 3385 spin_unlock(&space_info->lock);
3351 3386
3352 loops++; 3387 loops++;
@@ -3357,11 +3392,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3357 if (trans && trans->transaction->blocked) 3392 if (trans && trans->transaction->blocked)
3358 return -EAGAIN; 3393 return -EAGAIN;
3359 3394
3360 time_left = schedule_timeout_interruptible(1); 3395 if (wait_ordered && !trans) {
3396 btrfs_wait_ordered_extents(root, 0, 0);
3397 } else {
3398 time_left = schedule_timeout_interruptible(1);
3361 3399
3362 /* We were interrupted, exit */ 3400 /* We were interrupted, exit */
3363 if (time_left) 3401 if (time_left)
3364 break; 3402 break;
3403 }
3365 3404
3366 /* we've kicked the IO a few times, if anything has been freed, 3405 /* we've kicked the IO a few times, if anything has been freed,
3367 * exit. There is no sense in looping here for a long time 3406 * exit. There is no sense in looping here for a long time
@@ -3376,34 +3415,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3376 } 3415 }
3377 3416
3378 } 3417 }
3379 if (reclaimed >= to_reclaim && !trans) 3418
3380 btrfs_wait_ordered_extents(root, 0, 0);
3381 return reclaimed >= to_reclaim; 3419 return reclaimed >= to_reclaim;
3382} 3420}
3383 3421
3384/* 3422/**
3385 * Retries tells us how many times we've called reserve_metadata_bytes. The 3423 * may_commit_transaction - possibly commit the transaction if it's ok to
3386 * idea is if this is the first call (retries == 0) then we will add to our 3424 * @root - the root we're allocating for
3387 * reserved count if we can't make the allocation in order to hold our place 3425 * @bytes - the number of bytes we want to reserve
3388 * while we go and try and free up space. That way for retries > 1 we don't try 3426 * @force - force the commit
3389 * and add space, we just check to see if the amount of unused space is >= the
3390 * total space, meaning that our reservation is valid.
3391 * 3427 *
3392 * However if we don't intend to retry this reservation, pass -1 as retries so 3428 * This will check to make sure that committing the transaction will actually
3393 * that it short circuits this logic. 3429 * get us somewhere and then commit the transaction if it does. Otherwise it
3430 * will return -ENOSPC.
3394 */ 3431 */
3395static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, 3432static int may_commit_transaction(struct btrfs_root *root,
3396 struct btrfs_root *root, 3433 struct btrfs_space_info *space_info,
3434 u64 bytes, int force)
3435{
3436 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3437 struct btrfs_trans_handle *trans;
3438
3439 trans = (struct btrfs_trans_handle *)current->journal_info;
3440 if (trans)
3441 return -EAGAIN;
3442
3443 if (force)
3444 goto commit;
3445
3446 /* See if there is enough pinned space to make this reservation */
3447 spin_lock(&space_info->lock);
3448 if (space_info->bytes_pinned >= bytes) {
3449 spin_unlock(&space_info->lock);
3450 goto commit;
3451 }
3452 spin_unlock(&space_info->lock);
3453
3454 /*
3455 * See if there is some space in the delayed insertion reservation for
3456 * this reservation.
3457 */
3458 if (space_info != delayed_rsv->space_info)
3459 return -ENOSPC;
3460
3461 spin_lock(&delayed_rsv->lock);
3462 if (delayed_rsv->size < bytes) {
3463 spin_unlock(&delayed_rsv->lock);
3464 return -ENOSPC;
3465 }
3466 spin_unlock(&delayed_rsv->lock);
3467
3468commit:
3469 trans = btrfs_join_transaction(root);
3470 if (IS_ERR(trans))
3471 return -ENOSPC;
3472
3473 return btrfs_commit_transaction(trans, root);
3474}
3475
3476/**
3477 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3478 * @root - the root we're allocating for
3479 * @block_rsv - the block_rsv we're allocating for
3480 * @orig_bytes - the number of bytes we want
3481 * @flush - whether or not we can flush to make our reservation
3482 *
3483 * This will reserve orig_bytes number of bytes from the space info associated
3484 * with the block_rsv. If there is not enough space it will make an attempt to
3485 * flush out space to make room. It will do this by flushing delalloc if
3486 * possible or committing the transaction. If flush is 0 then no attempts to
3487 * regain reservations will be made and this will fail if there is not enough
3488 * space already.
3489 */
3490static int reserve_metadata_bytes(struct btrfs_root *root,
3397 struct btrfs_block_rsv *block_rsv, 3491 struct btrfs_block_rsv *block_rsv,
3398 u64 orig_bytes, int flush) 3492 u64 orig_bytes, int flush)
3399{ 3493{
3400 struct btrfs_space_info *space_info = block_rsv->space_info; 3494 struct btrfs_space_info *space_info = block_rsv->space_info;
3401 u64 unused; 3495 u64 used;
3402 u64 num_bytes = orig_bytes; 3496 u64 num_bytes = orig_bytes;
3403 int retries = 0; 3497 int retries = 0;
3404 int ret = 0; 3498 int ret = 0;
3405 bool committed = false; 3499 bool committed = false;
3406 bool flushing = false; 3500 bool flushing = false;
3501 bool wait_ordered = false;
3407 3502
3408again: 3503again:
3409 ret = 0; 3504 ret = 0;
@@ -3420,7 +3515,7 @@ again:
3420 * deadlock since we are waiting for the flusher to finish, but 3515 * deadlock since we are waiting for the flusher to finish, but
3421 * hold the current transaction open. 3516 * hold the current transaction open.
3422 */ 3517 */
3423 if (trans) 3518 if (current->journal_info)
3424 return -EAGAIN; 3519 return -EAGAIN;
3425 ret = wait_event_interruptible(space_info->wait, 3520 ret = wait_event_interruptible(space_info->wait,
3426 !space_info->flush); 3521 !space_info->flush);
@@ -3432,9 +3527,9 @@ again:
3432 } 3527 }
3433 3528
3434 ret = -ENOSPC; 3529 ret = -ENOSPC;
3435 unused = space_info->bytes_used + space_info->bytes_reserved + 3530 used = space_info->bytes_used + space_info->bytes_reserved +
3436 space_info->bytes_pinned + space_info->bytes_readonly + 3531 space_info->bytes_pinned + space_info->bytes_readonly +
3437 space_info->bytes_may_use; 3532 space_info->bytes_may_use;
3438 3533
3439 /* 3534 /*
3440 * The idea here is that we've not already over-reserved the block group 3535 * The idea here is that we've not already over-reserved the block group
@@ -3443,10 +3538,9 @@ again:
3443 * lets start flushing stuff first and then come back and try to make 3538 * lets start flushing stuff first and then come back and try to make
3444 * our reservation. 3539 * our reservation.
3445 */ 3540 */
3446 if (unused <= space_info->total_bytes) { 3541 if (used <= space_info->total_bytes) {
3447 unused = space_info->total_bytes - unused; 3542 if (used + orig_bytes <= space_info->total_bytes) {
3448 if (unused >= num_bytes) { 3543 space_info->bytes_may_use += orig_bytes;
3449 space_info->bytes_reserved += orig_bytes;
3450 ret = 0; 3544 ret = 0;
3451 } else { 3545 } else {
3452 /* 3546 /*
@@ -3462,10 +3556,64 @@ again:
3462 * amount plus the amount of bytes that we need for this 3556 * amount plus the amount of bytes that we need for this
3463 * reservation. 3557 * reservation.
3464 */ 3558 */
3465 num_bytes = unused - space_info->total_bytes + 3559 wait_ordered = true;
3560 num_bytes = used - space_info->total_bytes +
3466 (orig_bytes * (retries + 1)); 3561 (orig_bytes * (retries + 1));
3467 } 3562 }
3468 3563
3564 if (ret) {
3565 u64 profile = btrfs_get_alloc_profile(root, 0);
3566 u64 avail;
3567
3568 /*
3569 * If we have a lot of space that's pinned, don't bother doing
3570 * the overcommit dance yet and just commit the transaction.
3571 */
3572 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3573 do_div(avail, 10);
3574 if (space_info->bytes_pinned >= avail && flush && !committed) {
3575 space_info->flush = 1;
3576 flushing = true;
3577 spin_unlock(&space_info->lock);
3578 ret = may_commit_transaction(root, space_info,
3579 orig_bytes, 1);
3580 if (ret)
3581 goto out;
3582 committed = true;
3583 goto again;
3584 }
3585
3586 spin_lock(&root->fs_info->free_chunk_lock);
3587 avail = root->fs_info->free_chunk_space;
3588
3589 /*
3590 * If we have dup, raid1 or raid10 then only half of the free
3591 * space is actually useable.
3592 */
3593 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3594 BTRFS_BLOCK_GROUP_RAID1 |
3595 BTRFS_BLOCK_GROUP_RAID10))
3596 avail >>= 1;
3597
3598 /*
3599 * If we aren't flushing don't let us overcommit too much, say
3600 * 1/8th of the space. If we can flush, let it overcommit up to
3601 * 1/2 of the space.
3602 */
3603 if (flush)
3604 avail >>= 3;
3605 else
3606 avail >>= 1;
3607 spin_unlock(&root->fs_info->free_chunk_lock);
3608
3609 if (used + num_bytes < space_info->total_bytes + avail) {
3610 space_info->bytes_may_use += orig_bytes;
3611 ret = 0;
3612 } else {
3613 wait_ordered = true;
3614 }
3615 }
3616
3469 /* 3617 /*
3470 * Couldn't make our reservation, save our place so while we're trying 3618 * Couldn't make our reservation, save our place so while we're trying
3471 * to reclaim space we can actually use it instead of somebody else 3619 * to reclaim space we can actually use it instead of somebody else
@@ -3485,7 +3633,7 @@ again:
3485 * We do synchronous shrinking since we don't actually unreserve 3633 * We do synchronous shrinking since we don't actually unreserve
3486 * metadata until after the IO is completed. 3634 * metadata until after the IO is completed.
3487 */ 3635 */
3488 ret = shrink_delalloc(trans, root, num_bytes, 1); 3636 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3489 if (ret < 0) 3637 if (ret < 0)
3490 goto out; 3638 goto out;
3491 3639
@@ -3497,35 +3645,17 @@ again:
3497 * so go back around and try again. 3645 * so go back around and try again.
3498 */ 3646 */
3499 if (retries < 2) { 3647 if (retries < 2) {
3648 wait_ordered = true;
3500 retries++; 3649 retries++;
3501 goto again; 3650 goto again;
3502 } 3651 }
3503 3652
3504 /*
3505 * Not enough space to be reclaimed, don't bother committing the
3506 * transaction.
3507 */
3508 spin_lock(&space_info->lock);
3509 if (space_info->bytes_pinned < orig_bytes)
3510 ret = -ENOSPC;
3511 spin_unlock(&space_info->lock);
3512 if (ret)
3513 goto out;
3514
3515 ret = -EAGAIN;
3516 if (trans)
3517 goto out;
3518
3519 ret = -ENOSPC; 3653 ret = -ENOSPC;
3520 if (committed) 3654 if (committed)
3521 goto out; 3655 goto out;
3522 3656
3523 trans = btrfs_join_transaction(root); 3657 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3524 if (IS_ERR(trans))
3525 goto out;
3526 ret = btrfs_commit_transaction(trans, root);
3527 if (!ret) { 3658 if (!ret) {
3528 trans = NULL;
3529 committed = true; 3659 committed = true;
3530 goto again; 3660 goto again;
3531 } 3661 }
@@ -3543,10 +3673,12 @@ out:
3543static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3673static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3544 struct btrfs_root *root) 3674 struct btrfs_root *root)
3545{ 3675{
3546 struct btrfs_block_rsv *block_rsv; 3676 struct btrfs_block_rsv *block_rsv = NULL;
3547 if (root->ref_cows) 3677
3678 if (root->ref_cows || root == root->fs_info->csum_root)
3548 block_rsv = trans->block_rsv; 3679 block_rsv = trans->block_rsv;
3549 else 3680
3681 if (!block_rsv)
3550 block_rsv = root->block_rsv; 3682 block_rsv = root->block_rsv;
3551 3683
3552 if (!block_rsv) 3684 if (!block_rsv)
@@ -3617,7 +3749,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3617 } 3749 }
3618 if (num_bytes) { 3750 if (num_bytes) {
3619 spin_lock(&space_info->lock); 3751 spin_lock(&space_info->lock);
3620 space_info->bytes_reserved -= num_bytes; 3752 space_info->bytes_may_use -= num_bytes;
3621 space_info->reservation_progress++; 3753 space_info->reservation_progress++;
3622 spin_unlock(&space_info->lock); 3754 spin_unlock(&space_info->lock);
3623 } 3755 }
@@ -3641,9 +3773,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3641{ 3773{
3642 memset(rsv, 0, sizeof(*rsv)); 3774 memset(rsv, 0, sizeof(*rsv));
3643 spin_lock_init(&rsv->lock); 3775 spin_lock_init(&rsv->lock);
3644 atomic_set(&rsv->usage, 1);
3645 rsv->priority = 6;
3646 INIT_LIST_HEAD(&rsv->list);
3647} 3776}
3648 3777
3649struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 3778struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3664,38 +3793,38 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3664void btrfs_free_block_rsv(struct btrfs_root *root, 3793void btrfs_free_block_rsv(struct btrfs_root *root,
3665 struct btrfs_block_rsv *rsv) 3794 struct btrfs_block_rsv *rsv)
3666{ 3795{
3667 if (rsv && atomic_dec_and_test(&rsv->usage)) { 3796 btrfs_block_rsv_release(root, rsv, (u64)-1);
3668 btrfs_block_rsv_release(root, rsv, (u64)-1); 3797 kfree(rsv);
3669 if (!rsv->durable)
3670 kfree(rsv);
3671 }
3672} 3798}
3673 3799
3674/* 3800int btrfs_block_rsv_add(struct btrfs_root *root,
3675 * make the block_rsv struct be able to capture freed space. 3801 struct btrfs_block_rsv *block_rsv,
3676 * the captured space will re-add to the the block_rsv struct 3802 u64 num_bytes)
3677 * after transaction commit
3678 */
3679void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3680 struct btrfs_block_rsv *block_rsv)
3681{ 3803{
3682 block_rsv->durable = 1; 3804 int ret;
3683 mutex_lock(&fs_info->durable_block_rsv_mutex); 3805
3684 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); 3806 if (num_bytes == 0)
3685 mutex_unlock(&fs_info->durable_block_rsv_mutex); 3807 return 0;
3808
3809 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3810 if (!ret) {
3811 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3812 return 0;
3813 }
3814
3815 return ret;
3686} 3816}
3687 3817
3688int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3818int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3689 struct btrfs_root *root, 3819 struct btrfs_block_rsv *block_rsv,
3690 struct btrfs_block_rsv *block_rsv, 3820 u64 num_bytes)
3691 u64 num_bytes)
3692{ 3821{
3693 int ret; 3822 int ret;
3694 3823
3695 if (num_bytes == 0) 3824 if (num_bytes == 0)
3696 return 0; 3825 return 0;
3697 3826
3698 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); 3827 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
3699 if (!ret) { 3828 if (!ret) {
3700 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3829 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3701 return 0; 3830 return 0;
@@ -3704,55 +3833,52 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3704 return ret; 3833 return ret;
3705} 3834}
3706 3835
3707int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 3836int btrfs_block_rsv_check(struct btrfs_root *root,
3708 struct btrfs_root *root, 3837 struct btrfs_block_rsv *block_rsv, int min_factor)
3709 struct btrfs_block_rsv *block_rsv,
3710 u64 min_reserved, int min_factor)
3711{ 3838{
3712 u64 num_bytes = 0; 3839 u64 num_bytes = 0;
3713 int commit_trans = 0;
3714 int ret = -ENOSPC; 3840 int ret = -ENOSPC;
3715 3841
3716 if (!block_rsv) 3842 if (!block_rsv)
3717 return 0; 3843 return 0;
3718 3844
3719 spin_lock(&block_rsv->lock); 3845 spin_lock(&block_rsv->lock);
3720 if (min_factor > 0) 3846 num_bytes = div_factor(block_rsv->size, min_factor);
3721 num_bytes = div_factor(block_rsv->size, min_factor); 3847 if (block_rsv->reserved >= num_bytes)
3722 if (min_reserved > num_bytes) 3848 ret = 0;
3723 num_bytes = min_reserved; 3849 spin_unlock(&block_rsv->lock);
3724 3850
3725 if (block_rsv->reserved >= num_bytes) { 3851 return ret;
3852}
3853
3854int btrfs_block_rsv_refill(struct btrfs_root *root,
3855 struct btrfs_block_rsv *block_rsv,
3856 u64 min_reserved)
3857{
3858 u64 num_bytes = 0;
3859 int ret = -ENOSPC;
3860
3861 if (!block_rsv)
3862 return 0;
3863
3864 spin_lock(&block_rsv->lock);
3865 num_bytes = min_reserved;
3866 if (block_rsv->reserved >= num_bytes)
3726 ret = 0; 3867 ret = 0;
3727 } else { 3868 else
3728 num_bytes -= block_rsv->reserved; 3869 num_bytes -= block_rsv->reserved;
3729 if (block_rsv->durable &&
3730 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3731 commit_trans = 1;
3732 }
3733 spin_unlock(&block_rsv->lock); 3870 spin_unlock(&block_rsv->lock);
3871
3734 if (!ret) 3872 if (!ret)
3735 return 0; 3873 return 0;
3736 3874
3737 if (block_rsv->refill_used) { 3875 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3738 ret = reserve_metadata_bytes(trans, root, block_rsv, 3876 if (!ret) {
3739 num_bytes, 0); 3877 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3740 if (!ret) {
3741 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3742 return 0;
3743 }
3744 }
3745
3746 if (commit_trans) {
3747 if (trans)
3748 return -EAGAIN;
3749 trans = btrfs_join_transaction(root);
3750 BUG_ON(IS_ERR(trans));
3751 ret = btrfs_commit_transaction(trans, root);
3752 return 0; 3878 return 0;
3753 } 3879 }
3754 3880
3755 return -ENOSPC; 3881 return ret;
3756} 3882}
3757 3883
3758int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3884int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@ -3784,7 +3910,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3784 u64 num_bytes; 3910 u64 num_bytes;
3785 u64 meta_used; 3911 u64 meta_used;
3786 u64 data_used; 3912 u64 data_used;
3787 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3913 int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3788 3914
3789 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3915 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3790 spin_lock(&sinfo->lock); 3916 spin_lock(&sinfo->lock);
@@ -3828,12 +3954,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3828 if (sinfo->total_bytes > num_bytes) { 3954 if (sinfo->total_bytes > num_bytes) {
3829 num_bytes = sinfo->total_bytes - num_bytes; 3955 num_bytes = sinfo->total_bytes - num_bytes;
3830 block_rsv->reserved += num_bytes; 3956 block_rsv->reserved += num_bytes;
3831 sinfo->bytes_reserved += num_bytes; 3957 sinfo->bytes_may_use += num_bytes;
3832 } 3958 }
3833 3959
3834 if (block_rsv->reserved >= block_rsv->size) { 3960 if (block_rsv->reserved >= block_rsv->size) {
3835 num_bytes = block_rsv->reserved - block_rsv->size; 3961 num_bytes = block_rsv->reserved - block_rsv->size;
3836 sinfo->bytes_reserved -= num_bytes; 3962 sinfo->bytes_may_use -= num_bytes;
3837 sinfo->reservation_progress++; 3963 sinfo->reservation_progress++;
3838 block_rsv->reserved = block_rsv->size; 3964 block_rsv->reserved = block_rsv->size;
3839 block_rsv->full = 1; 3965 block_rsv->full = 1;
@@ -3849,16 +3975,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3849 3975
3850 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3976 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3851 fs_info->chunk_block_rsv.space_info = space_info; 3977 fs_info->chunk_block_rsv.space_info = space_info;
3852 fs_info->chunk_block_rsv.priority = 10;
3853 3978
3854 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 3979 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3855 fs_info->global_block_rsv.space_info = space_info; 3980 fs_info->global_block_rsv.space_info = space_info;
3856 fs_info->global_block_rsv.priority = 10;
3857 fs_info->global_block_rsv.refill_used = 1;
3858 fs_info->delalloc_block_rsv.space_info = space_info; 3981 fs_info->delalloc_block_rsv.space_info = space_info;
3859 fs_info->trans_block_rsv.space_info = space_info; 3982 fs_info->trans_block_rsv.space_info = space_info;
3860 fs_info->empty_block_rsv.space_info = space_info; 3983 fs_info->empty_block_rsv.space_info = space_info;
3861 fs_info->empty_block_rsv.priority = 10; 3984 fs_info->delayed_block_rsv.space_info = space_info;
3862 3985
3863 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 3986 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3864 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 3987 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3866,10 +3989,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3866 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 3989 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3867 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 3990 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3868 3991
3869 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3870
3871 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3872
3873 update_global_block_rsv(fs_info); 3992 update_global_block_rsv(fs_info);
3874} 3993}
3875 3994
@@ -3882,37 +4001,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3882 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4001 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3883 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4002 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3884 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4003 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3885} 4004 WARN_ON(fs_info->delayed_block_rsv.size > 0);
3886 4005 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
3887int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3888 struct btrfs_root *root,
3889 struct btrfs_block_rsv *rsv)
3890{
3891 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3892 u64 num_bytes;
3893 int ret;
3894
3895 /*
3896 * Truncate should be freeing data, but give us 2 items just in case it
3897 * needs to use some space. We may want to be smarter about this in the
3898 * future.
3899 */
3900 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3901
3902 /* We already have enough bytes, just return */
3903 if (rsv->reserved >= num_bytes)
3904 return 0;
3905
3906 num_bytes -= rsv->reserved;
3907
3908 /*
3909 * You should have reserved enough space before hand to do this, so this
3910 * should not fail.
3911 */
3912 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3913 BUG_ON(ret);
3914
3915 return 0;
3916} 4006}
3917 4007
3918void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4008void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3921,9 +4011,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3921 if (!trans->bytes_reserved) 4011 if (!trans->bytes_reserved)
3922 return; 4012 return;
3923 4013
3924 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); 4014 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
3925 btrfs_block_rsv_release(root, trans->block_rsv,
3926 trans->bytes_reserved);
3927 trans->bytes_reserved = 0; 4015 trans->bytes_reserved = 0;
3928} 4016}
3929 4017
@@ -3965,11 +4053,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3965 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4053 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3966} 4054}
3967 4055
4056/**
4057 * drop_outstanding_extent - drop an outstanding extent
4058 * @inode: the inode we're dropping the extent for
4059 *
4060 * This is called when we are freeing up an outstanding extent, either called
4061 * after an error or after an extent is written. This will return the number of
4062 * reserved extents that need to be freed. This must be called with
4063 * BTRFS_I(inode)->lock held.
4064 */
3968static unsigned drop_outstanding_extent(struct inode *inode) 4065static unsigned drop_outstanding_extent(struct inode *inode)
3969{ 4066{
3970 unsigned dropped_extents = 0; 4067 unsigned dropped_extents = 0;
3971 4068
3972 spin_lock(&BTRFS_I(inode)->lock);
3973 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4069 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3974 BTRFS_I(inode)->outstanding_extents--; 4070 BTRFS_I(inode)->outstanding_extents--;
3975 4071
@@ -3979,19 +4075,70 @@ static unsigned drop_outstanding_extent(struct inode *inode)
3979 */ 4075 */
3980 if (BTRFS_I(inode)->outstanding_extents >= 4076 if (BTRFS_I(inode)->outstanding_extents >=
3981 BTRFS_I(inode)->reserved_extents) 4077 BTRFS_I(inode)->reserved_extents)
3982 goto out; 4078 return 0;
3983 4079
3984 dropped_extents = BTRFS_I(inode)->reserved_extents - 4080 dropped_extents = BTRFS_I(inode)->reserved_extents -
3985 BTRFS_I(inode)->outstanding_extents; 4081 BTRFS_I(inode)->outstanding_extents;
3986 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4082 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3987out:
3988 spin_unlock(&BTRFS_I(inode)->lock);
3989 return dropped_extents; 4083 return dropped_extents;
3990} 4084}
3991 4085
3992static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 4086/**
4087 * calc_csum_metadata_size - return the amount of metadata space that must be
4088 * reserved/free'd for the given bytes.
4089 * @inode: the inode we're manipulating
4090 * @num_bytes: the number of bytes in question
4091 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4092 *
4093 * This adjusts the number of csum_bytes in the inode and then returns the
4094 * correct amount of metadata that must either be reserved or freed. We
4095 * calculate how many checksums we can fit into one leaf and then divide the
4096 * number of bytes that will need to be checksummed by this value to figure out
4097 * how many checksums will be required. If we are adding bytes then the number
4098 * may go up and we will return the number of additional bytes that must be
4099 * reserved. If it is going down we will return the number of bytes that must
4100 * be freed.
4101 *
4102 * This must be called with BTRFS_I(inode)->lock held.
4103 */
4104static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4105 int reserve)
3993{ 4106{
3994 return num_bytes >>= 3; 4107 struct btrfs_root *root = BTRFS_I(inode)->root;
4108 u64 csum_size;
4109 int num_csums_per_leaf;
4110 int num_csums;
4111 int old_csums;
4112
4113 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4114 BTRFS_I(inode)->csum_bytes == 0)
4115 return 0;
4116
4117 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4118 if (reserve)
4119 BTRFS_I(inode)->csum_bytes += num_bytes;
4120 else
4121 BTRFS_I(inode)->csum_bytes -= num_bytes;
4122 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4123 num_csums_per_leaf = (int)div64_u64(csum_size,
4124 sizeof(struct btrfs_csum_item) +
4125 sizeof(struct btrfs_disk_key));
4126 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4127 num_csums = num_csums + num_csums_per_leaf - 1;
4128 num_csums = num_csums / num_csums_per_leaf;
4129
4130 old_csums = old_csums + num_csums_per_leaf - 1;
4131 old_csums = old_csums / num_csums_per_leaf;
4132
4133 /* No change, no need to reserve more */
4134 if (old_csums == num_csums)
4135 return 0;
4136
4137 if (reserve)
4138 return btrfs_calc_trans_metadata_size(root,
4139 num_csums - old_csums);
4140
4141 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
3995} 4142}
3996 4143
3997int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4144int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
@@ -4000,9 +4147,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4000 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4147 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4001 u64 to_reserve = 0; 4148 u64 to_reserve = 0;
4002 unsigned nr_extents = 0; 4149 unsigned nr_extents = 0;
4150 int flush = 1;
4003 int ret; 4151 int ret;
4004 4152
4005 if (btrfs_transaction_in_commit(root->fs_info)) 4153 if (btrfs_is_free_space_inode(root, inode))
4154 flush = 0;
4155
4156 if (flush && btrfs_transaction_in_commit(root->fs_info))
4006 schedule_timeout(1); 4157 schedule_timeout(1);
4007 4158
4008 num_bytes = ALIGN(num_bytes, root->sectorsize); 4159 num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4018,18 +4169,29 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4018 4169
4019 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4170 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4020 } 4171 }
4172 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4021 spin_unlock(&BTRFS_I(inode)->lock); 4173 spin_unlock(&BTRFS_I(inode)->lock);
4022 4174
4023 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4175 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4024 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4025 if (ret) { 4176 if (ret) {
4177 u64 to_free = 0;
4026 unsigned dropped; 4178 unsigned dropped;
4179
4180 spin_lock(&BTRFS_I(inode)->lock);
4181 dropped = drop_outstanding_extent(inode);
4182 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4183 spin_unlock(&BTRFS_I(inode)->lock);
4184 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4185
4027 /* 4186 /*
4028 * We don't need the return value since our reservation failed, 4187 * Somebody could have come in and twiddled with the
4029 * we just need to clean up our counter. 4188 * reservation, so if we have to free more than we would have
4189 * reserved from this reservation go ahead and release those
4190 * bytes.
4030 */ 4191 */
4031 dropped = drop_outstanding_extent(inode); 4192 to_free -= to_reserve;
4032 WARN_ON(dropped > 1); 4193 if (to_free)
4194 btrfs_block_rsv_release(root, block_rsv, to_free);
4033 return ret; 4195 return ret;
4034 } 4196 }
4035 4197
@@ -4038,6 +4200,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4038 return 0; 4200 return 0;
4039} 4201}
4040 4202
4203/**
4204 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4205 * @inode: the inode to release the reservation for
4206 * @num_bytes: the number of bytes we're releasing
4207 *
4208 * This will release the metadata reservation for an inode. This can be called
4209 * once we complete IO for a given set of bytes to release their metadata
4210 * reservations.
4211 */
4041void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4212void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4042{ 4213{
4043 struct btrfs_root *root = BTRFS_I(inode)->root; 4214 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4045,9 +4216,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4045 unsigned dropped; 4216 unsigned dropped;
4046 4217
4047 num_bytes = ALIGN(num_bytes, root->sectorsize); 4218 num_bytes = ALIGN(num_bytes, root->sectorsize);
4219 spin_lock(&BTRFS_I(inode)->lock);
4048 dropped = drop_outstanding_extent(inode); 4220 dropped = drop_outstanding_extent(inode);
4049 4221
4050 to_free = calc_csum_metadata_size(inode, num_bytes); 4222 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4223 spin_unlock(&BTRFS_I(inode)->lock);
4051 if (dropped > 0) 4224 if (dropped > 0)
4052 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4225 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4053 4226
@@ -4055,6 +4228,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4055 to_free); 4228 to_free);
4056} 4229}
4057 4230
4231/**
4232 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4233 * @inode: inode we're writing to
4234 * @num_bytes: the number of bytes we want to allocate
4235 *
4236 * This will do the following things
4237 *
4238 * o reserve space in the data space info for num_bytes
4239 * o reserve space in the metadata space info based on number of outstanding
4240 * extents and how many csums will be needed
4241 * o add to the inode's ->delalloc_bytes
4242 * o add it to the fs_info's delalloc inodes list.
4243 *
4244 * This will return 0 for success and -ENOSPC if there is no space left.
4245 */
4058int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4246int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4059{ 4247{
4060 int ret; 4248 int ret;
@@ -4072,6 +4260,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4072 return 0; 4260 return 0;
4073} 4261}
4074 4262
4263/**
4264 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4265 * @inode: inode we're releasing space for
4266 * @num_bytes: the number of bytes we want to free up
4267 *
4268 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
4269 * called in the case that we don't need the metadata AND data reservations
4270 * anymore, i.e. if there is an error or we insert an inline extent.
4271 *
4272 * This function will release the metadata space that was not used and will
4273 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4274 * list if there are no delalloc bytes left.
4275 */
4075void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4276void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4076{ 4277{
4077 btrfs_delalloc_release_metadata(inode, num_bytes); 4278 btrfs_delalloc_release_metadata(inode, num_bytes);
@@ -4091,12 +4292,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4091 4292
4092 /* block accounting for super block */ 4293 /* block accounting for super block */
4093 spin_lock(&info->delalloc_lock); 4294 spin_lock(&info->delalloc_lock);
4094 old_val = btrfs_super_bytes_used(&info->super_copy); 4295 old_val = btrfs_super_bytes_used(info->super_copy);
4095 if (alloc) 4296 if (alloc)
4096 old_val += num_bytes; 4297 old_val += num_bytes;
4097 else 4298 else
4098 old_val -= num_bytes; 4299 old_val -= num_bytes;
4099 btrfs_set_super_bytes_used(&info->super_copy, old_val); 4300 btrfs_set_super_bytes_used(info->super_copy, old_val);
4100 spin_unlock(&info->delalloc_lock); 4301 spin_unlock(&info->delalloc_lock);
4101 4302
4102 while (total) { 4303 while (total) {
@@ -4124,7 +4325,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4124 spin_lock(&cache->space_info->lock); 4325 spin_lock(&cache->space_info->lock);
4125 spin_lock(&cache->lock); 4326 spin_lock(&cache->lock);
4126 4327
4127 if (btrfs_super_cache_generation(&info->super_copy) != 0 && 4328 if (btrfs_test_opt(root, SPACE_CACHE) &&
4128 cache->disk_cache_state < BTRFS_DC_CLEAR) 4329 cache->disk_cache_state < BTRFS_DC_CLEAR)
4129 cache->disk_cache_state = BTRFS_DC_CLEAR; 4330 cache->disk_cache_state = BTRFS_DC_CLEAR;
4130 4331
@@ -4136,7 +4337,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4136 btrfs_set_block_group_used(&cache->item, old_val); 4337 btrfs_set_block_group_used(&cache->item, old_val);
4137 cache->reserved -= num_bytes; 4338 cache->reserved -= num_bytes;
4138 cache->space_info->bytes_reserved -= num_bytes; 4339 cache->space_info->bytes_reserved -= num_bytes;
4139 cache->space_info->reservation_progress++;
4140 cache->space_info->bytes_used += num_bytes; 4340 cache->space_info->bytes_used += num_bytes;
4141 cache->space_info->disk_used += num_bytes * factor; 4341 cache->space_info->disk_used += num_bytes * factor;
4142 spin_unlock(&cache->lock); 4342 spin_unlock(&cache->lock);
@@ -4188,7 +4388,6 @@ static int pin_down_extent(struct btrfs_root *root,
4188 if (reserved) { 4388 if (reserved) {
4189 cache->reserved -= num_bytes; 4389 cache->reserved -= num_bytes;
4190 cache->space_info->bytes_reserved -= num_bytes; 4390 cache->space_info->bytes_reserved -= num_bytes;
4191 cache->space_info->reservation_progress++;
4192 } 4391 }
4193 spin_unlock(&cache->lock); 4392 spin_unlock(&cache->lock);
4194 spin_unlock(&cache->space_info->lock); 4393 spin_unlock(&cache->space_info->lock);
@@ -4216,45 +4415,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
4216} 4415}
4217 4416
4218/* 4417/*
4219 * update size of reserved extents. this function may return -EAGAIN 4418 * this function must be called within transaction
4220 * if 'reserve' is true or 'sinfo' is false. 4419 */
4420int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4421 struct btrfs_root *root,
4422 u64 bytenr, u64 num_bytes)
4423{
4424 struct btrfs_block_group_cache *cache;
4425
4426 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4427 BUG_ON(!cache);
4428
4429 /*
4430 * pull in the free space cache (if any) so that our pin
4431 * removes the free space from the cache. We have load_only set
4432 * to one because the slow code to read in the free extents does check
4433 * the pinned extents.
4434 */
4435 cache_block_group(cache, trans, root, 1);
4436
4437 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4438
4439 /* remove us from the free space cache (if we're there at all) */
4440 btrfs_remove_free_space(cache, bytenr, num_bytes);
4441 btrfs_put_block_group(cache);
4442 return 0;
4443}
4444
4445/**
4446 * btrfs_update_reserved_bytes - update the block_group and space info counters
4447 * @cache: The cache we are manipulating
4448 * @num_bytes: The number of bytes in question
4449 * @reserve: One of the reservation enums
4450 *
4451 * This is called by the allocator when it reserves space, or by somebody who is
4452 * freeing space that was never actually used on disk. For example if you
4453 * reserve some space for a new leaf in transaction A and before transaction A
4454 * commits you free that leaf, you call this with reserve set to 0 in order to
4455 * clear the reservation.
4456 *
4457 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4458 * ENOSPC accounting. For data we handle the reservation through clearing the
4459 * delalloc bits in the io_tree. We have to do this since we could end up
4460 * allocating less disk space for the amount of data we have reserved in the
4461 * case of compression.
4462 *
4463 * If this is a reservation and the block group has become read only we cannot
4464 * make the reservation and return -EAGAIN, otherwise this function always
4465 * succeeds.
4221 */ 4466 */
4222int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4467static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4223 u64 num_bytes, int reserve, int sinfo) 4468 u64 num_bytes, int reserve)
4224{ 4469{
4470 struct btrfs_space_info *space_info = cache->space_info;
4225 int ret = 0; 4471 int ret = 0;
4226 if (sinfo) { 4472 spin_lock(&space_info->lock);
4227 struct btrfs_space_info *space_info = cache->space_info; 4473 spin_lock(&cache->lock);
4228 spin_lock(&space_info->lock); 4474 if (reserve != RESERVE_FREE) {
4229 spin_lock(&cache->lock);
4230 if (reserve) {
4231 if (cache->ro) {
4232 ret = -EAGAIN;
4233 } else {
4234 cache->reserved += num_bytes;
4235 space_info->bytes_reserved += num_bytes;
4236 }
4237 } else {
4238 if (cache->ro)
4239 space_info->bytes_readonly += num_bytes;
4240 cache->reserved -= num_bytes;
4241 space_info->bytes_reserved -= num_bytes;
4242 space_info->reservation_progress++;
4243 }
4244 spin_unlock(&cache->lock);
4245 spin_unlock(&space_info->lock);
4246 } else {
4247 spin_lock(&cache->lock);
4248 if (cache->ro) { 4475 if (cache->ro) {
4249 ret = -EAGAIN; 4476 ret = -EAGAIN;
4250 } else { 4477 } else {
4251 if (reserve) 4478 cache->reserved += num_bytes;
4252 cache->reserved += num_bytes; 4479 space_info->bytes_reserved += num_bytes;
4253 else 4480 if (reserve == RESERVE_ALLOC) {
4254 cache->reserved -= num_bytes; 4481 BUG_ON(space_info->bytes_may_use < num_bytes);
4482 space_info->bytes_may_use -= num_bytes;
4483 }
4255 } 4484 }
4256 spin_unlock(&cache->lock); 4485 } else {
4486 if (cache->ro)
4487 space_info->bytes_readonly += num_bytes;
4488 cache->reserved -= num_bytes;
4489 space_info->bytes_reserved -= num_bytes;
4490 space_info->reservation_progress++;
4257 } 4491 }
4492 spin_unlock(&cache->lock);
4493 spin_unlock(&space_info->lock);
4258 return ret; 4494 return ret;
4259} 4495}
4260 4496
@@ -4320,13 +4556,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4320 spin_lock(&cache->lock); 4556 spin_lock(&cache->lock);
4321 cache->pinned -= len; 4557 cache->pinned -= len;
4322 cache->space_info->bytes_pinned -= len; 4558 cache->space_info->bytes_pinned -= len;
4323 if (cache->ro) { 4559 if (cache->ro)
4324 cache->space_info->bytes_readonly += len; 4560 cache->space_info->bytes_readonly += len;
4325 } else if (cache->reserved_pinned > 0) {
4326 len = min(len, cache->reserved_pinned);
4327 cache->reserved_pinned -= len;
4328 cache->space_info->bytes_reserved += len;
4329 }
4330 spin_unlock(&cache->lock); 4561 spin_unlock(&cache->lock);
4331 spin_unlock(&cache->space_info->lock); 4562 spin_unlock(&cache->space_info->lock);
4332 } 4563 }
@@ -4341,11 +4572,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4341{ 4572{
4342 struct btrfs_fs_info *fs_info = root->fs_info; 4573 struct btrfs_fs_info *fs_info = root->fs_info;
4343 struct extent_io_tree *unpin; 4574 struct extent_io_tree *unpin;
4344 struct btrfs_block_rsv *block_rsv;
4345 struct btrfs_block_rsv *next_rsv;
4346 u64 start; 4575 u64 start;
4347 u64 end; 4576 u64 end;
4348 int idx;
4349 int ret; 4577 int ret;
4350 4578
4351 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4579 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4368,30 +4596,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4368 cond_resched(); 4596 cond_resched();
4369 } 4597 }
4370 4598
4371 mutex_lock(&fs_info->durable_block_rsv_mutex);
4372 list_for_each_entry_safe(block_rsv, next_rsv,
4373 &fs_info->durable_block_rsv_list, list) {
4374
4375 idx = trans->transid & 0x1;
4376 if (block_rsv->freed[idx] > 0) {
4377 block_rsv_add_bytes(block_rsv,
4378 block_rsv->freed[idx], 0);
4379 block_rsv->freed[idx] = 0;
4380 }
4381 if (atomic_read(&block_rsv->usage) == 0) {
4382 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4383
4384 if (block_rsv->freed[0] == 0 &&
4385 block_rsv->freed[1] == 0) {
4386 list_del_init(&block_rsv->list);
4387 kfree(block_rsv);
4388 }
4389 } else {
4390 btrfs_block_rsv_release(root, block_rsv, 0);
4391 }
4392 }
4393 mutex_unlock(&fs_info->durable_block_rsv_mutex);
4394
4395 return 0; 4599 return 0;
4396} 4600}
4397 4601
@@ -4669,7 +4873,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4669 struct extent_buffer *buf, 4873 struct extent_buffer *buf,
4670 u64 parent, int last_ref) 4874 u64 parent, int last_ref)
4671{ 4875{
4672 struct btrfs_block_rsv *block_rsv;
4673 struct btrfs_block_group_cache *cache = NULL; 4876 struct btrfs_block_group_cache *cache = NULL;
4674 int ret; 4877 int ret;
4675 4878
@@ -4684,64 +4887,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4684 if (!last_ref) 4887 if (!last_ref)
4685 return; 4888 return;
4686 4889
4687 block_rsv = get_block_rsv(trans, root);
4688 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4890 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4689 if (block_rsv->space_info != cache->space_info)
4690 goto out;
4691 4891
4692 if (btrfs_header_generation(buf) == trans->transid) { 4892 if (btrfs_header_generation(buf) == trans->transid) {
4693 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4893 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4694 ret = check_ref_cleanup(trans, root, buf->start); 4894 ret = check_ref_cleanup(trans, root, buf->start);
4695 if (!ret) 4895 if (!ret)
4696 goto pin; 4896 goto out;
4697 } 4897 }
4698 4898
4699 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4899 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4700 pin_down_extent(root, cache, buf->start, buf->len, 1); 4900 pin_down_extent(root, cache, buf->start, buf->len, 1);
4701 goto pin; 4901 goto out;
4702 } 4902 }
4703 4903
4704 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4904 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4705 4905
4706 btrfs_add_free_space(cache, buf->start, buf->len); 4906 btrfs_add_free_space(cache, buf->start, buf->len);
4707 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4907 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4708 if (ret == -EAGAIN) {
4709 /* block group became read-only */
4710 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4711 goto out;
4712 }
4713
4714 ret = 1;
4715 spin_lock(&block_rsv->lock);
4716 if (block_rsv->reserved < block_rsv->size) {
4717 block_rsv->reserved += buf->len;
4718 ret = 0;
4719 }
4720 spin_unlock(&block_rsv->lock);
4721
4722 if (ret) {
4723 spin_lock(&cache->space_info->lock);
4724 cache->space_info->bytes_reserved -= buf->len;
4725 cache->space_info->reservation_progress++;
4726 spin_unlock(&cache->space_info->lock);
4727 }
4728 goto out;
4729 }
4730pin:
4731 if (block_rsv->durable && !cache->ro) {
4732 ret = 0;
4733 spin_lock(&cache->lock);
4734 if (!cache->ro) {
4735 cache->reserved_pinned += buf->len;
4736 ret = 1;
4737 }
4738 spin_unlock(&cache->lock);
4739
4740 if (ret) {
4741 spin_lock(&block_rsv->lock);
4742 block_rsv->freed[trans->transid & 0x1] += buf->len;
4743 spin_unlock(&block_rsv->lock);
4744 }
4745 } 4908 }
4746out: 4909out:
4747 /* 4910 /*
@@ -4884,10 +5047,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4884 int last_ptr_loop = 0; 5047 int last_ptr_loop = 0;
4885 int loop = 0; 5048 int loop = 0;
4886 int index = 0; 5049 int index = 0;
5050 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5051 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
4887 bool found_uncached_bg = false; 5052 bool found_uncached_bg = false;
4888 bool failed_cluster_refill = false; 5053 bool failed_cluster_refill = false;
4889 bool failed_alloc = false; 5054 bool failed_alloc = false;
4890 bool use_cluster = true; 5055 bool use_cluster = true;
5056 bool have_caching_bg = false;
4891 u64 ideal_cache_percent = 0; 5057 u64 ideal_cache_percent = 0;
4892 u64 ideal_cache_offset = 0; 5058 u64 ideal_cache_offset = 0;
4893 5059
@@ -4970,6 +5136,7 @@ ideal_cache:
4970 } 5136 }
4971 } 5137 }
4972search: 5138search:
5139 have_caching_bg = false;
4973 down_read(&space_info->groups_sem); 5140 down_read(&space_info->groups_sem);
4974 list_for_each_entry(block_group, &space_info->block_groups[index], 5141 list_for_each_entry(block_group, &space_info->block_groups[index],
4975 list) { 5142 list) {
@@ -5178,6 +5345,8 @@ refill_cluster:
5178 failed_alloc = true; 5345 failed_alloc = true;
5179 goto have_block_group; 5346 goto have_block_group;
5180 } else if (!offset) { 5347 } else if (!offset) {
5348 if (!cached)
5349 have_caching_bg = true;
5181 goto loop; 5350 goto loop;
5182 } 5351 }
5183checks: 5352checks:
@@ -5203,8 +5372,8 @@ checks:
5203 search_start - offset); 5372 search_start - offset);
5204 BUG_ON(offset > search_start); 5373 BUG_ON(offset > search_start);
5205 5374
5206 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5375 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
5207 (data & BTRFS_BLOCK_GROUP_DATA)); 5376 alloc_type);
5208 if (ret == -EAGAIN) { 5377 if (ret == -EAGAIN) {
5209 btrfs_add_free_space(block_group, offset, num_bytes); 5378 btrfs_add_free_space(block_group, offset, num_bytes);
5210 goto loop; 5379 goto loop;
@@ -5228,6 +5397,9 @@ loop:
5228 } 5397 }
5229 up_read(&space_info->groups_sem); 5398 up_read(&space_info->groups_sem);
5230 5399
5400 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5401 goto search;
5402
5231 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 5403 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5232 goto search; 5404 goto search;
5233 5405
@@ -5326,7 +5498,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5326 int index = 0; 5498 int index = 0;
5327 5499
5328 spin_lock(&info->lock); 5500 spin_lock(&info->lock);
5329 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5501 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5502 (unsigned long long)info->flags,
5330 (unsigned long long)(info->total_bytes - info->bytes_used - 5503 (unsigned long long)(info->total_bytes - info->bytes_used -
5331 info->bytes_pinned - info->bytes_reserved - 5504 info->bytes_pinned - info->bytes_reserved -
5332 info->bytes_readonly), 5505 info->bytes_readonly),
@@ -5412,7 +5585,8 @@ again:
5412 return ret; 5585 return ret;
5413} 5586}
5414 5587
5415int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 5588static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5589 u64 start, u64 len, int pin)
5416{ 5590{
5417 struct btrfs_block_group_cache *cache; 5591 struct btrfs_block_group_cache *cache;
5418 int ret = 0; 5592 int ret = 0;
@@ -5427,8 +5601,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5427 if (btrfs_test_opt(root, DISCARD)) 5601 if (btrfs_test_opt(root, DISCARD))
5428 ret = btrfs_discard_extent(root, start, len, NULL); 5602 ret = btrfs_discard_extent(root, start, len, NULL);
5429 5603
5430 btrfs_add_free_space(cache, start, len); 5604 if (pin)
5431 btrfs_update_reserved_bytes(cache, len, 0, 1); 5605 pin_down_extent(root, cache, start, len, 1);
5606 else {
5607 btrfs_add_free_space(cache, start, len);
5608 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5609 }
5432 btrfs_put_block_group(cache); 5610 btrfs_put_block_group(cache);
5433 5611
5434 trace_btrfs_reserved_extent_free(root, start, len); 5612 trace_btrfs_reserved_extent_free(root, start, len);
@@ -5436,6 +5614,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5436 return ret; 5614 return ret;
5437} 5615}
5438 5616
5617int btrfs_free_reserved_extent(struct btrfs_root *root,
5618 u64 start, u64 len)
5619{
5620 return __btrfs_free_reserved_extent(root, start, len, 0);
5621}
5622
5623int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5624 u64 start, u64 len)
5625{
5626 return __btrfs_free_reserved_extent(root, start, len, 1);
5627}
5628
5439static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5629static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5440 struct btrfs_root *root, 5630 struct btrfs_root *root,
5441 u64 parent, u64 root_objectid, 5631 u64 parent, u64 root_objectid,
@@ -5631,7 +5821,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5631 put_caching_control(caching_ctl); 5821 put_caching_control(caching_ctl);
5632 } 5822 }
5633 5823
5634 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5824 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5825 RESERVE_ALLOC_NO_ACCOUNT);
5635 BUG_ON(ret); 5826 BUG_ON(ret);
5636 btrfs_put_block_group(block_group); 5827 btrfs_put_block_group(block_group);
5637 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5828 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5688,8 +5879,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5688 block_rsv = get_block_rsv(trans, root); 5879 block_rsv = get_block_rsv(trans, root);
5689 5880
5690 if (block_rsv->size == 0) { 5881 if (block_rsv->size == 0) {
5691 ret = reserve_metadata_bytes(trans, root, block_rsv, 5882 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5692 blocksize, 0);
5693 /* 5883 /*
5694 * If we couldn't reserve metadata bytes try and use some from 5884 * If we couldn't reserve metadata bytes try and use some from
5695 * the global reserve. 5885 * the global reserve.
@@ -5709,13 +5899,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5709 if (!ret) 5899 if (!ret)
5710 return block_rsv; 5900 return block_rsv;
5711 if (ret) { 5901 if (ret) {
5712 WARN_ON(1); 5902 static DEFINE_RATELIMIT_STATE(_rs,
5713 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5903 DEFAULT_RATELIMIT_INTERVAL,
5714 0); 5904 /*DEFAULT_RATELIMIT_BURST*/ 2);
5905 if (__ratelimit(&_rs)) {
5906 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5907 WARN_ON(1);
5908 }
5909 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5715 if (!ret) { 5910 if (!ret) {
5716 spin_lock(&block_rsv->lock);
5717 block_rsv->size += blocksize;
5718 spin_unlock(&block_rsv->lock);
5719 return block_rsv; 5911 return block_rsv;
5720 } else if (ret && block_rsv != global_rsv) { 5912 } else if (ret && block_rsv != global_rsv) {
5721 ret = block_rsv_use_bytes(global_rsv, blocksize); 5913 ret = block_rsv_use_bytes(global_rsv, blocksize);
@@ -6593,12 +6785,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6593 cache->bytes_super - btrfs_block_group_used(&cache->item); 6785 cache->bytes_super - btrfs_block_group_used(&cache->item);
6594 6786
6595 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6787 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6596 sinfo->bytes_may_use + sinfo->bytes_readonly + 6788 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6597 cache->reserved_pinned + num_bytes + min_allocable_bytes <= 6789 min_allocable_bytes <= sinfo->total_bytes) {
6598 sinfo->total_bytes) {
6599 sinfo->bytes_readonly += num_bytes; 6790 sinfo->bytes_readonly += num_bytes;
6600 sinfo->bytes_reserved += cache->reserved_pinned;
6601 cache->reserved_pinned = 0;
6602 cache->ro = 1; 6791 cache->ro = 1;
6603 ret = 0; 6792 ret = 0;
6604 } 6793 }
@@ -6965,7 +7154,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6965 struct btrfs_space_info, 7154 struct btrfs_space_info,
6966 list); 7155 list);
6967 if (space_info->bytes_pinned > 0 || 7156 if (space_info->bytes_pinned > 0 ||
6968 space_info->bytes_reserved > 0) { 7157 space_info->bytes_reserved > 0 ||
7158 space_info->bytes_may_use > 0) {
6969 WARN_ON(1); 7159 WARN_ON(1);
6970 dump_space_info(space_info, 0, 0); 7160 dump_space_info(space_info, 0, 0);
6971 } 7161 }
@@ -7007,14 +7197,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7007 return -ENOMEM; 7197 return -ENOMEM;
7008 path->reada = 1; 7198 path->reada = 1;
7009 7199
7010 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 7200 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7011 if (cache_gen != 0 && 7201 if (btrfs_test_opt(root, SPACE_CACHE) &&
7012 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 7202 btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7013 need_clear = 1; 7203 need_clear = 1;
7014 if (btrfs_test_opt(root, CLEAR_CACHE)) 7204 if (btrfs_test_opt(root, CLEAR_CACHE))
7015 need_clear = 1; 7205 need_clear = 1;
7016 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
7017 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7018 7206
7019 while (1) { 7207 while (1) {
7020 ret = find_first_block_group(root, path, &key); 7208 ret = find_first_block_group(root, path, &key);
@@ -7253,7 +7441,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7253 goto out; 7441 goto out;
7254 } 7442 }
7255 7443
7256 inode = lookup_free_space_inode(root, block_group, path); 7444 inode = lookup_free_space_inode(tree_root, block_group, path);
7257 if (!IS_ERR(inode)) { 7445 if (!IS_ERR(inode)) {
7258 ret = btrfs_orphan_add(trans, inode); 7446 ret = btrfs_orphan_add(trans, inode);
7259 BUG_ON(ret); 7447 BUG_ON(ret);
@@ -7269,7 +7457,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7269 spin_unlock(&block_group->lock); 7457 spin_unlock(&block_group->lock);
7270 } 7458 }
7271 /* One for our lookup ref */ 7459 /* One for our lookup ref */
7272 iput(inode); 7460 btrfs_add_delayed_iput(inode);
7273 } 7461 }
7274 7462
7275 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 7463 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7340,7 +7528,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7340 int mixed = 0; 7528 int mixed = 0;
7341 int ret; 7529 int ret;
7342 7530
7343 disk_super = &fs_info->super_copy; 7531 disk_super = fs_info->super_copy;
7344 if (!btrfs_super_root(disk_super)) 7532 if (!btrfs_super_root(disk_super))
7345 return 1; 7533 return 1;
7346 7534
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f1..1f87c4d0e7a0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -894,6 +895,194 @@ search_again:
894 goto again; 895 goto again;
895} 896}
896 897
898/**
899 * convert_extent - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc)
939 return -ENOMEM;
940 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL;
942 BUG_ON(err == -EEXIST);
943 goto out;
944 }
945 state = rb_entry(node, struct extent_state, rb_node);
946hit_next:
947 last_start = state->start;
948 last_end = state->end;
949
950 /*
951 * | ---- desired range ---- |
952 * | state |
953 *
954 * Just lock what we found and keep going
955 */
956 if (state->start == start && state->end <= end) {
957 struct rb_node *next_node;
958
959 set_state_bits(tree, state, &bits);
960 clear_state_bit(tree, state, &clear_bits, 0);
961
962 merge_state(tree, state);
963 if (last_end == (u64)-1)
964 goto out;
965
966 start = last_end + 1;
967 next_node = rb_next(&state->rb_node);
968 if (next_node && start < end && prealloc && !need_resched()) {
969 state = rb_entry(next_node, struct extent_state,
970 rb_node);
971 if (state->start == start)
972 goto hit_next;
973 }
974 goto search_again;
975 }
976
977 /*
978 * | ---- desired range ---- |
979 * | state |
980 * or
981 * | ------------- state -------------- |
982 *
983 * We need to split the extent we found, and may flip bits on
984 * second half.
985 *
986 * If the extent we found extends past our
987 * range, we just split and search again. It'll get split
988 * again the next time though.
989 *
990 * If the extent we found is inside our range, we set the
991 * desired bit on it.
992 */
993 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc);
995 if (!prealloc)
996 return -ENOMEM;
997 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST);
999 prealloc = NULL;
1000 if (err)
1001 goto out;
1002 if (state->end <= end) {
1003 set_state_bits(tree, state, &bits);
1004 clear_state_bit(tree, state, &clear_bits, 0);
1005 merge_state(tree, state);
1006 if (last_end == (u64)-1)
1007 goto out;
1008 start = last_end + 1;
1009 }
1010 goto search_again;
1011 }
1012 /*
1013 * | ---- desired range ---- |
1014 * | state | or | state |
1015 *
1016 * There's a hole, we need to insert something in it and
1017 * ignore the extent we found.
1018 */
1019 if (state->start > start) {
1020 u64 this_end;
1021 if (end < last_start)
1022 this_end = end;
1023 else
1024 this_end = last_start - 1;
1025
1026 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc)
1028 return -ENOMEM;
1029
1030 /*
1031 * Avoid to free 'prealloc' if it can be merged with
1032 * the later extent.
1033 */
1034 err = insert_state(tree, prealloc, start, this_end,
1035 &bits);
1036 BUG_ON(err == -EEXIST);
1037 if (err) {
1038 free_extent_state(prealloc);
1039 prealloc = NULL;
1040 goto out;
1041 }
1042 prealloc = NULL;
1043 start = this_end + 1;
1044 goto search_again;
1045 }
1046 /*
1047 * | ---- desired range ---- |
1048 * | state |
1049 * We need to split the extent, and set the bit
1050 * on the first half
1051 */
1052 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc)
1055 return -ENOMEM;
1056
1057 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST);
1059
1060 set_state_bits(tree, prealloc, &bits);
1061 clear_state_bit(tree, prealloc, &clear_bits, 0);
1062
1063 merge_state(tree, prealloc);
1064 prealloc = NULL;
1065 goto out;
1066 }
1067
1068 goto search_again;
1069
1070out:
1071 spin_unlock(&tree->lock);
1072 if (prealloc)
1073 free_extent_state(prealloc);
1074
1075 return err;
1076
1077search_again:
1078 if (start > end)
1079 goto out;
1080 spin_unlock(&tree->lock);
1081 if (mask & __GFP_WAIT)
1082 cond_resched();
1083 goto again;
1084}
1085
897/* wrappers around set/clear extent bit */ 1086/* wrappers around set/clear extent bit */
898int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1087int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask) 1088 gfp_t mask)
@@ -919,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
919 struct extent_state **cached_state, gfp_t mask) 1108 struct extent_state **cached_state, gfp_t mask)
920{ 1109{
921 return set_extent_bit(tree, start, end, 1110 return set_extent_bit(tree, start, end,
922 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1111 EXTENT_DELALLOC | EXTENT_UPTODATE,
923 0, NULL, cached_state, mask); 1112 0, NULL, cached_state, mask);
924} 1113}
925 1114
@@ -1599,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1599 return 0; 1788 return 0;
1600} 1789}
1601 1790
1791/*
1792 * When IO fails, either with EIO or csum verification fails, we
1793 * try other mirrors that might have a good copy of the data. This
1794 * io_failure_record is used to record state as we go through all the
1795 * mirrors. If another mirror has good data, the page is set up to date
1796 * and things continue. If a good mirror can't be found, the original
1797 * bio end_io callback is called to indicate things have failed.
 *
 * bio_readpage_error() allocates a record and stores its address as the
 * private value of the inode's io_failure_tree state at @start;
 * free_io_failure() clears that state and frees the record.
1798 */
1799struct io_failure_record {
1800 struct page *page; /* NOTE(review): not read or written by the code visible in this hunk */
1801 u64 start; /* first byte of the failed range; key into the failure tree */
1802 u64 len; /* length of the failed range in bytes (end - start + 1) */
1803 u64 logical; /* logical address derived from the extent map (block_start for compressed extents) */
1804 unsigned long bio_flags; /* 0, or EXTENT_BIO_COMPRESSED plus the compress type; passed to submit_bio_hook */
1805 int this_mirror; /* mirror number used for the resubmitted read */
1806 int failed_mirror; /* mirror that returned the bad data; the one repair_io_failure() rewrites */
1807 int in_validation; /* set while a multi-page failed bio is re-read page by page from the same mirror */
1808};
1809
1810static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1811 int did_repair)
1812{
1813 int ret;
1814 int err = 0;
1815 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1816
1817 set_state_private(failure_tree, rec->start, 0);
1818 ret = clear_extent_bits(failure_tree, rec->start,
1819 rec->start + rec->len - 1,
1820 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1821 if (ret)
1822 err = ret;
1823
1824 if (did_repair) {
1825 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1826 rec->start + rec->len - 1,
1827 EXTENT_DAMAGED, GFP_NOFS);
1828 if (ret && !err)
1829 err = ret;
1830 }
1831
1832 kfree(rec);
1833 return err;
1834}
1835
1836static void repair_io_failure_callback(struct bio *bio, int err)
1837{
1838 complete(bio->bi_private);
1839}
1840
1841/*
1842 * this bypasses the standard btrfs submit functions deliberately, as
1843 * the standard behavior is to write all copies in a raid setup. here we only
1844 * want to write the one bad copy. so we do the mapping for ourselves and issue
1845 * submit_bio directly.
1846 * to avoid any synchonization issues, wait for the data after writing, which
1847 * actually prevents the read that triggered the error from finishing.
1848 * currently, there can be no more than two copies of every data bit. thus,
1849 * exactly one rewrite is required.
1850 */
1851int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1852 u64 length, u64 logical, struct page *page,
1853 int mirror_num)
1854{
1855 struct bio *bio;
1856 struct btrfs_device *dev;
1857 DECLARE_COMPLETION_ONSTACK(compl);
1858 u64 map_length = 0;
1859 u64 sector;
1860 struct btrfs_bio *bbio = NULL;
1861 int ret;
1862
1863 BUG_ON(!mirror_num);
1864
1865 bio = bio_alloc(GFP_NOFS, 1);
1866 if (!bio)
1867 return -EIO;
1868 bio->bi_private = &compl;
1869 bio->bi_end_io = repair_io_failure_callback;
1870 bio->bi_size = 0;
1871 map_length = length;
1872
1873 ret = btrfs_map_block(map_tree, WRITE, logical,
1874 &map_length, &bbio, mirror_num);
1875 if (ret) {
1876 bio_put(bio);
1877 return -EIO;
1878 }
1879 BUG_ON(mirror_num != bbio->mirror_num);
1880 sector = bbio->stripes[mirror_num-1].physical >> 9;
1881 bio->bi_sector = sector;
1882 dev = bbio->stripes[mirror_num-1].dev;
1883 kfree(bbio);
1884 if (!dev || !dev->bdev || !dev->writeable) {
1885 bio_put(bio);
1886 return -EIO;
1887 }
1888 bio->bi_bdev = dev->bdev;
1889 bio_add_page(bio, page, length, start-page_offset(page));
1890 submit_bio(WRITE_SYNC, bio);
1891 wait_for_completion(&compl);
1892
1893 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1894 /* try to remap that extent elsewhere? */
1895 bio_put(bio);
1896 return -EIO;
1897 }
1898
1899 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1900 "sector %llu)\n", page->mapping->host->i_ino, start,
1901 dev->name, sector);
1902
1903 bio_put(bio);
1904 return 0;
1905}
1906
1907/*
1908 * each time an IO finishes, we do a fast check in the IO failure tree
1909 * to see if we need to process or clean up an io_failure_record
1910 */
1911static int clean_io_failure(u64 start, struct page *page)
1912{
1913 u64 private;
1914 u64 private_failure;
1915 struct io_failure_record *failrec;
1916 struct btrfs_mapping_tree *map_tree;
1917 struct extent_state *state;
1918 int num_copies;
1919 int did_repair = 0;
1920 int ret;
1921 struct inode *inode = page->mapping->host;
1922
1923 private = 0;
1924 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1925 (u64)-1, 1, EXTENT_DIRTY, 0);
1926 if (!ret)
1927 return 0;
1928
1929 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1930 &private_failure);
1931 if (ret)
1932 return 0;
1933
1934 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1935 BUG_ON(!failrec->this_mirror);
1936
1937 if (failrec->in_validation) {
1938 /* there was no real error, just free the record */
1939 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1940 failrec->start);
1941 did_repair = 1;
1942 goto out;
1943 }
1944
1945 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1946 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1947 failrec->start,
1948 EXTENT_LOCKED);
1949 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1950
1951 if (state && state->start == failrec->start) {
1952 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1953 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1954 failrec->len);
1955 if (num_copies > 1) {
1956 ret = repair_io_failure(map_tree, start, failrec->len,
1957 failrec->logical, page,
1958 failrec->failed_mirror);
1959 did_repair = !ret;
1960 }
1961 }
1962
1963out:
1964 if (!ret)
1965 ret = free_io_failure(inode, failrec, did_repair);
1966
1967 return ret;
1968}
1969
1970/*
1971 * this is a generic handler for readpage errors (default
1972 * readpage_io_failed_hook). if other copies exist, read those and write back
1973 * good data to the failed position. does not investigate in remapping the
1974 * failed extent elsewhere, hoping the device will be smart enough to do this as
1975 * needed
1976 */
1977
1978static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1979 u64 start, u64 end, int failed_mirror,
1980 struct extent_state *state)
1981{
1982 struct io_failure_record *failrec = NULL;
1983 u64 private;
1984 struct extent_map *em;
1985 struct inode *inode = page->mapping->host;
1986 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1987 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1988 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1989 struct bio *bio;
1990 int num_copies;
1991 int ret;
1992 int read_mode;
1993 u64 logical;
1994
1995 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
1996
1997 ret = get_state_private(failure_tree, start, &private);
1998 if (ret) {
1999 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2000 if (!failrec)
2001 return -ENOMEM;
2002 failrec->start = start;
2003 failrec->len = end - start + 1;
2004 failrec->this_mirror = 0;
2005 failrec->bio_flags = 0;
2006 failrec->in_validation = 0;
2007
2008 read_lock(&em_tree->lock);
2009 em = lookup_extent_mapping(em_tree, start, failrec->len);
2010 if (!em) {
2011 read_unlock(&em_tree->lock);
2012 kfree(failrec);
2013 return -EIO;
2014 }
2015
2016 if (em->start > start || em->start + em->len < start) {
2017 free_extent_map(em);
2018 em = NULL;
2019 }
2020 read_unlock(&em_tree->lock);
2021
2022 if (!em || IS_ERR(em)) {
2023 kfree(failrec);
2024 return -EIO;
2025 }
2026 logical = start - em->start;
2027 logical = em->block_start + logical;
2028 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2029 logical = em->block_start;
2030 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2031 extent_set_compress_type(&failrec->bio_flags,
2032 em->compress_type);
2033 }
2034 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2035 "len=%llu\n", logical, start, failrec->len);
2036 failrec->logical = logical;
2037 free_extent_map(em);
2038
2039 /* set the bits in the private failure tree */
2040 ret = set_extent_bits(failure_tree, start, end,
2041 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2042 if (ret >= 0)
2043 ret = set_state_private(failure_tree, start,
2044 (u64)(unsigned long)failrec);
2045 /* set the bits in the inode's tree */
2046 if (ret >= 0)
2047 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2048 GFP_NOFS);
2049 if (ret < 0) {
2050 kfree(failrec);
2051 return ret;
2052 }
2053 } else {
2054 failrec = (struct io_failure_record *)(unsigned long)private;
2055 pr_debug("bio_readpage_error: (found) logical=%llu, "
2056 "start=%llu, len=%llu, validation=%d\n",
2057 failrec->logical, failrec->start, failrec->len,
2058 failrec->in_validation);
2059 /*
2060 * when data can be on disk more than twice, add to failrec here
2061 * (e.g. with a list for failed_mirror) to make
2062 * clean_io_failure() clean all those errors at once.
2063 */
2064 }
2065 num_copies = btrfs_num_copies(
2066 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2067 failrec->logical, failrec->len);
2068 if (num_copies == 1) {
2069 /*
2070 * we only have a single copy of the data, so don't bother with
2071 * all the retry and error correction code that follows. no
2072 * matter what the error is, it is very likely to persist.
2073 */
2074 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2075 "state=%p, num_copies=%d, next_mirror %d, "
2076 "failed_mirror %d\n", state, num_copies,
2077 failrec->this_mirror, failed_mirror);
2078 free_io_failure(inode, failrec, 0);
2079 return -EIO;
2080 }
2081
2082 if (!state) {
2083 spin_lock(&tree->lock);
2084 state = find_first_extent_bit_state(tree, failrec->start,
2085 EXTENT_LOCKED);
2086 if (state && state->start != failrec->start)
2087 state = NULL;
2088 spin_unlock(&tree->lock);
2089 }
2090
2091 /*
2092 * there are two premises:
2093 * a) deliver good data to the caller
2094 * b) correct the bad sectors on disk
2095 */
2096 if (failed_bio->bi_vcnt > 1) {
2097 /*
2098 * to fulfill b), we need to know the exact failing sectors, as
2099 * we don't want to rewrite any more than the failed ones. thus,
2100 * we need separate read requests for the failed bio
2101 *
2102 * if the following BUG_ON triggers, our validation request got
2103 * merged. we need separate requests for our algorithm to work.
2104 */
2105 BUG_ON(failrec->in_validation);
2106 failrec->in_validation = 1;
2107 failrec->this_mirror = failed_mirror;
2108 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2109 } else {
2110 /*
2111 * we're ready to fulfill a) and b) alongside. get a good copy
2112 * of the failed sector and if we succeed, we have setup
2113 * everything for repair_io_failure to do the rest for us.
2114 */
2115 if (failrec->in_validation) {
2116 BUG_ON(failrec->this_mirror != failed_mirror);
2117 failrec->in_validation = 0;
2118 failrec->this_mirror = 0;
2119 }
2120 failrec->failed_mirror = failed_mirror;
2121 failrec->this_mirror++;
2122 if (failrec->this_mirror == failed_mirror)
2123 failrec->this_mirror++;
2124 read_mode = READ_SYNC;
2125 }
2126
2127 if (!state || failrec->this_mirror > num_copies) {
2128 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2129 "next_mirror %d, failed_mirror %d\n", state,
2130 num_copies, failrec->this_mirror, failed_mirror);
2131 free_io_failure(inode, failrec, 0);
2132 return -EIO;
2133 }
2134
2135 bio = bio_alloc(GFP_NOFS, 1);
2136 bio->bi_private = state;
2137 bio->bi_end_io = failed_bio->bi_end_io;
2138 bio->bi_sector = failrec->logical >> 9;
2139 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2140 bio->bi_size = 0;
2141
2142 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2143
2144 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2145 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2146 failrec->this_mirror, num_copies, failrec->in_validation);
2147
2148 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2149 failrec->bio_flags, 0);
2150 return 0;
2151}
2152
1602/* lots and lots of room for performance fixes in the end_bio funcs */ 2153/* lots and lots of room for performance fixes in the end_bio funcs */
1603 2154
1604/* 2155/*
@@ -1697,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1697 struct extent_state *cached = NULL; 2248 struct extent_state *cached = NULL;
1698 struct extent_state *state; 2249 struct extent_state *state;
1699 2250
2251 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2252 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2253 (long int)bio->bi_bdev);
1700 tree = &BTRFS_I(page->mapping->host)->io_tree; 2254 tree = &BTRFS_I(page->mapping->host)->io_tree;
1701 2255
1702 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2256 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,11 +2281,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1727 state); 2281 state);
1728 if (ret) 2282 if (ret)
1729 uptodate = 0; 2283 uptodate = 0;
2284 else
2285 clean_io_failure(start, page);
1730 } 2286 }
1731 if (!uptodate && tree->ops && 2287 if (!uptodate) {
1732 tree->ops->readpage_io_failed_hook) { 2288 u64 failed_mirror;
1733 ret = tree->ops->readpage_io_failed_hook(bio, page, 2289 failed_mirror = (u64)bio->bi_bdev;
1734 start, end, NULL); 2290 if (tree->ops && tree->ops->readpage_io_failed_hook)
2291 ret = tree->ops->readpage_io_failed_hook(
2292 bio, page, start, end,
2293 failed_mirror, state);
2294 else
2295 ret = bio_readpage_error(bio, page, start, end,
2296 failed_mirror, NULL);
1735 if (ret == 0) { 2297 if (ret == 0) {
1736 uptodate = 2298 uptodate =
1737 test_bit(BIO_UPTODATE, &bio->bi_flags); 2299 test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1811,6 +2373,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1811 mirror_num, bio_flags, start); 2373 mirror_num, bio_flags, start);
1812 else 2374 else
1813 submit_bio(rw, bio); 2375 submit_bio(rw, bio);
2376
1814 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2377 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1815 ret = -EOPNOTSUPP; 2378 ret = -EOPNOTSUPP;
1816 bio_put(bio); 2379 bio_put(bio);
@@ -2076,16 +2639,16 @@ out:
2076} 2639}
2077 2640
2078int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2641int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2079 get_extent_t *get_extent) 2642 get_extent_t *get_extent, int mirror_num)
2080{ 2643{
2081 struct bio *bio = NULL; 2644 struct bio *bio = NULL;
2082 unsigned long bio_flags = 0; 2645 unsigned long bio_flags = 0;
2083 int ret; 2646 int ret;
2084 2647
2085 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2648 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2086 &bio_flags); 2649 &bio_flags);
2087 if (bio) 2650 if (bio)
2088 ret = submit_one_bio(READ, bio, 0, bio_flags); 2651 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2089 return ret; 2652 return ret;
2090} 2653}
2091 2654
@@ -2136,6 +2699,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 int compressed; 2699 int compressed;
2137 int write_flags; 2700 int write_flags;
2138 unsigned long nr_written = 0; 2701 unsigned long nr_written = 0;
2702 bool fill_delalloc = true;
2139 2703
2140 if (wbc->sync_mode == WB_SYNC_ALL) 2704 if (wbc->sync_mode == WB_SYNC_ALL)
2141 write_flags = WRITE_SYNC; 2705 write_flags = WRITE_SYNC;
@@ -2145,6 +2709,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2145 trace___extent_writepage(page, inode, wbc); 2709 trace___extent_writepage(page, inode, wbc);
2146 2710
2147 WARN_ON(!PageLocked(page)); 2711 WARN_ON(!PageLocked(page));
2712
2713 ClearPageError(page);
2714
2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2715 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149 if (page->index > end_index || 2716 if (page->index > end_index ||
2150 (page->index == end_index && !pg_offset)) { 2717 (page->index == end_index && !pg_offset)) {
@@ -2166,10 +2733,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2166 2733
2167 set_page_extent_mapped(page); 2734 set_page_extent_mapped(page);
2168 2735
2736 if (!tree->ops || !tree->ops->fill_delalloc)
2737 fill_delalloc = false;
2738
2169 delalloc_start = start; 2739 delalloc_start = start;
2170 delalloc_end = 0; 2740 delalloc_end = 0;
2171 page_started = 0; 2741 page_started = 0;
2172 if (!epd->extent_locked) { 2742 if (!epd->extent_locked && fill_delalloc) {
2173 u64 delalloc_to_write = 0; 2743 u64 delalloc_to_write = 0;
2174 /* 2744 /*
2175 * make sure the wbc mapping index is at least updated 2745 * make sure the wbc mapping index is at least updated
@@ -2421,10 +2991,16 @@ retry:
2421 * swizzled back from swapper_space to tmpfs file 2991 * swizzled back from swapper_space to tmpfs file
2422 * mapping 2992 * mapping
2423 */ 2993 */
2424 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2994 if (tree->ops &&
2425 tree->ops->write_cache_pages_lock_hook(page); 2995 tree->ops->write_cache_pages_lock_hook) {
2426 else 2996 tree->ops->write_cache_pages_lock_hook(page,
2427 lock_page(page); 2997 data, flush_fn);
2998 } else {
2999 if (!trylock_page(page)) {
3000 flush_fn(data);
3001 lock_page(page);
3002 }
3003 }
2428 3004
2429 if (unlikely(page->mapping != mapping)) { 3005 if (unlikely(page->mapping != mapping)) {
2430 unlock_page(page); 3006 unlock_page(page);
@@ -2926,7 +3502,7 @@ out:
2926 return ret; 3502 return ret;
2927} 3503}
2928 3504
2929static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3505inline struct page *extent_buffer_page(struct extent_buffer *eb,
2930 unsigned long i) 3506 unsigned long i)
2931{ 3507{
2932 struct page *p; 3508 struct page *p;
@@ -2951,7 +3527,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2951 return p; 3527 return p;
2952} 3528}
2953 3529
2954static inline unsigned long num_extent_pages(u64 start, u64 len) 3530inline unsigned long num_extent_pages(u64 start, u64 len)
2955{ 3531{
2956 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3532 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2957 (start >> PAGE_CACHE_SHIFT); 3533 (start >> PAGE_CACHE_SHIFT);
@@ -3204,6 +3780,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3204 PAGECACHE_TAG_DIRTY); 3780 PAGECACHE_TAG_DIRTY);
3205 } 3781 }
3206 spin_unlock_irq(&page->mapping->tree_lock); 3782 spin_unlock_irq(&page->mapping->tree_lock);
3783 ClearPageError(page);
3207 unlock_page(page); 3784 unlock_page(page);
3208 } 3785 }
3209 return 0; 3786 return 0;
@@ -3349,8 +3926,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3349} 3926}
3350 3927
3351int read_extent_buffer_pages(struct extent_io_tree *tree, 3928int read_extent_buffer_pages(struct extent_io_tree *tree,
3352 struct extent_buffer *eb, 3929 struct extent_buffer *eb, u64 start, int wait,
3353 u64 start, int wait,
3354 get_extent_t *get_extent, int mirror_num) 3930 get_extent_t *get_extent, int mirror_num)
3355{ 3931{
3356 unsigned long i; 3932 unsigned long i;
@@ -3386,7 +3962,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3386 num_pages = num_extent_pages(eb->start, eb->len); 3962 num_pages = num_extent_pages(eb->start, eb->len);
3387 for (i = start_i; i < num_pages; i++) { 3963 for (i = start_i; i < num_pages; i++) {
3388 page = extent_buffer_page(eb, i); 3964 page = extent_buffer_page(eb, i);
3389 if (!wait) { 3965 if (wait == WAIT_NONE) {
3390 if (!trylock_page(page)) 3966 if (!trylock_page(page))
3391 goto unlock_exit; 3967 goto unlock_exit;
3392 } else { 3968 } else {
@@ -3430,7 +4006,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3430 if (bio) 4006 if (bio)
3431 submit_one_bio(READ, bio, mirror_num, bio_flags); 4007 submit_one_bio(READ, bio, mirror_num, bio_flags);
3432 4008
3433 if (ret || !wait) 4009 if (ret || wait != WAIT_COMPLETE)
3434 return ret; 4010 return ret;
3435 4011
3436 for (i = start_i; i < num_pages; i++) { 4012 for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e7929..feb9be0e23bc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,8 @@
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 24
@@ -32,6 +34,7 @@
32#define EXTENT_BUFFER_BLOCKING 1 34#define EXTENT_BUFFER_BLOCKING 1
33#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
35 38
36/* these are flags for extent_clear_unlock_delalloc */ 39/* these are flags for extent_clear_unlock_delalloc */
37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 40#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -67,7 +70,7 @@ struct extent_io_ops {
67 unsigned long bio_flags); 70 unsigned long bio_flags);
68 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
69 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
70 u64 start, u64 end, 73 u64 start, u64 end, u64 failed_mirror,
71 struct extent_state *state); 74 struct extent_state *state);
72 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, 76 u64 start, u64 end,
@@ -85,7 +88,8 @@ struct extent_io_ops {
85 struct extent_state *other); 88 struct extent_state *other);
86 void (*split_extent_hook)(struct inode *inode, 89 void (*split_extent_hook)(struct inode *inode,
87 struct extent_state *orig, u64 split); 90 struct extent_state *orig, u64 split);
88 int (*write_cache_pages_lock_hook)(struct page *page); 91 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
92 void (*flush_fn)(void *));
89}; 93};
90 94
91struct extent_io_tree { 95struct extent_io_tree {
@@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
185int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 189int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
186 gfp_t mask); 190 gfp_t mask);
187int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 191int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
188 get_extent_t *get_extent); 192 get_extent_t *get_extent, int mirror_num);
189int __init extent_io_init(void); 193int __init extent_io_init(void);
190void extent_io_exit(void); 194void extent_io_exit(void);
191 195
@@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
214 gfp_t mask); 218 gfp_t mask);
215int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 219int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
216 gfp_t mask); 220 gfp_t mask);
221int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
222 int bits, int clear_bits, gfp_t mask);
217int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 223int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
218 struct extent_state **cached_state, gfp_t mask); 224 struct extent_state **cached_state, gfp_t mask);
219int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 225int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
248struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 254struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
249 u64 start, unsigned long len); 255 u64 start, unsigned long len);
250void free_extent_buffer(struct extent_buffer *eb); 256void free_extent_buffer(struct extent_buffer *eb);
257#define WAIT_NONE 0
258#define WAIT_COMPLETE 1
259#define WAIT_PAGE_LOCK 2
251int read_extent_buffer_pages(struct extent_io_tree *tree, 260int read_extent_buffer_pages(struct extent_io_tree *tree,
252 struct extent_buffer *eb, u64 start, int wait, 261 struct extent_buffer *eb, u64 start, int wait,
253 get_extent_t *get_extent, int mirror_num); 262 get_extent_t *get_extent, int mirror_num);
263unsigned long num_extent_pages(u64 start, u64 len);
264struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
254 265
255static inline void extent_buffer_get(struct extent_buffer *eb) 266static inline void extent_buffer_get(struct extent_buffer *eb)
256{ 267{
@@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
300struct bio * 311struct bio *
301btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 312btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
302 gfp_t gfp_flags); 313 gfp_t gfp_flags);
314
315struct btrfs_mapping_tree;
316
317int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
318 u64 length, u64 logical, struct page *page,
319 int mirror_num);
303#endif 320#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821becd..c7fb3a4247d3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
91 struct btrfs_csum_item *item; 91 struct btrfs_csum_item *item;
92 struct extent_buffer *leaf; 92 struct extent_buffer *leaf;
93 u64 csum_offset = 0; 93 u64 csum_offset = 0;
94 u16 csum_size = 94 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
95 btrfs_super_csum_size(&root->fs_info->super_copy);
96 int csums_in_item; 95 int csums_in_item;
97 96
98 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 97 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
162 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
163 u64 disk_bytenr; 162 u64 disk_bytenr;
164 u32 diff; 163 u32 diff;
165 u16 csum_size = 164 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
166 btrfs_super_csum_size(&root->fs_info->super_copy);
167 int ret; 165 int ret;
168 struct btrfs_path *path; 166 struct btrfs_path *path;
169 struct btrfs_csum_item *item = NULL; 167 struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
290 int ret; 288 int ret;
291 size_t size; 289 size_t size;
292 u64 csum_end; 290 u64 csum_end;
293 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 291 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
294 292
295 path = btrfs_alloc_path(); 293 path = btrfs_alloc_path();
296 if (!path) 294 if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u64 bytenr, u64 len) 490 u64 bytenr, u64 len)
493{ 491{
494 struct extent_buffer *leaf; 492 struct extent_buffer *leaf;
495 u16 csum_size = 493 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
496 btrfs_super_csum_size(&root->fs_info->super_copy);
497 u64 csum_end; 494 u64 csum_end;
498 u64 end_byte = bytenr + len; 495 u64 end_byte = bytenr + len;
499 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; 496 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
549 u64 csum_end; 546 u64 csum_end;
550 struct extent_buffer *leaf; 547 struct extent_buffer *leaf;
551 int ret; 548 int ret;
552 u16 csum_size = 549 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
553 btrfs_super_csum_size(&root->fs_info->super_copy);
554 int blocksize_bits = root->fs_info->sb->s_blocksize_bits; 550 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
555 551
556 root = root->fs_info->csum_root; 552 root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
676 struct btrfs_sector_sum *sector_sum; 672 struct btrfs_sector_sum *sector_sum;
677 u32 nritems; 673 u32 nritems;
678 u32 ins_size; 674 u32 ins_size;
679 u16 csum_size = 675 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
680 btrfs_super_csum_size(&root->fs_info->super_copy);
681 676
682 path = btrfs_alloc_path(); 677 path = btrfs_alloc_path();
683 if (!path) 678 if (!path)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1266f6e9cdb2..dafdfa059bf6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1069 int i; 1069 int i;
1070 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1070 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1071 struct inode *inode = fdentry(file)->d_inode; 1071 struct inode *inode = fdentry(file)->d_inode;
1072 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1072 int err = 0; 1073 int err = 0;
1073 int faili = 0; 1074 int faili = 0;
1074 u64 start_pos; 1075 u64 start_pos;
@@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1080again: 1081again:
1081 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1082 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1083 GFP_NOFS); 1084 mask);
1084 if (!pages[i]) { 1085 if (!pages[i]) {
1085 faili = i - 1; 1086 faili = i - 1;
1086 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1615,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1615 goto out; 1616 goto out;
1616 } 1617 }
1617 1618
1618 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1619 if (ret)
1620 goto out;
1621
1622 locked_end = alloc_end - 1; 1619 locked_end = alloc_end - 1;
1623 while (1) { 1620 while (1) {
1624 struct btrfs_ordered_extent *ordered; 1621 struct btrfs_ordered_extent *ordered;
@@ -1664,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode,
1664 if (em->block_start == EXTENT_MAP_HOLE || 1661 if (em->block_start == EXTENT_MAP_HOLE ||
1665 (cur_offset >= inode->i_size && 1662 (cur_offset >= inode->i_size &&
1666 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1663 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1664
1665 /*
1666 * Make sure we have enough space before we do the
1667 * allocation.
1668 */
1669 ret = btrfs_check_data_free_space(inode, last_byte -
1670 cur_offset);
1671 if (ret) {
1672 free_extent_map(em);
1673 break;
1674 }
1675
1667 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 1676 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1668 last_byte - cur_offset, 1677 last_byte - cur_offset,
1669 1 << inode->i_blkbits, 1678 1 << inode->i_blkbits,
1670 offset + len, 1679 offset + len,
1671 &alloc_hint); 1680 &alloc_hint);
1681
1682 /* Let go of our reservation. */
1683 btrfs_free_reserved_data_space(inode, last_byte -
1684 cur_offset);
1672 if (ret < 0) { 1685 if (ret < 0) {
1673 free_extent_map(em); 1686 free_extent_map(em);
1674 break; 1687 break;
@@ -1694,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1694 } 1707 }
1695 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 1708 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1696 &cached_state, GFP_NOFS); 1709 &cached_state, GFP_NOFS);
1697
1698 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1699out: 1710out:
1700 mutex_unlock(&inode->i_mutex); 1711 mutex_unlock(&inode->i_mutex);
1701 return ret; 1712 return ret;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d0..7a15fcfb3e1f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/math64.h> 22#include <linux/math64.h>
23#include <linux/ratelimit.h>
23#include "ctree.h" 24#include "ctree.h"
24#include "free-space-cache.h" 25#include "free-space-cache.h"
25#include "transaction.h" 26#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
84 *block_group, struct btrfs_path *path) 85 *block_group, struct btrfs_path *path)
85{ 86{
86 struct inode *inode = NULL; 87 struct inode *inode = NULL;
88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
87 89
88 spin_lock(&block_group->lock); 90 spin_lock(&block_group->lock);
89 if (block_group->inode) 91 if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 100 return inode;
99 101
100 spin_lock(&block_group->lock); 102 spin_lock(&block_group->lock);
101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { 103 if (!((BTRFS_I(inode)->flags & flags) == flags)) {
102 printk(KERN_INFO "Old style space inode found, converting.\n"); 104 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
106 BTRFS_INODE_NODATACOW;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 } 108 }
106 109
107 if (!btrfs_fs_closing(root->fs_info)) { 110 if (!block_group->iref) {
108 block_group->inode = igrab(inode); 111 block_group->inode = igrab(inode);
109 block_group->iref = 1; 112 block_group->iref = 1;
110 } 113 }
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
122 struct btrfs_free_space_header *header; 125 struct btrfs_free_space_header *header;
123 struct btrfs_inode_item *inode_item; 126 struct btrfs_inode_item *inode_item;
124 struct extent_buffer *leaf; 127 struct extent_buffer *leaf;
128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
125 int ret; 129 int ret;
126 130
127 ret = btrfs_insert_empty_inode(trans, root, path, ino); 131 ret = btrfs_insert_empty_inode(trans, root, path, ino);
128 if (ret) 132 if (ret)
129 return ret; 133 return ret;
130 134
135 /* We inline crc's for the free disk space cache */
136 if (ino != BTRFS_FREE_INO_OBJECTID)
137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
138
131 leaf = path->nodes[0]; 139 leaf = path->nodes[0];
132 inode_item = btrfs_item_ptr(leaf, path->slots[0], 140 inode_item = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_inode_item); 141 struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
140 btrfs_set_inode_uid(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0);
141 btrfs_set_inode_gid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0);
142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 151 btrfs_set_inode_flags(leaf, inode_item, flags);
144 BTRFS_INODE_PREALLOC);
145 btrfs_set_inode_nlink(leaf, inode_item, 1); 152 btrfs_set_inode_nlink(leaf, inode_item, 1);
146 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
147 btrfs_set_inode_block_group(leaf, inode_item, offset); 154 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
191 struct inode *inode) 198 struct inode *inode)
192{ 199{
193 struct btrfs_block_rsv *rsv; 200 struct btrfs_block_rsv *rsv;
201 u64 needed_bytes;
194 loff_t oldsize; 202 loff_t oldsize;
195 int ret = 0; 203 int ret = 0;
196 204
197 rsv = trans->block_rsv; 205 rsv = trans->block_rsv;
198 trans->block_rsv = root->orphan_block_rsv; 206 trans->block_rsv = &root->fs_info->global_block_rsv;
199 ret = btrfs_block_rsv_check(trans, root, 207
200 root->orphan_block_rsv, 208 /* 1 for slack space, 1 for updating the inode */
201 0, 5); 209 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
202 if (ret) 210 btrfs_calc_trans_metadata_size(root, 1);
203 return ret; 211
212 spin_lock(&trans->block_rsv->lock);
213 if (trans->block_rsv->reserved < needed_bytes) {
214 spin_unlock(&trans->block_rsv->lock);
215 trans->block_rsv = rsv;
216 return -ENOSPC;
217 }
218 spin_unlock(&trans->block_rsv->lock);
204 219
205 oldsize = i_size_read(inode); 220 oldsize = i_size_read(inode);
206 btrfs_i_size_write(inode, 0); 221 btrfs_i_size_write(inode, 0);
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
213 ret = btrfs_truncate_inode_items(trans, root, inode, 228 ret = btrfs_truncate_inode_items(trans, root, inode,
214 0, BTRFS_EXTENT_DATA_KEY); 229 0, BTRFS_EXTENT_DATA_KEY);
215 230
216 trans->block_rsv = rsv;
217 if (ret) { 231 if (ret) {
232 trans->block_rsv = rsv;
218 WARN_ON(1); 233 WARN_ON(1);
219 return ret; 234 return ret;
220 } 235 }
221 236
222 ret = btrfs_update_inode(trans, root, inode); 237 ret = btrfs_update_inode(trans, root, inode);
238 trans->block_rsv = rsv;
239
223 return ret; 240 return ret;
224} 241}
225 242
@@ -242,26 +259,342 @@ static int readahead_cache(struct inode *inode)
242 return 0; 259 return 0;
243} 260}
244 261
/*
 * Cursor used while reading or writing the pages of a free space cache inode.
 */
struct io_ctl {
	/* current position inside the mapped page, NULL when nothing is mapped */
	void *cur;
	/* start address of the mapped page as returned by kmap() */
	void *orig;
	/* the page that is currently mapped */
	struct page *page;
	/* array of num_pages page pointers */
	struct page **pages;
	/* root handed to the checksum helpers */
	struct btrfs_root *root;
	/* bytes still available in the mapped page */
	unsigned long size;
	/* index in pages[] of the next page to map */
	int index;
	/* number of slots in pages[] */
	int num_pages;
	/* set when per-page crcs are written and verified */
	unsigned check_crcs:1;
};
272
273static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
274 struct btrfs_root *root)
275{
276 memset(io_ctl, 0, sizeof(struct io_ctl));
277 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
278 PAGE_CACHE_SHIFT;
279 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
280 GFP_NOFS);
281 if (!io_ctl->pages)
282 return -ENOMEM;
283 io_ctl->root = root;
284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
285 io_ctl->check_crcs = 1;
286 return 0;
287}
288
289static void io_ctl_free(struct io_ctl *io_ctl)
290{
291 kfree(io_ctl->pages);
292}
293
294static void io_ctl_unmap_page(struct io_ctl *io_ctl)
295{
296 if (io_ctl->cur) {
297 kunmap(io_ctl->page);
298 io_ctl->cur = NULL;
299 io_ctl->orig = NULL;
300 }
301}
302
303static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
304{
305 WARN_ON(io_ctl->cur);
306 BUG_ON(io_ctl->index >= io_ctl->num_pages);
307 io_ctl->page = io_ctl->pages[io_ctl->index++];
308 io_ctl->cur = kmap(io_ctl->page);
309 io_ctl->orig = io_ctl->cur;
310 io_ctl->size = PAGE_CACHE_SIZE;
311 if (clear)
312 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
313}
314
315static void io_ctl_drop_pages(struct io_ctl *io_ctl)
316{
317 int i;
318
319 io_ctl_unmap_page(io_ctl);
320
321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]);
323 unlock_page(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]);
325 }
326}
327
328static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
329 int uptodate)
330{
331 struct page *page;
332 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
333 int i;
334
335 for (i = 0; i < io_ctl->num_pages; i++) {
336 page = find_or_create_page(inode->i_mapping, i, mask);
337 if (!page) {
338 io_ctl_drop_pages(io_ctl);
339 return -ENOMEM;
340 }
341 io_ctl->pages[i] = page;
342 if (uptodate && !PageUptodate(page)) {
343 btrfs_readpage(NULL, page);
344 lock_page(page);
345 if (!PageUptodate(page)) {
346 printk(KERN_ERR "btrfs: error reading free "
347 "space cache\n");
348 io_ctl_drop_pages(io_ctl);
349 return -EIO;
350 }
351 }
352 }
353
354 return 0;
355}
356
357static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
358{
359 u64 *val;
360
361 io_ctl_map_page(io_ctl, 1);
362
363 /*
364 * Skip the csum areas. If we don't check crcs then we just have a
365 * 64bit chunk at the front of the first page.
366 */
367 if (io_ctl->check_crcs) {
368 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
369 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
370 } else {
371 io_ctl->cur += sizeof(u64);
372 io_ctl->size -= sizeof(u64) * 2;
373 }
374
375 val = io_ctl->cur;
376 *val = cpu_to_le64(generation);
377 io_ctl->cur += sizeof(u64);
378}
379
380static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
381{
382 u64 *gen;
383
384 /*
385 * Skip the crc area. If we don't check crcs then we just have a 64bit
386 * chunk at the front of the first page.
387 */
388 if (io_ctl->check_crcs) {
389 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
390 io_ctl->size -= sizeof(u64) +
391 (sizeof(u32) * io_ctl->num_pages);
392 } else {
393 io_ctl->cur += sizeof(u64);
394 io_ctl->size -= sizeof(u64) * 2;
395 }
396
397 gen = io_ctl->cur;
398 if (le64_to_cpu(*gen) != generation) {
399 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
400 "(%Lu) does not match inode (%Lu)\n", *gen,
401 generation);
402 io_ctl_unmap_page(io_ctl);
403 return -EIO;
404 }
405 io_ctl->cur += sizeof(u64);
406 return 0;
407}
408
409static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
410{
411 u32 *tmp;
412 u32 crc = ~(u32)0;
413 unsigned offset = 0;
414
415 if (!io_ctl->check_crcs) {
416 io_ctl_unmap_page(io_ctl);
417 return;
418 }
419
420 if (index == 0)
421 offset = sizeof(u32) * io_ctl->num_pages;;
422
423 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
424 PAGE_CACHE_SIZE - offset);
425 btrfs_csum_final(crc, (char *)&crc);
426 io_ctl_unmap_page(io_ctl);
427 tmp = kmap(io_ctl->pages[0]);
428 tmp += index;
429 *tmp = crc;
430 kunmap(io_ctl->pages[0]);
431}
432
433static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
434{
435 u32 *tmp, val;
436 u32 crc = ~(u32)0;
437 unsigned offset = 0;
438
439 if (!io_ctl->check_crcs) {
440 io_ctl_map_page(io_ctl, 0);
441 return 0;
442 }
443
444 if (index == 0)
445 offset = sizeof(u32) * io_ctl->num_pages;
446
447 tmp = kmap(io_ctl->pages[0]);
448 tmp += index;
449 val = *tmp;
450 kunmap(io_ctl->pages[0]);
451
452 io_ctl_map_page(io_ctl, 0);
453 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
454 PAGE_CACHE_SIZE - offset);
455 btrfs_csum_final(crc, (char *)&crc);
456 if (val != crc) {
457 printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
458 "space cache\n");
459 io_ctl_unmap_page(io_ctl);
460 return -EIO;
461 }
462
463 return 0;
464}
465
466static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
467 void *bitmap)
468{
469 struct btrfs_free_space_entry *entry;
470
471 if (!io_ctl->cur)
472 return -ENOSPC;
473
474 entry = io_ctl->cur;
475 entry->offset = cpu_to_le64(offset);
476 entry->bytes = cpu_to_le64(bytes);
477 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
478 BTRFS_FREE_SPACE_EXTENT;
479 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
480 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
481
482 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
483 return 0;
484
485 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
486
487 /* No more pages to map */
488 if (io_ctl->index >= io_ctl->num_pages)
489 return 0;
490
491 /* map the next page */
492 io_ctl_map_page(io_ctl, 1);
493 return 0;
494}
495
496static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
497{
498 if (!io_ctl->cur)
499 return -ENOSPC;
500
501 /*
502 * If we aren't at the start of the current page, unmap this one and
503 * map the next one if there is any left.
504 */
505 if (io_ctl->cur != io_ctl->orig) {
506 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
507 if (io_ctl->index >= io_ctl->num_pages)
508 return -ENOSPC;
509 io_ctl_map_page(io_ctl, 0);
510 }
511
512 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
513 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
514 if (io_ctl->index < io_ctl->num_pages)
515 io_ctl_map_page(io_ctl, 0);
516 return 0;
517}
518
519static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
520{
521 /*
522 * If we're not on the boundary we know we've modified the page and we
523 * need to crc the page.
524 */
525 if (io_ctl->cur != io_ctl->orig)
526 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
527 else
528 io_ctl_unmap_page(io_ctl);
529
530 while (io_ctl->index < io_ctl->num_pages) {
531 io_ctl_map_page(io_ctl, 1);
532 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
533 }
534}
535
536static int io_ctl_read_entry(struct io_ctl *io_ctl,
537 struct btrfs_free_space *entry, u8 *type)
538{
539 struct btrfs_free_space_entry *e;
540
541 e = io_ctl->cur;
542 entry->offset = le64_to_cpu(e->offset);
543 entry->bytes = le64_to_cpu(e->bytes);
544 *type = e->type;
545 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
546 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
547
548 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
549 return 0;
550
551 io_ctl_unmap_page(io_ctl);
552
553 if (io_ctl->index >= io_ctl->num_pages)
554 return 0;
555
556 return io_ctl_check_crc(io_ctl, io_ctl->index);
557}
558
559static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
560 struct btrfs_free_space *entry)
561{
562 int ret;
563
564 if (io_ctl->cur && io_ctl->cur != io_ctl->orig)
565 io_ctl_unmap_page(io_ctl);
566
567 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
568 if (ret)
569 return ret;
570
571 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
572 io_ctl_unmap_page(io_ctl);
573
574 return 0;
575}
576
245int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 577int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
246 struct btrfs_free_space_ctl *ctl, 578 struct btrfs_free_space_ctl *ctl,
247 struct btrfs_path *path, u64 offset) 579 struct btrfs_path *path, u64 offset)
248{ 580{
249 struct btrfs_free_space_header *header; 581 struct btrfs_free_space_header *header;
250 struct extent_buffer *leaf; 582 struct extent_buffer *leaf;
251 struct page *page; 583 struct io_ctl io_ctl;
252 struct btrfs_key key; 584 struct btrfs_key key;
585 struct btrfs_free_space *e, *n;
253 struct list_head bitmaps; 586 struct list_head bitmaps;
254 u64 num_entries; 587 u64 num_entries;
255 u64 num_bitmaps; 588 u64 num_bitmaps;
256 u64 generation; 589 u64 generation;
257 pgoff_t index = 0; 590 u8 type;
258 int ret = 0; 591 int ret = 0;
259 592
260 INIT_LIST_HEAD(&bitmaps); 593 INIT_LIST_HEAD(&bitmaps);
261 594
262 /* Nothing in the space cache, goodbye */ 595 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) 596 if (!i_size_read(inode))
264 goto out; 597 return 0;
265 598
266 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 599 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
267 key.offset = offset; 600 key.offset = offset;
@@ -269,11 +602,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
269 602
270 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 603 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
271 if (ret < 0) 604 if (ret < 0)
272 goto out; 605 return 0;
273 else if (ret > 0) { 606 else if (ret > 0) {
274 btrfs_release_path(path); 607 btrfs_release_path(path);
275 ret = 0; 608 return 0;
276 goto out;
277 } 609 }
278 610
279 ret = -1; 611 ret = -1;
@@ -291,169 +623,100 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
291 " not match free space cache generation (%llu)\n", 623 " not match free space cache generation (%llu)\n",
292 (unsigned long long)BTRFS_I(inode)->generation, 624 (unsigned long long)BTRFS_I(inode)->generation,
293 (unsigned long long)generation); 625 (unsigned long long)generation);
294 goto out; 626 return 0;
295 } 627 }
296 628
297 if (!num_entries) 629 if (!num_entries)
298 goto out; 630 return 0;
299 631
632 io_ctl_init(&io_ctl, inode, root);
300 ret = readahead_cache(inode); 633 ret = readahead_cache(inode);
301 if (ret) 634 if (ret)
302 goto out; 635 goto out;
303 636
304 while (1) { 637 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
305 struct btrfs_free_space_entry *entry; 638 if (ret)
306 struct btrfs_free_space *e; 639 goto out;
307 void *addr;
308 unsigned long offset = 0;
309 int need_loop = 0;
310 640
311 if (!num_entries && !num_bitmaps) 641 ret = io_ctl_check_crc(&io_ctl, 0);
312 break; 642 if (ret)
643 goto free_cache;
313 644
314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 645 ret = io_ctl_check_generation(&io_ctl, generation);
315 if (!page) 646 if (ret)
647 goto free_cache;
648
649 while (num_entries) {
650 e = kmem_cache_zalloc(btrfs_free_space_cachep,
651 GFP_NOFS);
652 if (!e)
316 goto free_cache; 653 goto free_cache;
317 654
318 if (!PageUptodate(page)) { 655 ret = io_ctl_read_entry(&io_ctl, e, &type);
319 btrfs_readpage(NULL, page); 656 if (ret) {
320 lock_page(page); 657 kmem_cache_free(btrfs_free_space_cachep, e);
321 if (!PageUptodate(page)) { 658 goto free_cache;
322 unlock_page(page);
323 page_cache_release(page);
324 printk(KERN_ERR "btrfs: error reading free "
325 "space cache\n");
326 goto free_cache;
327 }
328 } 659 }
329 addr = kmap(page);
330 660
331 if (index == 0) { 661 if (!e->bytes) {
332 u64 *gen; 662 kmem_cache_free(btrfs_free_space_cachep, e);
663 goto free_cache;
664 }
333 665
334 /* 666 if (type == BTRFS_FREE_SPACE_EXTENT) {
335 * We put a bogus crc in the front of the first page in 667 spin_lock(&ctl->tree_lock);
336 * case old kernels try to mount a fs with the new 668 ret = link_free_space(ctl, e);
337 * format to make sure they discard the cache. 669 spin_unlock(&ctl->tree_lock);
338 */ 670 if (ret) {
339 addr += sizeof(u64); 671 printk(KERN_ERR "Duplicate entries in "
340 offset += sizeof(u64); 672 "free space cache, dumping\n");
341 673 kmem_cache_free(btrfs_free_space_cachep, e);
342 gen = addr;
343 if (*gen != BTRFS_I(inode)->generation) {
344 printk(KERN_ERR "btrfs: space cache generation"
345 " (%llu) does not match inode (%llu)\n",
346 (unsigned long long)*gen,
347 (unsigned long long)
348 BTRFS_I(inode)->generation);
349 kunmap(page);
350 unlock_page(page);
351 page_cache_release(page);
352 goto free_cache; 674 goto free_cache;
353 } 675 }
354 addr += sizeof(u64); 676 } else {
355 offset += sizeof(u64); 677 BUG_ON(!num_bitmaps);
356 } 678 num_bitmaps--;
357 entry = addr; 679 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
358 680 if (!e->bitmap) {
359 while (1) { 681 kmem_cache_free(
360 if (!num_entries) 682 btrfs_free_space_cachep, e);
361 break;
362
363 need_loop = 1;
364 e = kmem_cache_zalloc(btrfs_free_space_cachep,
365 GFP_NOFS);
366 if (!e) {
367 kunmap(page);
368 unlock_page(page);
369 page_cache_release(page);
370 goto free_cache; 683 goto free_cache;
371 } 684 }
372 685 spin_lock(&ctl->tree_lock);
373 e->offset = le64_to_cpu(entry->offset); 686 ret = link_free_space(ctl, e);
374 e->bytes = le64_to_cpu(entry->bytes); 687 ctl->total_bitmaps++;
375 if (!e->bytes) { 688 ctl->op->recalc_thresholds(ctl);
376 kunmap(page); 689 spin_unlock(&ctl->tree_lock);
690 if (ret) {
691 printk(KERN_ERR "Duplicate entries in "
692 "free space cache, dumping\n");
377 kmem_cache_free(btrfs_free_space_cachep, e); 693 kmem_cache_free(btrfs_free_space_cachep, e);
378 unlock_page(page);
379 page_cache_release(page);
380 goto free_cache; 694 goto free_cache;
381 } 695 }
382 696 list_add_tail(&e->list, &bitmaps);
383 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
384 spin_lock(&ctl->tree_lock);
385 ret = link_free_space(ctl, e);
386 spin_unlock(&ctl->tree_lock);
387 if (ret) {
388 printk(KERN_ERR "Duplicate entries in "
389 "free space cache, dumping\n");
390 kunmap(page);
391 unlock_page(page);
392 page_cache_release(page);
393 goto free_cache;
394 }
395 } else {
396 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
397 if (!e->bitmap) {
398 kunmap(page);
399 kmem_cache_free(
400 btrfs_free_space_cachep, e);
401 unlock_page(page);
402 page_cache_release(page);
403 goto free_cache;
404 }
405 spin_lock(&ctl->tree_lock);
406 ret = link_free_space(ctl, e);
407 ctl->total_bitmaps++;
408 ctl->op->recalc_thresholds(ctl);
409 spin_unlock(&ctl->tree_lock);
410 if (ret) {
411 printk(KERN_ERR "Duplicate entries in "
412 "free space cache, dumping\n");
413 kunmap(page);
414 unlock_page(page);
415 page_cache_release(page);
416 goto free_cache;
417 }
418 list_add_tail(&e->list, &bitmaps);
419 }
420
421 num_entries--;
422 offset += sizeof(struct btrfs_free_space_entry);
423 if (offset + sizeof(struct btrfs_free_space_entry) >=
424 PAGE_CACHE_SIZE)
425 break;
426 entry++;
427 } 697 }
428 698
429 /* 699 num_entries--;
430 * We read an entry out of this page, we need to move on to the 700 }
431 * next page.
432 */
433 if (need_loop) {
434 kunmap(page);
435 goto next;
436 }
437 701
438 /* 702 /*
439 * We add the bitmaps at the end of the entries in order that 703 * We add the bitmaps at the end of the entries in order that
440 * the bitmap entries are added to the cache. 704 * the bitmap entries are added to the cache.
441 */ 705 */
442 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 706 list_for_each_entry_safe(e, n, &bitmaps, list) {
443 list_del_init(&e->list); 707 list_del_init(&e->list);
444 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 708 ret = io_ctl_read_bitmap(&io_ctl, e);
445 kunmap(page); 709 if (ret)
446 num_bitmaps--; 710 goto free_cache;
447next:
448 unlock_page(page);
449 page_cache_release(page);
450 index++;
451 } 711 }
452 712
713 io_ctl_drop_pages(&io_ctl);
453 ret = 1; 714 ret = 1;
454out: 715out:
716 io_ctl_free(&io_ctl);
455 return ret; 717 return ret;
456free_cache: 718free_cache:
719 io_ctl_drop_pages(&io_ctl);
457 __btrfs_remove_free_space_cache(ctl); 720 __btrfs_remove_free_space_cache(ctl);
458 goto out; 721 goto out;
459} 722}
@@ -465,7 +728,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
465 struct btrfs_root *root = fs_info->tree_root; 728 struct btrfs_root *root = fs_info->tree_root;
466 struct inode *inode; 729 struct inode *inode;
467 struct btrfs_path *path; 730 struct btrfs_path *path;
468 int ret; 731 int ret = 0;
469 bool matched; 732 bool matched;
470 u64 used = btrfs_block_group_used(&block_group->item); 733 u64 used = btrfs_block_group_used(&block_group->item);
471 734
@@ -497,6 +760,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
497 return 0; 760 return 0;
498 } 761 }
499 762
763 /* We may have converted the inode and made the cache invalid. */
764 spin_lock(&block_group->lock);
765 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
766 spin_unlock(&block_group->lock);
767 goto out;
768 }
769 spin_unlock(&block_group->lock);
770
500 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 771 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
501 path, block_group->key.objectid); 772 path, block_group->key.objectid);
502 btrfs_free_path(path); 773 btrfs_free_path(path);
@@ -530,6 +801,19 @@ out:
530 return ret; 801 return ret;
531} 802}
532 803
804/**
805 * __btrfs_write_out_cache - write out cached info to an inode
806 * @root - the root the inode belongs to
807 * @ctl - the free space cache we are going to write out
808 * @block_group - the block_group for this cache if it belongs to a block_group
809 * @trans - the trans handle
810 * @path - the path to use
811 * @offset - the offset for the key we'll insert
812 *
813 * This function writes out a free space cache struct to disk for quick recovery
814 * on mount. This will return 0 if it was successfull in writing the cache out,
815 * and -1 if it was not.
816 */
533int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 817int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
534 struct btrfs_free_space_ctl *ctl, 818 struct btrfs_free_space_ctl *ctl,
535 struct btrfs_block_group_cache *block_group, 819 struct btrfs_block_group_cache *block_group,
@@ -540,42 +824,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
540 struct extent_buffer *leaf; 824 struct extent_buffer *leaf;
541 struct rb_node *node; 825 struct rb_node *node;
542 struct list_head *pos, *n; 826 struct list_head *pos, *n;
543 struct page **pages;
544 struct page *page;
545 struct extent_state *cached_state = NULL; 827 struct extent_state *cached_state = NULL;
546 struct btrfs_free_cluster *cluster = NULL; 828 struct btrfs_free_cluster *cluster = NULL;
547 struct extent_io_tree *unpin = NULL; 829 struct extent_io_tree *unpin = NULL;
830 struct io_ctl io_ctl;
548 struct list_head bitmap_list; 831 struct list_head bitmap_list;
549 struct btrfs_key key; 832 struct btrfs_key key;
550 u64 start, end, len; 833 u64 start, end, len;
551 u64 bytes = 0;
552 u32 crc = ~(u32)0;
553 int index = 0, num_pages = 0;
554 int entries = 0; 834 int entries = 0;
555 int bitmaps = 0; 835 int bitmaps = 0;
556 int ret = -1; 836 int ret;
557 bool next_page = false; 837 int err = -1;
558 bool out_of_space = false;
559 838
560 INIT_LIST_HEAD(&bitmap_list); 839 INIT_LIST_HEAD(&bitmap_list);
561 840
562 node = rb_first(&ctl->free_space_offset);
563 if (!node)
564 return 0;
565
566 if (!i_size_read(inode)) 841 if (!i_size_read(inode))
567 return -1; 842 return -1;
568 843
569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 844 io_ctl_init(&io_ctl, inode, root);
570 PAGE_CACHE_SHIFT;
571
572 filemap_write_and_wait(inode->i_mapping);
573 btrfs_wait_ordered_range(inode, inode->i_size &
574 ~(root->sectorsize - 1), (u64)-1);
575
576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
577 if (!pages)
578 return -1;
579 845
580 /* Get the cluster for this block_group if it exists */ 846 /* Get the cluster for this block_group if it exists */
581 if (block_group && !list_empty(&block_group->cluster_list)) 847 if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +855,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
589 */ 855 */
590 unpin = root->fs_info->pinned_extents; 856 unpin = root->fs_info->pinned_extents;
591 857
592 /* 858 /* Lock all pages first so we can lock the extent safely. */
593 * Lock all pages first so we can lock the extent safely. 859 io_ctl_prepare_pages(&io_ctl, inode, 0);
594 *
595 * NOTE: Because we hold the ref the entire time we're going to write to
596 * the page find_get_page should never fail, so we don't do a check
597 * after find_get_page at this point. Just putting this here so people
598 * know and don't freak out.
599 */
600 while (index < num_pages) {
601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
602 if (!page) {
603 int i;
604
605 for (i = 0; i < num_pages; i++) {
606 unlock_page(pages[i]);
607 page_cache_release(pages[i]);
608 }
609 goto out;
610 }
611 pages[index] = page;
612 index++;
613 }
614 860
615 index = 0;
616 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 861 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
617 0, &cached_state, GFP_NOFS); 862 0, &cached_state, GFP_NOFS);
618 863
@@ -623,189 +868,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
623 if (block_group) 868 if (block_group)
624 start = block_group->key.objectid; 869 start = block_group->key.objectid;
625 870
626 /* Write out the extent entries */ 871 node = rb_first(&ctl->free_space_offset);
627 do { 872 if (!node && cluster) {
628 struct btrfs_free_space_entry *entry; 873 node = rb_first(&cluster->root);
629 void *addr, *orig; 874 cluster = NULL;
630 unsigned long offset = 0; 875 }
631 876
632 next_page = false; 877 /* Make sure we can fit our crcs into the first page */
878 if (io_ctl.check_crcs &&
879 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
880 WARN_ON(1);
881 goto out_nospc;
882 }
633 883
634 if (index >= num_pages) { 884 io_ctl_set_generation(&io_ctl, trans->transid);
635 out_of_space = true;
636 break;
637 }
638 885
639 page = pages[index]; 886 /* Write out the extent entries */
887 while (node) {
888 struct btrfs_free_space *e;
640 889
641 orig = addr = kmap(page); 890 e = rb_entry(node, struct btrfs_free_space, offset_index);
642 if (index == 0) { 891 entries++;
643 u64 *gen;
644 892
645 /* 893 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
646 * We're going to put in a bogus crc for this page to 894 e->bitmap);
647 * make sure that old kernels who aren't aware of this 895 if (ret)
648 * format will be sure to discard the cache. 896 goto out_nospc;
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652 897
653 gen = addr; 898 if (e->bitmap) {
654 *gen = trans->transid; 899 list_add_tail(&e->list, &bitmap_list);
655 addr += sizeof(u64); 900 bitmaps++;
656 offset += sizeof(u64);
657 } 901 }
658 entry = addr; 902 node = rb_next(node);
659 903 if (!node && cluster) {
660 memset(addr, 0, PAGE_CACHE_SIZE - offset); 904 node = rb_first(&cluster->root);
661 while (node && !next_page) { 905 cluster = NULL;
662 struct btrfs_free_space *e;
663
664 e = rb_entry(node, struct btrfs_free_space, offset_index);
665 entries++;
666
667 entry->offset = cpu_to_le64(e->offset);
668 entry->bytes = cpu_to_le64(e->bytes);
669 if (e->bitmap) {
670 entry->type = BTRFS_FREE_SPACE_BITMAP;
671 list_add_tail(&e->list, &bitmap_list);
672 bitmaps++;
673 } else {
674 entry->type = BTRFS_FREE_SPACE_EXTENT;
675 }
676 node = rb_next(node);
677 if (!node && cluster) {
678 node = rb_first(&cluster->root);
679 cluster = NULL;
680 }
681 offset += sizeof(struct btrfs_free_space_entry);
682 if (offset + sizeof(struct btrfs_free_space_entry) >=
683 PAGE_CACHE_SIZE)
684 next_page = true;
685 entry++;
686 } 906 }
907 }
687 908
688 /* 909 /*
689 * We want to add any pinned extents to our free space cache 910 * We want to add any pinned extents to our free space cache
690 * so we don't leak the space 911 * so we don't leak the space
691 */ 912 */
692 while (block_group && !next_page && 913 while (block_group && (start < block_group->key.objectid +
693 (start < block_group->key.objectid + 914 block_group->key.offset)) {
694 block_group->key.offset)) { 915 ret = find_first_extent_bit(unpin, start, &start, &end,
695 ret = find_first_extent_bit(unpin, start, &start, &end, 916 EXTENT_DIRTY);
696 EXTENT_DIRTY); 917 if (ret) {
697 if (ret) { 918 ret = 0;
698 ret = 0; 919 break;
699 break;
700 }
701
702 /* This pinned extent is out of our range */
703 if (start >= block_group->key.objectid +
704 block_group->key.offset)
705 break;
706
707 len = block_group->key.objectid +
708 block_group->key.offset - start;
709 len = min(len, end + 1 - start);
710
711 entries++;
712 entry->offset = cpu_to_le64(start);
713 entry->bytes = cpu_to_le64(len);
714 entry->type = BTRFS_FREE_SPACE_EXTENT;
715
716 start = end + 1;
717 offset += sizeof(struct btrfs_free_space_entry);
718 if (offset + sizeof(struct btrfs_free_space_entry) >=
719 PAGE_CACHE_SIZE)
720 next_page = true;
721 entry++;
722 } 920 }
723 921
724 /* Generate bogus crc value */ 922 /* This pinned extent is out of our range */
725 if (index == 0) { 923 if (start >= block_group->key.objectid +
726 u32 *tmp; 924 block_group->key.offset)
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc, 925 break;
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734 926
735 kunmap(page); 927 len = block_group->key.objectid +
928 block_group->key.offset - start;
929 len = min(len, end + 1 - start);
736 930
737 bytes += PAGE_CACHE_SIZE; 931 entries++;
932 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
933 if (ret)
934 goto out_nospc;
738 935
739 index++; 936 start = end + 1;
740 } while (node || next_page); 937 }
741 938
742 /* Write out the bitmaps */ 939 /* Write out the bitmaps */
743 list_for_each_safe(pos, n, &bitmap_list) { 940 list_for_each_safe(pos, n, &bitmap_list) {
744 void *addr;
745 struct btrfs_free_space *entry = 941 struct btrfs_free_space *entry =
746 list_entry(pos, struct btrfs_free_space, list); 942 list_entry(pos, struct btrfs_free_space, list);
747 943
748 if (index >= num_pages) { 944 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
749 out_of_space = true; 945 if (ret)
750 break; 946 goto out_nospc;
751 }
752 page = pages[index];
753
754 addr = kmap(page);
755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
756 kunmap(page);
757 bytes += PAGE_CACHE_SIZE;
758
759 list_del_init(&entry->list); 947 list_del_init(&entry->list);
760 index++;
761 }
762
763 if (out_of_space) {
764 btrfs_drop_pages(pages, num_pages);
765 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
766 i_size_read(inode) - 1, &cached_state,
767 GFP_NOFS);
768 ret = 0;
769 goto out;
770 } 948 }
771 949
772 /* Zero out the rest of the pages just to make sure */ 950 /* Zero out the rest of the pages just to make sure */
773 while (index < num_pages) { 951 io_ctl_zero_remaining_pages(&io_ctl);
774 void *addr;
775
776 page = pages[index];
777 addr = kmap(page);
778 memset(addr, 0, PAGE_CACHE_SIZE);
779 kunmap(page);
780 bytes += PAGE_CACHE_SIZE;
781 index++;
782 }
783 952
784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, 953 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
785 bytes, &cached_state); 954 0, i_size_read(inode), &cached_state);
786 btrfs_drop_pages(pages, num_pages); 955 io_ctl_drop_pages(&io_ctl);
787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 956 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
788 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 957 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
789 958
790 if (ret) { 959 if (ret)
791 ret = 0;
792 goto out; 960 goto out;
793 }
794 961
795 BTRFS_I(inode)->generation = trans->transid;
796 962
797 filemap_write_and_wait(inode->i_mapping); 963 ret = filemap_write_and_wait(inode->i_mapping);
964 if (ret)
965 goto out;
798 966
799 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 967 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
800 key.offset = offset; 968 key.offset = offset;
801 key.type = 0; 969 key.type = 0;
802 970
803 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 971 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
804 if (ret < 0) { 972 if (ret < 0) {
805 ret = -1; 973 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 974 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
807 EXTENT_DIRTY | EXTENT_DELALLOC | 975 GFP_NOFS);
808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
809 goto out; 976 goto out;
810 } 977 }
811 leaf = path->nodes[0]; 978 leaf = path->nodes[0];
@@ -816,15 +983,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
816 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 983 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
817 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 984 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
818 found_key.offset != offset) { 985 found_key.offset != offset) {
819 ret = -1; 986 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
820 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 987 inode->i_size - 1,
821 EXTENT_DIRTY | EXTENT_DELALLOC | 988 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
822 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 989 NULL, GFP_NOFS);
823 GFP_NOFS);
824 btrfs_release_path(path); 990 btrfs_release_path(path);
825 goto out; 991 goto out;
826 } 992 }
827 } 993 }
994
995 BTRFS_I(inode)->generation = trans->transid;
828 header = btrfs_item_ptr(leaf, path->slots[0], 996 header = btrfs_item_ptr(leaf, path->slots[0],
829 struct btrfs_free_space_header); 997 struct btrfs_free_space_header);
830 btrfs_set_free_space_entries(leaf, header, entries); 998 btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +1001,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 btrfs_mark_buffer_dirty(leaf); 1001 btrfs_mark_buffer_dirty(leaf);
834 btrfs_release_path(path); 1002 btrfs_release_path(path);
835 1003
836 ret = 1; 1004 err = 0;
837
838out: 1005out:
839 kfree(pages); 1006 io_ctl_free(&io_ctl);
840 if (ret != 1) { 1007 if (err) {
841 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 1008 invalidate_inode_pages2(inode->i_mapping);
842 BTRFS_I(inode)->generation = 0; 1009 BTRFS_I(inode)->generation = 0;
843 } 1010 }
844 btrfs_update_inode(trans, root, inode); 1011 btrfs_update_inode(trans, root, inode);
845 return ret; 1012 return err;
1013
1014out_nospc:
1015 list_for_each_safe(pos, n, &bitmap_list) {
1016 struct btrfs_free_space *entry =
1017 list_entry(pos, struct btrfs_free_space, list);
1018 list_del_init(&entry->list);
1019 }
1020 io_ctl_drop_pages(&io_ctl);
1021 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1022 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1023 goto out;
846} 1024}
847 1025
848int btrfs_write_out_cache(struct btrfs_root *root, 1026int btrfs_write_out_cache(struct btrfs_root *root,
@@ -869,14 +1047,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
869 1047
870 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1048 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
871 path, block_group->key.objectid); 1049 path, block_group->key.objectid);
872 if (ret < 0) { 1050 if (ret) {
873 spin_lock(&block_group->lock); 1051 spin_lock(&block_group->lock);
874 block_group->disk_cache_state = BTRFS_DC_ERROR; 1052 block_group->disk_cache_state = BTRFS_DC_ERROR;
875 spin_unlock(&block_group->lock); 1053 spin_unlock(&block_group->lock);
876 ret = 0; 1054 ret = 0;
877 1055#ifdef DEBUG
878 printk(KERN_ERR "btrfs: failed to write free space cace " 1056 printk(KERN_ERR "btrfs: failed to write free space cace "
879 "for block group %llu\n", block_group->key.objectid); 1057 "for block group %llu\n", block_group->key.objectid);
1058#endif
880 } 1059 }
881 1060
882 iput(inode); 1061 iput(inode);
@@ -1701,6 +1880,7 @@ again:
1701 ctl->total_bitmaps--; 1880 ctl->total_bitmaps--;
1702 } 1881 }
1703 kmem_cache_free(btrfs_free_space_cachep, info); 1882 kmem_cache_free(btrfs_free_space_cachep, info);
1883 ret = 0;
1704 goto out_lock; 1884 goto out_lock;
1705 } 1885 }
1706 1886
@@ -1708,7 +1888,8 @@ again:
1708 unlink_free_space(ctl, info); 1888 unlink_free_space(ctl, info);
1709 info->offset += bytes; 1889 info->offset += bytes;
1710 info->bytes -= bytes; 1890 info->bytes -= bytes;
1711 link_free_space(ctl, info); 1891 ret = link_free_space(ctl, info);
1892 WARN_ON(ret);
1712 goto out_lock; 1893 goto out_lock;
1713 } 1894 }
1714 1895
@@ -2472,9 +2653,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2472 spin_unlock(&ctl->tree_lock); 2653 spin_unlock(&ctl->tree_lock);
2473 2654
2474 if (bytes >= minlen) { 2655 if (bytes >= minlen) {
2475 int update_ret; 2656 struct btrfs_space_info *space_info;
2476 update_ret = btrfs_update_reserved_bytes(block_group, 2657 int update = 0;
2477 bytes, 1, 1); 2658
2659 space_info = block_group->space_info;
2660 spin_lock(&space_info->lock);
2661 spin_lock(&block_group->lock);
2662 if (!block_group->ro) {
2663 block_group->reserved += bytes;
2664 space_info->bytes_reserved += bytes;
2665 update = 1;
2666 }
2667 spin_unlock(&block_group->lock);
2668 spin_unlock(&space_info->lock);
2478 2669
2479 ret = btrfs_error_discard_extent(fs_info->extent_root, 2670 ret = btrfs_error_discard_extent(fs_info->extent_root,
2480 start, 2671 start,
@@ -2482,9 +2673,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2482 &actually_trimmed); 2673 &actually_trimmed);
2483 2674
2484 btrfs_add_free_space(block_group, start, bytes); 2675 btrfs_add_free_space(block_group, start, bytes);
2485 if (!update_ret) 2676 if (update) {
2486 btrfs_update_reserved_bytes(block_group, 2677 spin_lock(&space_info->lock);
2487 bytes, 0, 1); 2678 spin_lock(&block_group->lock);
2679 if (block_group->ro)
2680 space_info->bytes_readonly += bytes;
2681 block_group->reserved -= bytes;
2682 space_info->bytes_reserved -= bytes;
2683 spin_unlock(&space_info->lock);
2684 spin_unlock(&block_group->lock);
2685 }
2488 2686
2489 if (ret) 2687 if (ret)
2490 break; 2688 break;
@@ -2643,9 +2841,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2643 return 0; 2841 return 0;
2644 2842
2645 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 2843 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2646 if (ret < 0) 2844 if (ret) {
2845 btrfs_delalloc_release_metadata(inode, inode->i_size);
2846#ifdef DEBUG
2647 printk(KERN_ERR "btrfs: failed to write free ino cache " 2847 printk(KERN_ERR "btrfs: failed to write free ino cache "
2648 "for root %llu\n", root->root_key.objectid); 2848 "for root %llu\n", root->root_key.objectid);
2849#endif
2850 }
2649 2851
2650 iput(inode); 2852 iput(inode);
2651 return ret; 2853 return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa871..53dcbdf446cd 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -465,14 +465,16 @@ again:
465 /* Just to make sure we have enough space */ 465 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE; 466 prealloc += 8 * PAGE_CACHE_SIZE;
467 467
468 ret = btrfs_check_data_free_space(inode, prealloc); 468 ret = btrfs_delalloc_reserve_space(inode, prealloc);
469 if (ret) 469 if (ret)
470 goto out_put; 470 goto out_put;
471 471
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint); 473 prealloc, prealloc, &alloc_hint);
474 if (ret) 474 if (ret) {
475 btrfs_delalloc_release_space(inode, prealloc);
475 goto out_put; 476 goto out_put;
477 }
476 btrfs_free_reserved_data_space(inode, prealloc); 478 btrfs_free_reserved_data_space(inode, prealloc);
477 479
478out_put: 480out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 75686a61bd45..966ddcc4c63d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,10 +45,10 @@
45#include "btrfs_inode.h" 45#include "btrfs_inode.h"
46#include "ioctl.h" 46#include "ioctl.h"
47#include "print-tree.h" 47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h" 48#include "ordered-data.h"
50#include "xattr.h" 49#include "xattr.h"
51#include "tree-log.h" 50#include "tree-log.h"
51#include "volumes.h"
52#include "compression.h" 52#include "compression.h"
53#include "locking.h" 53#include "locking.h"
54#include "free-space-cache.h" 54#include "free-space-cache.h"
@@ -393,7 +393,10 @@ again:
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
394 WARN_ON(pages); 394 WARN_ON(pages);
395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages); 396 if (!pages) {
397 /* just bail out to the uncompressed code */
398 goto cont;
399 }
397 400
398 if (BTRFS_I(inode)->force_compress) 401 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress; 402 compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +427,7 @@ again:
424 will_compress = 1; 427 will_compress = 1;
425 } 428 }
426 } 429 }
430cont:
427 if (start == 0) { 431 if (start == 0) {
428 trans = btrfs_join_transaction(root); 432 trans = btrfs_join_transaction(root);
429 BUG_ON(IS_ERR(trans)); 433 BUG_ON(IS_ERR(trans));
@@ -820,7 +824,7 @@ static noinline int cow_file_range(struct inode *inode,
820 } 824 }
821 825
822 BUG_ON(disk_num_bytes > 826 BUG_ON(disk_num_bytes >
823 btrfs_super_total_bytes(&root->fs_info->super_copy)); 827 btrfs_super_total_bytes(root->fs_info->super_copy));
824 828
825 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 829 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
826 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 830 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1792,12 +1796,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1792 } 1796 }
1793 ret = 0; 1797 ret = 0;
1794out: 1798out:
1795 if (nolock) { 1799 if (root != root->fs_info->tree_root)
1796 if (trans)
1797 btrfs_end_transaction_nolock(trans, root);
1798 } else {
1799 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1800 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1800 if (trans) 1801 if (trans) {
1802 if (nolock)
1803 btrfs_end_transaction_nolock(trans, root);
1804 else
1801 btrfs_end_transaction(trans, root); 1805 btrfs_end_transaction(trans, root);
1802 } 1806 }
1803 1807
@@ -1819,153 +1823,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1819} 1823}
1820 1824
1821/* 1825/*
1822 * When IO fails, either with EIO or csum verification fails, we
1823 * try other mirrors that might have a good copy of the data. This
1824 * io_failure_record is used to record state as we go through all the
1825 * mirrors. If another mirror has good data, the page is set up to date
1826 * and things continue. If a good mirror can't be found, the original
1827 * bio end_io callback is called to indicate things have failed.
1828 */
1829struct io_failure_record {
1830 struct page *page;
1831 u64 start;
1832 u64 len;
1833 u64 logical;
1834 unsigned long bio_flags;
1835 int last_mirror;
1836};
1837
1838static int btrfs_io_failed_hook(struct bio *failed_bio,
1839 struct page *page, u64 start, u64 end,
1840 struct extent_state *state)
1841{
1842 struct io_failure_record *failrec = NULL;
1843 u64 private;
1844 struct extent_map *em;
1845 struct inode *inode = page->mapping->host;
1846 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1847 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1848 struct bio *bio;
1849 int num_copies;
1850 int ret;
1851 int rw;
1852 u64 logical;
1853
1854 ret = get_state_private(failure_tree, start, &private);
1855 if (ret) {
1856 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1857 if (!failrec)
1858 return -ENOMEM;
1859 failrec->start = start;
1860 failrec->len = end - start + 1;
1861 failrec->last_mirror = 0;
1862 failrec->bio_flags = 0;
1863
1864 read_lock(&em_tree->lock);
1865 em = lookup_extent_mapping(em_tree, start, failrec->len);
1866 if (em->start > start || em->start + em->len < start) {
1867 free_extent_map(em);
1868 em = NULL;
1869 }
1870 read_unlock(&em_tree->lock);
1871
1872 if (IS_ERR_OR_NULL(em)) {
1873 kfree(failrec);
1874 return -EIO;
1875 }
1876 logical = start - em->start;
1877 logical = em->block_start + logical;
1878 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1879 logical = em->block_start;
1880 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1881 extent_set_compress_type(&failrec->bio_flags,
1882 em->compress_type);
1883 }
1884 failrec->logical = logical;
1885 free_extent_map(em);
1886 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1887 EXTENT_DIRTY, GFP_NOFS);
1888 set_state_private(failure_tree, start,
1889 (u64)(unsigned long)failrec);
1890 } else {
1891 failrec = (struct io_failure_record *)(unsigned long)private;
1892 }
1893 num_copies = btrfs_num_copies(
1894 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1895 failrec->logical, failrec->len);
1896 failrec->last_mirror++;
1897 if (!state) {
1898 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1899 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1900 failrec->start,
1901 EXTENT_LOCKED);
1902 if (state && state->start != failrec->start)
1903 state = NULL;
1904 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1905 }
1906 if (!state || failrec->last_mirror > num_copies) {
1907 set_state_private(failure_tree, failrec->start, 0);
1908 clear_extent_bits(failure_tree, failrec->start,
1909 failrec->start + failrec->len - 1,
1910 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1911 kfree(failrec);
1912 return -EIO;
1913 }
1914 bio = bio_alloc(GFP_NOFS, 1);
1915 bio->bi_private = state;
1916 bio->bi_end_io = failed_bio->bi_end_io;
1917 bio->bi_sector = failrec->logical >> 9;
1918 bio->bi_bdev = failed_bio->bi_bdev;
1919 bio->bi_size = 0;
1920
1921 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1922 if (failed_bio->bi_rw & REQ_WRITE)
1923 rw = WRITE;
1924 else
1925 rw = READ;
1926
1927 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1928 failrec->last_mirror,
1929 failrec->bio_flags, 0);
1930 return ret;
1931}
1932
1933/*
1934 * each time an IO finishes, we do a fast check in the IO failure tree
1935 * to see if we need to process or clean up an io_failure_record
1936 */
1937static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1938{
1939 u64 private;
1940 u64 private_failure;
1941 struct io_failure_record *failure;
1942 int ret;
1943
1944 private = 0;
1945 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1946 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1947 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1948 start, &private_failure);
1949 if (ret == 0) {
1950 failure = (struct io_failure_record *)(unsigned long)
1951 private_failure;
1952 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1953 failure->start, 0);
1954 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1955 failure->start,
1956 failure->start + failure->len - 1,
1957 EXTENT_DIRTY | EXTENT_LOCKED,
1958 GFP_NOFS);
1959 kfree(failure);
1960 }
1961 }
1962 return 0;
1963}
1964
1965/*
1966 * when reads are done, we need to check csums to verify the data is correct 1826 * when reads are done, we need to check csums to verify the data is correct
1967 * if there's a match, we allow the bio to finish. If not, we go through 1827 * if there's a match, we allow the bio to finish. If not, the code in
1968 * the io_failure_record routines to find good copies 1828 * extent_io.c will try to find good copies for us.
1969 */ 1829 */
1970static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1830static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1971 struct extent_state *state) 1831 struct extent_state *state)
@@ -2011,10 +1871,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2011 1871
2012 kunmap_atomic(kaddr, KM_USER0); 1872 kunmap_atomic(kaddr, KM_USER0);
2013good: 1873good:
2014 /* if the io failure tree for this inode is non-empty,
2015 * check to see if we've recovered from a failed IO
2016 */
2017 btrfs_clean_io_failures(inode, start);
2018 return 0; 1874 return 0;
2019 1875
2020zeroit: 1876zeroit:
@@ -2079,89 +1935,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2079 up_read(&root->fs_info->cleanup_work_sem); 1935 up_read(&root->fs_info->cleanup_work_sem);
2080} 1936}
2081 1937
2082/*
2083 * calculate extra metadata reservation when snapshotting a subvolume
2084 * contains orphan files.
2085 */
2086void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2087 struct btrfs_pending_snapshot *pending,
2088 u64 *bytes_to_reserve)
2089{
2090 struct btrfs_root *root;
2091 struct btrfs_block_rsv *block_rsv;
2092 u64 num_bytes;
2093 int index;
2094
2095 root = pending->root;
2096 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2097 return;
2098
2099 block_rsv = root->orphan_block_rsv;
2100
2101 /* orphan block reservation for the snapshot */
2102 num_bytes = block_rsv->size;
2103
2104 /*
2105 * after the snapshot is created, COWing tree blocks may use more
2106 * space than it frees. So we should make sure there is enough
2107 * reserved space.
2108 */
2109 index = trans->transid & 0x1;
2110 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2111 num_bytes += block_rsv->size -
2112 (block_rsv->reserved + block_rsv->freed[index]);
2113 }
2114
2115 *bytes_to_reserve += num_bytes;
2116}
2117
2118void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2119 struct btrfs_pending_snapshot *pending)
2120{
2121 struct btrfs_root *root = pending->root;
2122 struct btrfs_root *snap = pending->snap;
2123 struct btrfs_block_rsv *block_rsv;
2124 u64 num_bytes;
2125 int index;
2126 int ret;
2127
2128 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2129 return;
2130
2131 /* refill source subvolume's orphan block reservation */
2132 block_rsv = root->orphan_block_rsv;
2133 index = trans->transid & 0x1;
2134 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2135 num_bytes = block_rsv->size -
2136 (block_rsv->reserved + block_rsv->freed[index]);
2137 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2138 root->orphan_block_rsv,
2139 num_bytes);
2140 BUG_ON(ret);
2141 }
2142
2143 /* setup orphan block reservation for the snapshot */
2144 block_rsv = btrfs_alloc_block_rsv(snap);
2145 BUG_ON(!block_rsv);
2146
2147 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2148 snap->orphan_block_rsv = block_rsv;
2149
2150 num_bytes = root->orphan_block_rsv->size;
2151 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2152 block_rsv, num_bytes);
2153 BUG_ON(ret);
2154
2155#if 0
2156 /* insert orphan item for the snapshot */
2157 WARN_ON(!root->orphan_item_inserted);
2158 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2159 snap->root_key.objectid);
2160 BUG_ON(ret);
2161 snap->orphan_item_inserted = 1;
2162#endif
2163}
2164
2165enum btrfs_orphan_cleanup_state { 1938enum btrfs_orphan_cleanup_state {
2166 ORPHAN_CLEANUP_STARTED = 1, 1939 ORPHAN_CLEANUP_STARTED = 1,
2167 ORPHAN_CLEANUP_DONE = 2, 1940 ORPHAN_CLEANUP_DONE = 2,
@@ -2247,9 +2020,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2247 } 2020 }
2248 spin_unlock(&root->orphan_lock); 2021 spin_unlock(&root->orphan_lock);
2249 2022
2250 if (block_rsv)
2251 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2252
2253 /* grab metadata reservation from transaction handle */ 2023 /* grab metadata reservation from transaction handle */
2254 if (reserve) { 2024 if (reserve) {
2255 ret = btrfs_orphan_reserve_metadata(trans, inode); 2025 ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2316,6 +2086,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2316 struct btrfs_key key, found_key; 2086 struct btrfs_key key, found_key;
2317 struct btrfs_trans_handle *trans; 2087 struct btrfs_trans_handle *trans;
2318 struct inode *inode; 2088 struct inode *inode;
2089 u64 last_objectid = 0;
2319 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2090 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2320 2091
2321 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2092 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2367,41 +2138,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2367 * crossing root thing. we store the inode number in the 2138 * crossing root thing. we store the inode number in the
2368 * offset of the orphan item. 2139 * offset of the orphan item.
2369 */ 2140 */
2141
2142 if (found_key.offset == last_objectid) {
2143 printk(KERN_ERR "btrfs: Error removing orphan entry, "
2144 "stopping orphan cleanup\n");
2145 ret = -EINVAL;
2146 goto out;
2147 }
2148
2149 last_objectid = found_key.offset;
2150
2370 found_key.objectid = found_key.offset; 2151 found_key.objectid = found_key.offset;
2371 found_key.type = BTRFS_INODE_ITEM_KEY; 2152 found_key.type = BTRFS_INODE_ITEM_KEY;
2372 found_key.offset = 0; 2153 found_key.offset = 0;
2373 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2154 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2374 if (IS_ERR(inode)) { 2155 ret = PTR_RET(inode);
2375 ret = PTR_ERR(inode); 2156 if (ret && ret != -ESTALE)
2376 goto out; 2157 goto out;
2377 }
2378 2158
2379 /* 2159 /*
2380 * add this inode to the orphan list so btrfs_orphan_del does 2160 * Inode is already gone but the orphan item is still there,
2381 * the proper thing when we hit it 2161 * kill the orphan item.
2382 */ 2162 */
2383 spin_lock(&root->orphan_lock); 2163 if (ret == -ESTALE) {
2384 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2164 trans = btrfs_start_transaction(root, 1);
2385 spin_unlock(&root->orphan_lock);
2386
2387 /*
2388 * if this is a bad inode, means we actually succeeded in
2389 * removing the inode, but not the orphan record, which means
2390 * we need to manually delete the orphan since iput will just
2391 * do a destroy_inode
2392 */
2393 if (is_bad_inode(inode)) {
2394 trans = btrfs_start_transaction(root, 0);
2395 if (IS_ERR(trans)) { 2165 if (IS_ERR(trans)) {
2396 ret = PTR_ERR(trans); 2166 ret = PTR_ERR(trans);
2397 goto out; 2167 goto out;
2398 } 2168 }
2399 btrfs_orphan_del(trans, inode); 2169 ret = btrfs_del_orphan_item(trans, root,
2170 found_key.objectid);
2171 BUG_ON(ret);
2400 btrfs_end_transaction(trans, root); 2172 btrfs_end_transaction(trans, root);
2401 iput(inode);
2402 continue; 2173 continue;
2403 } 2174 }
2404 2175
2176 /*
2177 * add this inode to the orphan list so btrfs_orphan_del does
2178 * the proper thing when we hit it
2179 */
2180 spin_lock(&root->orphan_lock);
2181 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2182 spin_unlock(&root->orphan_lock);
2183
2405 /* if we have links, this was a truncate, lets do that */ 2184 /* if we have links, this was a truncate, lets do that */
2406 if (inode->i_nlink) { 2185 if (inode->i_nlink) {
2407 if (!S_ISREG(inode->i_mode)) { 2186 if (!S_ISREG(inode->i_mode)) {
@@ -2835,7 +2614,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2835 u64 ino = btrfs_ino(inode); 2614 u64 ino = btrfs_ino(inode);
2836 u64 dir_ino = btrfs_ino(dir); 2615 u64 dir_ino = btrfs_ino(dir);
2837 2616
2838 trans = btrfs_start_transaction(root, 10); 2617 /*
2618 * 1 for the possible orphan item
2619 * 1 for the dir item
2620 * 1 for the dir index
2621 * 1 for the inode ref
2622 * 1 for the inode ref in the tree log
2623 * 2 for the dir entries in the log
2624 * 1 for the inode
2625 */
2626 trans = btrfs_start_transaction(root, 8);
2839 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2627 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2840 return trans; 2628 return trans;
2841 2629
@@ -2858,7 +2646,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2858 return ERR_PTR(-ENOMEM); 2646 return ERR_PTR(-ENOMEM);
2859 } 2647 }
2860 2648
2861 trans = btrfs_start_transaction(root, 0); 2649 /* 1 for the orphan item */
2650 trans = btrfs_start_transaction(root, 1);
2862 if (IS_ERR(trans)) { 2651 if (IS_ERR(trans)) {
2863 btrfs_free_path(path); 2652 btrfs_free_path(path);
2864 root->fs_info->enospc_unlink = 0; 2653 root->fs_info->enospc_unlink = 0;
@@ -2963,6 +2752,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2963 err = 0; 2752 err = 0;
2964out: 2753out:
2965 btrfs_free_path(path); 2754 btrfs_free_path(path);
2755 /* Migrate the orphan reservation over */
2756 if (!err)
2757 err = btrfs_block_rsv_migrate(trans->block_rsv,
2758 &root->fs_info->global_block_rsv,
2759 trans->bytes_reserved);
2760
2966 if (err) { 2761 if (err) {
2967 btrfs_end_transaction(trans, root); 2762 btrfs_end_transaction(trans, root);
2968 root->fs_info->enospc_unlink = 0; 2763 root->fs_info->enospc_unlink = 0;
@@ -2977,6 +2772,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2977 struct btrfs_root *root) 2772 struct btrfs_root *root)
2978{ 2773{
2979 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2774 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2775 btrfs_block_rsv_release(root, trans->block_rsv,
2776 trans->bytes_reserved);
2777 trans->block_rsv = &root->fs_info->trans_block_rsv;
2980 BUG_ON(!root->fs_info->enospc_unlink); 2778 BUG_ON(!root->fs_info->enospc_unlink);
2981 root->fs_info->enospc_unlink = 0; 2779 root->fs_info->enospc_unlink = 0;
2982 } 2780 }
@@ -3368,6 +3166,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3368 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3166 pgoff_t index = from >> PAGE_CACHE_SHIFT;
3369 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3167 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3370 struct page *page; 3168 struct page *page;
3169 gfp_t mask = btrfs_alloc_write_mask(mapping);
3371 int ret = 0; 3170 int ret = 0;
3372 u64 page_start; 3171 u64 page_start;
3373 u64 page_end; 3172 u64 page_end;
@@ -3380,7 +3179,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3380 3179
3381 ret = -ENOMEM; 3180 ret = -ENOMEM;
3382again: 3181again:
3383 page = find_or_create_page(mapping, index, GFP_NOFS); 3182 page = find_or_create_page(mapping, index, mask);
3384 if (!page) { 3183 if (!page) {
3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3184 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3386 goto out; 3185 goto out;
@@ -3613,6 +3412,8 @@ void btrfs_evict_inode(struct inode *inode)
3613{ 3412{
3614 struct btrfs_trans_handle *trans; 3413 struct btrfs_trans_handle *trans;
3615 struct btrfs_root *root = BTRFS_I(inode)->root; 3414 struct btrfs_root *root = BTRFS_I(inode)->root;
3415 struct btrfs_block_rsv *rsv, *global_rsv;
3416 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3616 unsigned long nr; 3417 unsigned long nr;
3617 int ret; 3418 int ret;
3618 3419
@@ -3640,22 +3441,55 @@ void btrfs_evict_inode(struct inode *inode)
3640 goto no_delete; 3441 goto no_delete;
3641 } 3442 }
3642 3443
3444 rsv = btrfs_alloc_block_rsv(root);
3445 if (!rsv) {
3446 btrfs_orphan_del(NULL, inode);
3447 goto no_delete;
3448 }
3449 rsv->size = min_size;
3450 global_rsv = &root->fs_info->global_block_rsv;
3451
3643 btrfs_i_size_write(inode, 0); 3452 btrfs_i_size_write(inode, 0);
3644 3453
3454 /*
3455 * This is a bit simpler than btrfs_truncate since
3456 *
3457 * 1) We've already reserved our space for our orphan item in the
3458 * unlink.
3459 * 2) We're going to delete the inode item, so we don't need to update
3460 * it at all.
3461 *
3462 * So we just need to reserve some slack space in case we add bytes when
3463 * doing the truncate.
3464 */
3645 while (1) { 3465 while (1) {
3646 trans = btrfs_join_transaction(root); 3466 ret = btrfs_block_rsv_refill(root, rsv, min_size);
3647 BUG_ON(IS_ERR(trans)); 3467
3648 trans->block_rsv = root->orphan_block_rsv; 3468 /*
3469 * Try and steal from the global reserve since we will
3470 * likely not use this space anyway, we want to try as
3471 * hard as possible to get this to work.
3472 */
3473 if (ret)
3474 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3649 3475
3650 ret = btrfs_block_rsv_check(trans, root,
3651 root->orphan_block_rsv, 0, 5);
3652 if (ret) { 3476 if (ret) {
3653 BUG_ON(ret != -EAGAIN); 3477 printk(KERN_WARNING "Could not get space for a "
3654 ret = btrfs_commit_transaction(trans, root); 3478 "delete, will truncate on mount %d\n", ret);
3655 BUG_ON(ret); 3479 btrfs_orphan_del(NULL, inode);
3656 continue; 3480 btrfs_free_block_rsv(root, rsv);
3481 goto no_delete;
3657 } 3482 }
3658 3483
3484 trans = btrfs_start_transaction(root, 0);
3485 if (IS_ERR(trans)) {
3486 btrfs_orphan_del(NULL, inode);
3487 btrfs_free_block_rsv(root, rsv);
3488 goto no_delete;
3489 }
3490
3491 trans->block_rsv = rsv;
3492
3659 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3493 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3660 if (ret != -EAGAIN) 3494 if (ret != -EAGAIN)
3661 break; 3495 break;
@@ -3664,14 +3498,17 @@ void btrfs_evict_inode(struct inode *inode)
3664 btrfs_end_transaction(trans, root); 3498 btrfs_end_transaction(trans, root);
3665 trans = NULL; 3499 trans = NULL;
3666 btrfs_btree_balance_dirty(root, nr); 3500 btrfs_btree_balance_dirty(root, nr);
3667
3668 } 3501 }
3669 3502
3503 btrfs_free_block_rsv(root, rsv);
3504
3670 if (ret == 0) { 3505 if (ret == 0) {
3506 trans->block_rsv = root->orphan_block_rsv;
3671 ret = btrfs_orphan_del(trans, inode); 3507 ret = btrfs_orphan_del(trans, inode);
3672 BUG_ON(ret); 3508 BUG_ON(ret);
3673 } 3509 }
3674 3510
3511 trans->block_rsv = &root->fs_info->trans_block_rsv;
3675 if (!(root == root->fs_info->tree_root || 3512 if (!(root == root->fs_info->tree_root ||
3676 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3513 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3677 btrfs_return_ino(root, btrfs_ino(inode)); 3514 btrfs_return_ino(root, btrfs_ino(inode));
@@ -5795,8 +5632,7 @@ again:
5795 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5632 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5796 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5633 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5797 if (!ret) 5634 if (!ret)
5798 ret = btrfs_update_inode(trans, root, inode); 5635 err = btrfs_update_inode(trans, root, inode);
5799 err = ret;
5800 goto out; 5636 goto out;
5801 } 5637 }
5802 5638
@@ -6289,7 +6125,7 @@ int btrfs_readpage(struct file *file, struct page *page)
6289{ 6125{
6290 struct extent_io_tree *tree; 6126 struct extent_io_tree *tree;
6291 tree = &BTRFS_I(page->mapping->host)->io_tree; 6127 tree = &BTRFS_I(page->mapping->host)->io_tree;
6292 return extent_read_full_page(tree, page, btrfs_get_extent); 6128 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
6293} 6129}
6294 6130
6295static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6131static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -6541,6 +6377,7 @@ static int btrfs_truncate(struct inode *inode)
6541 struct btrfs_trans_handle *trans; 6377 struct btrfs_trans_handle *trans;
6542 unsigned long nr; 6378 unsigned long nr;
6543 u64 mask = root->sectorsize - 1; 6379 u64 mask = root->sectorsize - 1;
6380 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6544 6381
6545 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6382 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6546 if (ret) 6383 if (ret)
@@ -6588,19 +6425,23 @@ static int btrfs_truncate(struct inode *inode)
6588 rsv = btrfs_alloc_block_rsv(root); 6425 rsv = btrfs_alloc_block_rsv(root);
6589 if (!rsv) 6426 if (!rsv)
6590 return -ENOMEM; 6427 return -ENOMEM;
6591 btrfs_add_durable_block_rsv(root->fs_info, rsv); 6428 rsv->size = min_size;
6592 6429
6430 /*
6431 * 1 for the truncate slack space
6432 * 1 for the orphan item we're going to add
6433 * 1 for the orphan item deletion
6434 * 1 for updating the inode.
6435 */
6593 trans = btrfs_start_transaction(root, 4); 6436 trans = btrfs_start_transaction(root, 4);
6594 if (IS_ERR(trans)) { 6437 if (IS_ERR(trans)) {
6595 err = PTR_ERR(trans); 6438 err = PTR_ERR(trans);
6596 goto out; 6439 goto out;
6597 } 6440 }
6598 6441
6599 /* 6442 /* Migrate the slack space for the truncate to our reserve */
6600 * Reserve space for the truncate process. Truncate should be adding 6443 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6601 * space, but if there are snapshots it may end up using space. 6444 min_size);
6602 */
6603 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6604 BUG_ON(ret); 6445 BUG_ON(ret);
6605 6446
6606 ret = btrfs_orphan_add(trans, inode); 6447 ret = btrfs_orphan_add(trans, inode);
@@ -6609,21 +6450,6 @@ static int btrfs_truncate(struct inode *inode)
6609 goto out; 6450 goto out;
6610 } 6451 }
6611 6452
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /*
6617 * Ok so we've already migrated our bytes over for the truncate, so here
6618 * just reserve the one slot we need for updating the inode.
6619 */
6620 trans = btrfs_start_transaction(root, 1);
6621 if (IS_ERR(trans)) {
6622 err = PTR_ERR(trans);
6623 goto out;
6624 }
6625 trans->block_rsv = rsv;
6626
6627 /* 6453 /*
6628 * setattr is responsible for setting the ordered_data_close flag, 6454 * setattr is responsible for setting the ordered_data_close flag,
6629 * but that is only tested during the last file release. That 6455 * but that is only tested during the last file release. That
@@ -6645,20 +6471,30 @@ static int btrfs_truncate(struct inode *inode)
6645 btrfs_add_ordered_operation(trans, root, inode); 6471 btrfs_add_ordered_operation(trans, root, inode);
6646 6472
6647 while (1) { 6473 while (1) {
6474 ret = btrfs_block_rsv_refill(root, rsv, min_size);
6475 if (ret) {
6476 /*
6477 * This can only happen with the original transaction we
6478 * started above, every other time we shouldn't have a
6479 * transaction started yet.
6480 */
6481 if (ret == -EAGAIN)
6482 goto end_trans;
6483 err = ret;
6484 break;
6485 }
6486
6648 if (!trans) { 6487 if (!trans) {
6649 trans = btrfs_start_transaction(root, 3); 6488 /* Just need the 1 for updating the inode */
6489 trans = btrfs_start_transaction(root, 1);
6650 if (IS_ERR(trans)) { 6490 if (IS_ERR(trans)) {
6651 err = PTR_ERR(trans); 6491 err = PTR_ERR(trans);
6652 goto out; 6492 goto out;
6653 } 6493 }
6654
6655 ret = btrfs_truncate_reserve_metadata(trans, root,
6656 rsv);
6657 BUG_ON(ret);
6658
6659 trans->block_rsv = rsv;
6660 } 6494 }
6661 6495
6496 trans->block_rsv = rsv;
6497
6662 ret = btrfs_truncate_inode_items(trans, root, inode, 6498 ret = btrfs_truncate_inode_items(trans, root, inode,
6663 inode->i_size, 6499 inode->i_size,
6664 BTRFS_EXTENT_DATA_KEY); 6500 BTRFS_EXTENT_DATA_KEY);
@@ -6673,7 +6509,7 @@ static int btrfs_truncate(struct inode *inode)
6673 err = ret; 6509 err = ret;
6674 break; 6510 break;
6675 } 6511 }
6676 6512end_trans:
6677 nr = trans->blocks_used; 6513 nr = trans->blocks_used;
6678 btrfs_end_transaction(trans, root); 6514 btrfs_end_transaction(trans, root);
6679 trans = NULL; 6515 trans = NULL;
@@ -6755,9 +6591,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6755 ei->last_sub_trans = 0; 6591 ei->last_sub_trans = 0;
6756 ei->logged_trans = 0; 6592 ei->logged_trans = 0;
6757 ei->delalloc_bytes = 0; 6593 ei->delalloc_bytes = 0;
6758 ei->reserved_bytes = 0;
6759 ei->disk_i_size = 0; 6594 ei->disk_i_size = 0;
6760 ei->flags = 0; 6595 ei->flags = 0;
6596 ei->csum_bytes = 0;
6761 ei->index_cnt = (u64)-1; 6597 ei->index_cnt = (u64)-1;
6762 ei->last_unlink_trans = 0; 6598 ei->last_unlink_trans = 0;
6763 6599
@@ -6803,6 +6639,8 @@ void btrfs_destroy_inode(struct inode *inode)
6803 WARN_ON(inode->i_data.nrpages); 6639 WARN_ON(inode->i_data.nrpages);
6804 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6640 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6805 WARN_ON(BTRFS_I(inode)->reserved_extents); 6641 WARN_ON(BTRFS_I(inode)->reserved_extents);
6642 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6643 WARN_ON(BTRFS_I(inode)->csum_bytes);
6806 6644
6807 /* 6645 /*
6808 * This can happen where we create an inode, but somebody else also 6646 * This can happen where we create an inode, but somebody else also
@@ -7420,7 +7258,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
7420 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7258 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
7421 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7259 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
7422 .writepage_start_hook = btrfs_writepage_start_hook, 7260 .writepage_start_hook = btrfs_writepage_start_hook,
7423 .readpage_io_failed_hook = btrfs_io_failed_hook,
7424 .set_bit_hook = btrfs_set_bit_hook, 7261 .set_bit_hook = btrfs_set_bit_hook,
7425 .clear_bit_hook = btrfs_clear_bit_hook, 7262 .clear_bit_hook = btrfs_clear_bit_hook,
7426 .merge_extent_hook = btrfs_merge_extent_hook, 7263 .merge_extent_hook = btrfs_merge_extent_hook,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dae5dfe41ba5..4a34c472f126 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -51,6 +51,7 @@
51#include "volumes.h" 51#include "volumes.h"
52#include "locking.h" 52#include "locking.h"
53#include "inode-map.h" 53#include "inode-map.h"
54#include "backref.h"
54 55
55/* Mask out flags that are inappropriate for the given type of inode. */ 56/* Mask out flags that are inappropriate for the given type of inode. */
56static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 57static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
117/* 118/*
118 * Inherit flags from the parent inode. 119 * Inherit flags from the parent inode.
119 * 120 *
120 * Unlike extN we don't have any flags we don't want to inherit currently. 121 * Currently only the compression flags and the cow flags are inherited.
121 */ 122 */
122void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
123{ 124{
@@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
128 129
129 flags = BTRFS_I(dir)->flags; 130 flags = BTRFS_I(dir)->flags;
130 131
131 if (S_ISREG(inode->i_mode)) 132 if (flags & BTRFS_INODE_NOCOMPRESS) {
132 flags &= ~BTRFS_INODE_DIRSYNC; 133 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
133 else if (!S_ISDIR(inode->i_mode)) 134 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
134 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); 135 } else if (flags & BTRFS_INODE_COMPRESS) {
136 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
137 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
138 }
139
140 if (flags & BTRFS_INODE_NODATACOW)
141 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
135 142
136 BTRFS_I(inode)->flags = flags;
137 btrfs_update_iflags(inode); 143 btrfs_update_iflags(inode);
138} 144}
139 145
@@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
277 struct fstrim_range range; 283 struct fstrim_range range;
278 u64 minlen = ULLONG_MAX; 284 u64 minlen = ULLONG_MAX;
279 u64 num_devices = 0; 285 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
280 int ret; 287 int ret;
281 288
282 if (!capable(CAP_SYS_ADMIN)) 289 if (!capable(CAP_SYS_ADMIN))
@@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
295 } 302 }
296 } 303 }
297 rcu_read_unlock(); 304 rcu_read_unlock();
305
298 if (!num_devices) 306 if (!num_devices)
299 return -EOPNOTSUPP; 307 return -EOPNOTSUPP;
300
301 if (copy_from_user(&range, arg, sizeof(range))) 308 if (copy_from_user(&range, arg, sizeof(range)))
302 return -EFAULT; 309 return -EFAULT;
310 if (range.start > total_bytes)
311 return -EINVAL;
303 312
313 range.len = min(range.len, total_bytes - range.start);
304 range.minlen = max(range.minlen, minlen); 314 range.minlen = max(range.minlen, minlen);
305 ret = btrfs_trim_fs(root, &range); 315 ret = btrfs_trim_fs(root, &range);
306 if (ret < 0) 316 if (ret < 0)
@@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
760 int ret = 1; 770 int ret = 1;
761 771
762 /* 772 /*
763 * make sure that once we start defragging and extent, we keep on 773 * make sure that once we start defragging an extent, we keep on
764 * defragging it 774 * defragging it
765 */ 775 */
766 if (start < *defrag_end) 776 if (start < *defrag_end)
@@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
805 * extent will force at least part of that big extent to be defragged. 815 * extent will force at least part of that big extent to be defragged.
806 */ 816 */
807 if (ret) { 817 if (ret) {
808 *last_len += len;
809 *defrag_end = extent_map_end(em); 818 *defrag_end = extent_map_end(em);
810 } else { 819 } else {
811 *last_len = 0; 820 *last_len = 0;
@@ -843,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
843 int i_done; 852 int i_done;
844 struct btrfs_ordered_extent *ordered; 853 struct btrfs_ordered_extent *ordered;
845 struct extent_state *cached_state = NULL; 854 struct extent_state *cached_state = NULL;
855 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
846 856
847 if (isize == 0) 857 if (isize == 0)
848 return 0; 858 return 0;
@@ -860,7 +870,7 @@ again:
860 for (i = 0; i < num_pages; i++) { 870 for (i = 0; i < num_pages; i++) {
861 struct page *page; 871 struct page *page;
862 page = find_or_create_page(inode->i_mapping, 872 page = find_or_create_page(inode->i_mapping,
863 start_index + i, GFP_NOFS); 873 start_index + i, mask);
864 if (!page) 874 if (!page)
865 break; 875 break;
866 876
@@ -972,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
972 struct btrfs_super_block *disk_super; 982 struct btrfs_super_block *disk_super;
973 struct file_ra_state *ra = NULL; 983 struct file_ra_state *ra = NULL;
974 unsigned long last_index; 984 unsigned long last_index;
985 u64 isize = i_size_read(inode);
975 u64 features; 986 u64 features;
976 u64 last_len = 0; 987 u64 last_len = 0;
977 u64 skip = 0; 988 u64 skip = 0;
978 u64 defrag_end = 0; 989 u64 defrag_end = 0;
979 u64 newer_off = range->start; 990 u64 newer_off = range->start;
980 int newer_left = 0;
981 unsigned long i; 991 unsigned long i;
992 unsigned long ra_index = 0;
982 int ret; 993 int ret;
983 int defrag_count = 0; 994 int defrag_count = 0;
984 int compress_type = BTRFS_COMPRESS_ZLIB; 995 int compress_type = BTRFS_COMPRESS_ZLIB;
985 int extent_thresh = range->extent_thresh; 996 int extent_thresh = range->extent_thresh;
986 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 997 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
998 int cluster = max_cluster;
987 u64 new_align = ~((u64)128 * 1024 - 1); 999 u64 new_align = ~((u64)128 * 1024 - 1);
988 struct page **pages = NULL; 1000 struct page **pages = NULL;
989 1001
@@ -997,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
997 compress_type = range->compress_type; 1009 compress_type = range->compress_type;
998 } 1010 }
999 1011
1000 if (inode->i_size == 0) 1012 if (isize == 0)
1001 return 0; 1013 return 0;
1002 1014
1003 /* 1015 /*
@@ -1013,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1013 ra = &file->f_ra; 1025 ra = &file->f_ra;
1014 } 1026 }
1015 1027
1016 pages = kmalloc(sizeof(struct page *) * newer_cluster, 1028 pages = kmalloc(sizeof(struct page *) * max_cluster,
1017 GFP_NOFS); 1029 GFP_NOFS);
1018 if (!pages) { 1030 if (!pages) {
1019 ret = -ENOMEM; 1031 ret = -ENOMEM;
@@ -1022,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1022 1034
1023 /* find the last page to defrag */ 1035 /* find the last page to defrag */
1024 if (range->start + range->len > range->start) { 1036 if (range->start + range->len > range->start) {
1025 last_index = min_t(u64, inode->i_size - 1, 1037 last_index = min_t(u64, isize - 1,
1026 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1038 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
1027 } else { 1039 } else {
1028 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1040 last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1029 } 1041 }
1030 1042
1031 if (newer_than) { 1043 if (newer_than) {
@@ -1038,14 +1050,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1038 * the extents in the file evenly spaced 1050 * the extents in the file evenly spaced
1039 */ 1051 */
1040 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1052 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1041 newer_left = newer_cluster;
1042 } else 1053 } else
1043 goto out_ra; 1054 goto out_ra;
1044 } else { 1055 } else {
1045 i = range->start >> PAGE_CACHE_SHIFT; 1056 i = range->start >> PAGE_CACHE_SHIFT;
1046 } 1057 }
1047 if (!max_to_defrag) 1058 if (!max_to_defrag)
1048 max_to_defrag = last_index - 1; 1059 max_to_defrag = last_index;
1049 1060
1050 /* 1061 /*
1051 * make writeback starts from i, so the defrag range can be 1062 * make writeback starts from i, so the defrag range can be
@@ -1079,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1079 i = max(i + 1, next); 1090 i = max(i + 1, next);
1080 continue; 1091 continue;
1081 } 1092 }
1093
1094 if (!newer_than) {
1095 cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
1096 PAGE_CACHE_SHIFT) - i;
1097 cluster = min(cluster, max_cluster);
1098 } else {
1099 cluster = max_cluster;
1100 }
1101
1082 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1102 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
1083 BTRFS_I(inode)->force_compress = compress_type; 1103 BTRFS_I(inode)->force_compress = compress_type;
1084 1104
1085 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); 1105 if (i + cluster > ra_index) {
1106 ra_index = max(i, ra_index);
1107 btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
1108 cluster);
1109 ra_index += max_cluster;
1110 }
1086 1111
1087 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); 1112 ret = cluster_pages_for_defrag(inode, pages, i, cluster);
1088 if (ret < 0) 1113 if (ret < 0)
1089 goto out_ra; 1114 goto out_ra;
1090 1115
1091 defrag_count += ret; 1116 defrag_count += ret;
1092 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1117 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
1093 i += ret;
1094 1118
1095 if (newer_than) { 1119 if (newer_than) {
1096 if (newer_off == (u64)-1) 1120 if (newer_off == (u64)-1)
@@ -1105,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1105 if (!ret) { 1129 if (!ret) {
1106 range->start = newer_off; 1130 range->start = newer_off;
1107 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1131 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1108 newer_left = newer_cluster;
1109 } else { 1132 } else {
1110 break; 1133 break;
1111 } 1134 }
1112 } else { 1135 } else {
1113 i++; 1136 if (ret > 0) {
1137 i += ret;
1138 last_len += ret << PAGE_CACHE_SHIFT;
1139 } else {
1140 i++;
1141 last_len = 0;
1142 }
1114 } 1143 }
1115 } 1144 }
1116 1145
@@ -1136,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1136 mutex_unlock(&inode->i_mutex); 1165 mutex_unlock(&inode->i_mutex);
1137 } 1166 }
1138 1167
1139 disk_super = &root->fs_info->super_copy; 1168 disk_super = root->fs_info->super_copy;
1140 features = btrfs_super_incompat_flags(disk_super); 1169 features = btrfs_super_incompat_flags(disk_super);
1141 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1170 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1142 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1171 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1143 btrfs_set_super_incompat_flags(disk_super, features); 1172 btrfs_set_super_incompat_flags(disk_super, features);
1144 } 1173 }
1145 1174
1146 if (!file) 1175 ret = defrag_count;
1147 kfree(ra);
1148 return defrag_count;
1149 1176
1150out_ra: 1177out_ra:
1151 if (!file) 1178 if (!file)
@@ -2587,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2587 return PTR_ERR(trans); 2614 return PTR_ERR(trans);
2588 } 2615 }
2589 2616
2590 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2617 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
2591 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 2618 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2592 dir_id, "default", 7, 1); 2619 dir_id, "default", 7, 1);
2593 if (IS_ERR_OR_NULL(di)) { 2620 if (IS_ERR_OR_NULL(di)) {
@@ -2603,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2603 btrfs_mark_buffer_dirty(path->nodes[0]); 2630 btrfs_mark_buffer_dirty(path->nodes[0]);
2604 btrfs_free_path(path); 2631 btrfs_free_path(path);
2605 2632
2606 disk_super = &root->fs_info->super_copy; 2633 disk_super = root->fs_info->super_copy;
2607 features = btrfs_super_incompat_flags(disk_super); 2634 features = btrfs_super_incompat_flags(disk_super);
2608 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { 2635 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2609 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; 2636 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2864,6 +2891,144 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
2864 return ret; 2891 return ret;
2865} 2892}
2866 2893
2894static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2895{
2896 int ret = 0;
2897 int i;
2898 u64 rel_ptr;
2899 int size;
2900 struct btrfs_ioctl_ino_path_args *ipa = NULL;
2901 struct inode_fs_paths *ipath = NULL;
2902 struct btrfs_path *path;
2903
2904 if (!capable(CAP_SYS_ADMIN))
2905 return -EPERM;
2906
2907 path = btrfs_alloc_path();
2908 if (!path) {
2909 ret = -ENOMEM;
2910 goto out;
2911 }
2912
2913 ipa = memdup_user(arg, sizeof(*ipa));
2914 if (IS_ERR(ipa)) {
2915 ret = PTR_ERR(ipa);
2916 ipa = NULL;
2917 goto out;
2918 }
2919
2920 size = min_t(u32, ipa->size, 4096);
2921 ipath = init_ipath(size, root, path);
2922 if (IS_ERR(ipath)) {
2923 ret = PTR_ERR(ipath);
2924 ipath = NULL;
2925 goto out;
2926 }
2927
2928 ret = paths_from_inode(ipa->inum, ipath);
2929 if (ret < 0)
2930 goto out;
2931
2932 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2933 rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val;
2934 ipath->fspath->val[i] = rel_ptr;
2935 }
2936
2937 ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size);
2938 if (ret) {
2939 ret = -EFAULT;
2940 goto out;
2941 }
2942
2943out:
2944 btrfs_free_path(path);
2945 free_ipath(ipath);
2946 kfree(ipa);
2947
2948 return ret;
2949}
2950
2951static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
2952{
2953 struct btrfs_data_container *inodes = ctx;
2954 const size_t c = 3 * sizeof(u64);
2955
2956 if (inodes->bytes_left >= c) {
2957 inodes->bytes_left -= c;
2958 inodes->val[inodes->elem_cnt] = inum;
2959 inodes->val[inodes->elem_cnt + 1] = offset;
2960 inodes->val[inodes->elem_cnt + 2] = root;
2961 inodes->elem_cnt += 3;
2962 } else {
2963 inodes->bytes_missing += c - inodes->bytes_left;
2964 inodes->bytes_left = 0;
2965 inodes->elem_missed += 3;
2966 }
2967
2968 return 0;
2969}
2970
2971static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2972 void __user *arg)
2973{
2974 int ret = 0;
2975 int size;
2976 u64 extent_offset;
2977 struct btrfs_ioctl_logical_ino_args *loi;
2978 struct btrfs_data_container *inodes = NULL;
2979 struct btrfs_path *path = NULL;
2980 struct btrfs_key key;
2981
2982 if (!capable(CAP_SYS_ADMIN))
2983 return -EPERM;
2984
2985 loi = memdup_user(arg, sizeof(*loi));
2986 if (IS_ERR(loi)) {
2987 ret = PTR_ERR(loi);
2988 loi = NULL;
2989 goto out;
2990 }
2991
2992 path = btrfs_alloc_path();
2993 if (!path) {
2994 ret = -ENOMEM;
2995 goto out;
2996 }
2997
2998 size = min_t(u32, loi->size, 4096);
2999 inodes = init_data_container(size);
3000 if (IS_ERR(inodes)) {
3001 ret = PTR_ERR(inodes);
3002 inodes = NULL;
3003 goto out;
3004 }
3005
3006 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3007
3008 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3009 ret = -ENOENT;
3010 if (ret < 0)
3011 goto out;
3012
3013 extent_offset = loi->logical - key.objectid;
3014 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3015 extent_offset, build_ino_list, inodes);
3016
3017 if (ret < 0)
3018 goto out;
3019
3020 ret = copy_to_user((void *)loi->inodes, (void *)inodes, size);
3021 if (ret)
3022 ret = -EFAULT;
3023
3024out:
3025 btrfs_free_path(path);
3026 kfree(inodes);
3027 kfree(loi);
3028
3029 return ret;
3030}
3031
2867long btrfs_ioctl(struct file *file, unsigned int 3032long btrfs_ioctl(struct file *file, unsigned int
2868 cmd, unsigned long arg) 3033 cmd, unsigned long arg)
2869{ 3034{
@@ -2921,6 +3086,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2921 return btrfs_ioctl_tree_search(file, argp); 3086 return btrfs_ioctl_tree_search(file, argp);
2922 case BTRFS_IOC_INO_LOOKUP: 3087 case BTRFS_IOC_INO_LOOKUP:
2923 return btrfs_ioctl_ino_lookup(file, argp); 3088 return btrfs_ioctl_ino_lookup(file, argp);
3089 case BTRFS_IOC_INO_PATHS:
3090 return btrfs_ioctl_ino_to_path(root, argp);
3091 case BTRFS_IOC_LOGICAL_INO:
3092 return btrfs_ioctl_logical_to_ino(root, argp);
2924 case BTRFS_IOC_SPACE_INFO: 3093 case BTRFS_IOC_SPACE_INFO:
2925 return btrfs_ioctl_space_info(root, argp); 3094 return btrfs_ioctl_space_info(root, argp);
2926 case BTRFS_IOC_SYNC: 3095 case BTRFS_IOC_SYNC:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea789fcb4..252ae9915de8 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args {
193 struct btrfs_ioctl_space_info spaces[0]; 193 struct btrfs_ioctl_space_info spaces[0];
194}; 194};
195 195
/*
 * Variable-length result buffer used by the INO_PATHS and LOGICAL_INO ioctls.
 * The four counters are written by the kernel; val[] carries the payload.
 * For LOGICAL_INO the payload is a sequence of (inum, offset, root) triples
 * and elem_cnt / elem_missed advance by three per record.  For INO_PATHS each
 * val[i] is converted to an offset relative to the start of val[] before the
 * buffer is copied to user space.
 */
196struct btrfs_data_container {
197 __u32 bytes_left; /* out -- bytes not needed to deliver output */
198 __u32 bytes_missing; /* out -- additional bytes needed for result */
199 __u32 elem_cnt; /* out */
200 __u32 elem_missed; /* out */
201 __u64 val[0]; /* out */
202};
203
/*
 * Argument block for BTRFS_IOC_INO_PATHS.
 *
 * inum is the inode to resolve, size is the byte size of the user buffer
 * (the handler caps it at 4096), and fspath carries the user-space address
 * of that buffer as a 64-bit integer; the kernel copies a
 * struct btrfs_data_container to it.
 *
 * NOTE(review): the __u32 'size' is directly followed by a __u64, so any
 * padding after it depends on the ABI's __u64 alignment -- confirm that
 * 32-bit and 64-bit user space agree on this layout.
 */
204struct btrfs_ioctl_ino_path_args {
205 __u64 inum; /* in */
206 __u32 size; /* in */
207 __u64 reserved[4];
208 /* struct btrfs_data_container *fspath; out */
209 __u64 fspath; /* out */
210};
211
/*
 * Argument block for BTRFS_IOC_LOGICAL_INO.
 *
 * logical is the logical address to look up, size is the byte size of the
 * user buffer (the handler caps it at 4096), and inodes carries the
 * user-space address of that buffer as a 64-bit integer; the kernel copies a
 * struct btrfs_data_container to it.
 *
 * NOTE(review): the __u32 'size' is directly followed by a __u64, so any
 * padding after it depends on the ABI's __u64 alignment -- confirm that
 * 32-bit and 64-bit user space agree on this layout.
 */
212struct btrfs_ioctl_logical_ino_args {
213 __u64 logical; /* in */
214 __u32 size; /* in */
215 __u64 reserved[4];
216 /* struct btrfs_data_container *inodes; out */
217 __u64 inodes;
218};
219
196#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 220#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
197 struct btrfs_ioctl_vol_args) 221 struct btrfs_ioctl_vol_args)
198#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 222#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args {
248 struct btrfs_ioctl_dev_info_args) 272 struct btrfs_ioctl_dev_info_args)
249#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
250 struct btrfs_ioctl_fs_info_args) 274 struct btrfs_ioctl_fs_info_args)
/*
 * Inspection ioctls.  Each command is declared with the argument structure
 * its handler actually copies in: INO_PATHS takes a
 * struct btrfs_ioctl_ino_path_args, LOGICAL_INO a
 * struct btrfs_ioctl_logical_ino_args.  Both structures have identical
 * members, so naming the right one for LOGICAL_INO leaves the encoded ioctl
 * number unchanged.
 */
#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
					struct btrfs_ioctl_ino_path_args)
#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
					struct btrfs_ioctl_logical_ino_args)
279
251#endif 280#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e9..f38e452486b8 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
159{ 159{
160 int i; 160 int i;
161 u32 type; 161 u32 type, nr;
162 u32 nr = btrfs_header_nritems(l);
163 struct btrfs_item *item; 162 struct btrfs_item *item;
164 struct btrfs_root_item *ri; 163 struct btrfs_root_item *ri;
165 struct btrfs_dir_item *di; 164 struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
172 struct btrfs_key key; 171 struct btrfs_key key;
173 struct btrfs_key found_key; 172 struct btrfs_key found_key;
174 173
174 if (!l)
175 return;
176
177 nr = btrfs_header_nritems(l);
178
175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 179 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
176 (unsigned long long)btrfs_header_bytenr(l), nr, 180 (unsigned long long)btrfs_header_bytenr(l), nr,
177 btrfs_leaf_free_space(root, l)); 181 btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000000000000..2373b39a132b
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,951 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include "ctree.h"
27#include "volumes.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#undef DEBUG
32
33/*
34 * This is the implementation for the generic read ahead framework.
35 *
36 * To trigger a readahead, btrfs_reada_add must be called. It will start
37 * a read ahead for the given range [start, end) on tree root. The returned
38 * handle can either be used to wait on the readahead to finish
39 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
40 *
41 * The read ahead works as follows:
42 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
43 * reada_start_machine will then search for extents to prefetch and trigger
44 * some reads. When a read finishes for a node, all contained node/leaf
45 * pointers that lie in the given range will also be enqueued. The reads will
46 * be triggered in sequential order, thus giving a big win over a naive
47 * enumeration. It will also make use of multi-device layouts. Each disk
48 * will have its own read pointer and all disks will be utilized in parallel.
49 * Also, no two disks will read both sides of a mirror simultaneously, as this
50 * would waste seeking capacity. Instead both disks will read different parts
51 * of the filesystem.
52 * Any number of readaheads can be started in parallel. The read order will be
53 * determined globally, i.e. 2 parallel readaheads will normally finish faster
54 * than the 2 started one after another.
55 */
56
57#define MAX_MIRRORS 2
58#define MAX_IN_FLIGHT 6
59
60struct reada_extctl {
61 struct list_head list;
62 struct reada_control *rc;
63 u64 generation;
64};
65
66struct reada_extent {
67 u64 logical;
68 struct btrfs_key top;
69 u32 blocksize;
70 int err;
71 struct list_head extctl;
72 struct kref refcnt;
73 spinlock_t lock;
74 struct reada_zone *zones[MAX_MIRRORS];
75 int nzones;
76 struct btrfs_device *scheduled_for;
77};
78
79struct reada_zone {
80 u64 start;
81 u64 end;
82 u64 elems;
83 struct list_head list;
84 spinlock_t lock;
85 int locked;
86 struct btrfs_device *device;
87 struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
88 int ndevs;
89 struct kref refcnt;
90};
91
92struct reada_machine_work {
93 struct btrfs_work work;
94 struct btrfs_fs_info *fs_info;
95};
96
97static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
98static void reada_control_release(struct kref *kref);
99static void reada_zone_release(struct kref *kref);
100static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102
103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation);
105
106/* recurses */
107/* in case of err, eb might be NULL */
/*
 * Completion hook for the tree block read at logical address 'start'.
 *
 * Looks up the reada_extent registered for that block and detaches its list
 * of waiting request records.  If the read succeeded and the block is an
 * interior node (level != 0), every child pointer whose key range overlaps
 * a waiting request is queued with reada_add_block().  Afterwards each
 * detached record is freed, the owning request's element count is dropped
 * (waking waiters when it reaches zero) and the extent references are put.
 *
 * Returns -1 when no extent is registered for 'start', 0 otherwise.
 */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
109 u64 start, int err)
110{
111 int level = 0;
112 int nritems;
113 int i;
114 u64 bytenr;
115 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121
/* eb may be NULL in the error case; level then stays 0 */
122 if (eb)
123 level = btrfs_header_level(eb);
124
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 kref_get(&re->refcnt);
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock);
136 /*
137 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore
139 */
140 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock);
144
145 if (err == 0) {
146 nritems = level ? btrfs_header_nritems(eb) : 0;
147 generation = btrfs_header_generation(eb);
148 /*
149 * FIXME: currently we just set nritems to 0 if this is a leaf,
150 * effectively ignoring the content. In a next step we could
151 * trigger more readahead depending from the content, e.g.
152 * fetch the checksums for the extents in the leaf.
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
158 * just cleanup our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164
165 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec;
167 u64 n_gen;
168 struct btrfs_key key;
169 struct btrfs_key next_key;
170
/*
 * [key, next_key) is the key range below slot i; the last slot is
 * bounded by the extent's own upper key (re->top)
 */
171 btrfs_node_key_to_cpu(eb, &key, i);
172 if (i + 1 < nritems)
173 btrfs_node_key_to_cpu(eb, &next_key, i + 1);
174 else
175 next_key = re->top;
176 bytenr = btrfs_node_blockptr(eb, i);
177 n_gen = btrfs_node_ptr_generation(eb, i);
178
179 list_for_each_entry(rec, &list, list) {
180 struct reada_control *rc = rec->rc;
181
182 /*
183 * if the generation doesn't match, just ignore this
184 * extctl. This will probably cut off a branch from
185 * prefetch. Alternatively one could start a new (sub-)
186 * prefetch for this branch, starting again from root.
187 * FIXME: move the generation check out of this loop
188 */
189#ifdef DEBUG
190 if (rec->generation != generation) {
191 printk(KERN_DEBUG "generation mismatch for "
192 "(%llu,%d,%llu) %llu != %llu\n",
193 key.objectid, key.type, key.offset,
194 rec->generation, generation);
195 }
196#endif
/* queue the child only if its key range overlaps the request's range */
197 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key,
201 level - 1, n_gen);
202 }
203 }
204 /*
205 * free extctl records
206 */
207 while (!list_empty(&list)) {
208 struct reada_control *rc;
209 struct reada_extctl *rec;
210
211 rec = list_first_entry(&list, struct reada_extctl, list);
212 list_del(&rec->list);
213 rc = rec->rc;
214 kfree(rec);
215
/*
 * temporary reference so rc stays valid for the wake_up() even if the
 * kref_put() inside the branch drops the "has elements" reference
 */
216 kref_get(&rc->refcnt);
217 if (atomic_dec_and_test(&rc->elems)) {
218 kref_put(&rc->refcnt, reada_control_release);
219 wake_up(&rc->wait);
220 }
221 kref_put(&rc->refcnt, reada_control_release);
222
223 reada_extent_put(fs_info, re); /* one ref for each entry */
224 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228
229 return 0;
230}
231
232/*
233 * start is passed separately in case eb in NULL, which may be the case with
234 * failed I/O
235 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
237 u64 start, int err)
238{
239 int ret;
240
241 ret = __readahead_hook(root, eb, start, err);
242
243 reada_start_machine(root->fs_info);
244
245 return ret;
246}
247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_bio *bbio)
251{
252 int ret;
253 int looped = 0;
254 struct reada_zone *zone;
255 struct btrfs_block_group_cache *cache = NULL;
256 u64 start;
257 u64 end;
258 int i;
259
260again:
261 zone = NULL;
262 spin_lock(&fs_info->reada_lock);
263 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
264 logical >> PAGE_CACHE_SHIFT, 1);
265 if (ret == 1)
266 kref_get(&zone->refcnt);
267 spin_unlock(&fs_info->reada_lock);
268
269 if (ret == 1) {
270 if (logical >= zone->start && logical < zone->end)
271 return zone;
272 spin_lock(&fs_info->reada_lock);
273 kref_put(&zone->refcnt, reada_zone_release);
274 spin_unlock(&fs_info->reada_lock);
275 }
276
277 if (looped)
278 return NULL;
279
280 cache = btrfs_lookup_block_group(fs_info, logical);
281 if (!cache)
282 return NULL;
283
284 start = cache->key.objectid;
285 end = start + cache->key.offset - 1;
286 btrfs_put_block_group(cache);
287
288 zone = kzalloc(sizeof(*zone), GFP_NOFS);
289 if (!zone)
290 return NULL;
291
292 zone->start = start;
293 zone->end = end;
294 INIT_LIST_HEAD(&zone->list);
295 spin_lock_init(&zone->lock);
296 zone->locked = 0;
297 kref_init(&zone->refcnt);
298 zone->elems = 0;
299 zone->device = dev; /* our device always sits at index 0 */
300 for (i = 0; i < bbio->num_stripes; ++i) {
301 /* bounds have already been checked */
302 zone->devs[i] = bbio->stripes[i].dev;
303 }
304 zone->ndevs = bbio->num_stripes;
305
306 spin_lock(&fs_info->reada_lock);
307 ret = radix_tree_insert(&dev->reada_zones,
308 (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
309 zone);
310 spin_unlock(&fs_info->reada_lock);
311
312 if (ret) {
313 kfree(zone);
314 looped = 1;
315 goto again;
316 }
317
318 return zone;
319}
320
321static struct reada_extent *reada_find_extent(struct btrfs_root *root,
322 u64 logical,
323 struct btrfs_key *top, int level)
324{
325 int ret;
326 int looped = 0;
327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_bio *bbio = NULL;
331 struct btrfs_device *dev;
332 u32 blocksize;
333 u64 length;
334 int nzones = 0;
335 int i;
336 unsigned long index = logical >> PAGE_CACHE_SHIFT;
337
338again:
339 spin_lock(&fs_info->reada_lock);
340 re = radix_tree_lookup(&fs_info->reada_tree, index);
341 if (re)
342 kref_get(&re->refcnt);
343 spin_unlock(&fs_info->reada_lock);
344
345 if (re || looped)
346 return re;
347
348 re = kzalloc(sizeof(*re), GFP_NOFS);
349 if (!re)
350 return NULL;
351
352 blocksize = btrfs_level_size(root, level);
353 re->logical = logical;
354 re->blocksize = blocksize;
355 re->top = *top;
356 INIT_LIST_HEAD(&re->extctl);
357 spin_lock_init(&re->lock);
358 kref_init(&re->refcnt);
359
360 /*
361 * map block
362 */
363 length = blocksize;
364 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
365 if (ret || !bbio || length < blocksize)
366 goto error;
367
368 if (bbio->num_stripes > MAX_MIRRORS) {
369 printk(KERN_ERR "btrfs readahead: more than %d copies not "
370 "supported", MAX_MIRRORS);
371 goto error;
372 }
373
374 for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
375 struct reada_zone *zone;
376
377 dev = bbio->stripes[nzones].dev;
378 zone = reada_find_zone(fs_info, dev, logical, bbio);
379 if (!zone)
380 break;
381
382 re->zones[nzones] = zone;
383 spin_lock(&zone->lock);
384 if (!zone->elems)
385 kref_get(&zone->refcnt);
386 ++zone->elems;
387 spin_unlock(&zone->lock);
388 spin_lock(&fs_info->reada_lock);
389 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock);
391 }
392 re->nzones = nzones;
393 if (nzones == 0) {
394 /* not a single zone found, error and out */
395 goto error;
396 }
397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 spin_lock(&fs_info->reada_lock);
400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
401 if (ret) {
402 spin_unlock(&fs_info->reada_lock);
403 if (ret != -ENOMEM) {
404 /* someone inserted the extent in the meantime */
405 looped = 1;
406 }
407 goto error;
408 }
409 for (i = 0; i < nzones; ++i) {
410 dev = bbio->stripes[i].dev;
411 ret = radix_tree_insert(&dev->reada_extents, index, re);
412 if (ret) {
413 while (--i >= 0) {
414 dev = bbio->stripes[i].dev;
415 BUG_ON(dev == NULL);
416 radix_tree_delete(&dev->reada_extents, index);
417 }
418 BUG_ON(fs_info == NULL);
419 radix_tree_delete(&fs_info->reada_tree, index);
420 spin_unlock(&fs_info->reada_lock);
421 goto error;
422 }
423 }
424 spin_unlock(&fs_info->reada_lock);
425
426 kfree(bbio);
427 return re;
428
429error:
430 while (nzones) {
431 struct reada_zone *zone;
432
433 --nzones;
434 zone = re->zones[nzones];
435 kref_get(&zone->refcnt);
436 spin_lock(&zone->lock);
437 --zone->elems;
438 if (zone->elems == 0) {
439 /*
440 * no fs_info->reada_lock needed, as this can't be
441 * the last ref
442 */
443 kref_put(&zone->refcnt, reada_zone_release);
444 }
445 spin_unlock(&zone->lock);
446
447 spin_lock(&fs_info->reada_lock);
448 kref_put(&zone->refcnt, reada_zone_release);
449 spin_unlock(&fs_info->reada_lock);
450 }
451 kfree(bbio);
452 kfree(re);
453 if (looped)
454 goto again;
455 return NULL;
456}
457
/*
 * Deliberately empty release callback.  reada_extent_put() checks the
 * return value of kref_put() and does the teardown itself.
 */
static void reada_kref_dummy(struct kref *kr)
{
}
461
/*
 * Drop one reference on a readahead extent.
 *
 * When the last reference goes away the extent is removed from
 * fs_info->reada_tree and from the reada_extents tree of each zone's
 * device, detached from its zones (decrementing their element counts) and
 * freed.  If the extent is still marked as scheduled for a device, that
 * device's reada_in_flight counter is decremented as well.
 */
462static void reada_extent_put(struct btrfs_fs_info *fs_info,
463 struct reada_extent *re)
464{
465 int i;
466 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
467
/*
 * the final put and the removal from the trees both happen under
 * reada_lock, the lock the lookups in this file take before kref_get()
 */
468 spin_lock(&fs_info->reada_lock);
469 if (!kref_put(&re->refcnt, reada_kref_dummy)) {
470 spin_unlock(&fs_info->reada_lock);
471 return;
472 }
473
474 radix_tree_delete(&fs_info->reada_tree, index);
475 for (i = 0; i < re->nzones; ++i) {
476 struct reada_zone *zone = re->zones[i];
477
478 radix_tree_delete(&zone->device->reada_extents, index);
479 }
480
481 spin_unlock(&fs_info->reada_lock);
482
483 for (i = 0; i < re->nzones; ++i) {
484 struct reada_zone *zone = re->zones[i];
485
/* temporary reference so the put under zone->lock is never the last one */
486 kref_get(&zone->refcnt);
487 spin_lock(&zone->lock);
488 --zone->elems;
489 if (zone->elems == 0) {
490 /* no fs_info->reada_lock needed, as this can't be
491 * the last ref */
492 kref_put(&zone->refcnt, reada_zone_release);
493 }
494 spin_unlock(&zone->lock);
495
496 spin_lock(&fs_info->reada_lock);
497 kref_put(&zone->refcnt, reada_zone_release);
498 spin_unlock(&fs_info->reada_lock);
499 }
500 if (re->scheduled_for)
501 atomic_dec(&re->scheduled_for->reada_in_flight);
502
503 kfree(re);
504}
505
506static void reada_zone_release(struct kref *kref)
507{
508 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
509
510 radix_tree_delete(&zone->device->reada_zones,
511 zone->end >> PAGE_CACHE_SHIFT);
512
513 kfree(zone);
514}
515
516static void reada_control_release(struct kref *kref)
517{
518 struct reada_control *rc = container_of(kref, struct reada_control,
519 refcnt);
520
521 kfree(rc);
522}
523
524static int reada_add_block(struct reada_control *rc, u64 logical,
525 struct btrfs_key *top, int level, u64 generation)
526{
527 struct btrfs_root *root = rc->root;
528 struct reada_extent *re;
529 struct reada_extctl *rec;
530
531 re = reada_find_extent(root, logical, top, level); /* takes one ref */
532 if (!re)
533 return -1;
534
535 rec = kzalloc(sizeof(*rec), GFP_NOFS);
536 if (!rec) {
537 reada_extent_put(root->fs_info, re);
538 return -1;
539 }
540
541 rec->rc = rc;
542 rec->generation = generation;
543 atomic_inc(&rc->elems);
544
545 spin_lock(&re->lock);
546 list_add_tail(&rec->list, &re->extctl);
547 spin_unlock(&re->lock);
548
549 /* leave the ref on the extent */
550
551 return 0;
552}
553
554/*
555 * called with fs_info->reada_lock held
556 */
557static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
558{
559 int i;
560 unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
561
562 for (i = 0; i < zone->ndevs; ++i) {
563 struct reada_zone *peer;
564 peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
565 if (peer && peer->device != zone->device)
566 peer->locked = lock;
567 }
568}
569
570/*
571 * called with fs_info->reada_lock held
572 */
573static int reada_pick_zone(struct btrfs_device *dev)
574{
575 struct reada_zone *top_zone = NULL;
576 struct reada_zone *top_locked_zone = NULL;
577 u64 top_elems = 0;
578 u64 top_locked_elems = 0;
579 unsigned long index = 0;
580 int ret;
581
582 if (dev->reada_curr_zone) {
583 reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
584 kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
585 dev->reada_curr_zone = NULL;
586 }
587 /* pick the zone with the most elements */
588 while (1) {
589 struct reada_zone *zone;
590
591 ret = radix_tree_gang_lookup(&dev->reada_zones,
592 (void **)&zone, index, 1);
593 if (ret == 0)
594 break;
595 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
596 if (zone->locked) {
597 if (zone->elems > top_locked_elems) {
598 top_locked_elems = zone->elems;
599 top_locked_zone = zone;
600 }
601 } else {
602 if (zone->elems > top_elems) {
603 top_elems = zone->elems;
604 top_zone = zone;
605 }
606 }
607 }
608 if (top_zone)
609 dev->reada_curr_zone = top_zone;
610 else if (top_locked_zone)
611 dev->reada_curr_zone = top_locked_zone;
612 else
613 return 0;
614
615 dev->reada_next = dev->reada_curr_zone->start;
616 kref_get(&dev->reada_curr_zone->refcnt);
617 reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
618
619 return 1;
620}
621
622static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
623 struct btrfs_device *dev)
624{
625 struct reada_extent *re = NULL;
626 int mirror_num = 0;
627 struct extent_buffer *eb = NULL;
628 u64 logical;
629 u32 blocksize;
630 int ret;
631 int i;
632 int need_kick = 0;
633
634 spin_lock(&fs_info->reada_lock);
635 if (dev->reada_curr_zone == NULL) {
636 ret = reada_pick_zone(dev);
637 if (!ret) {
638 spin_unlock(&fs_info->reada_lock);
639 return 0;
640 }
641 }
642 /*
643 * FIXME currently we issue the reads one extent at a time. If we have
644 * a contiguous block of extents, we could also coagulate them or use
645 * plugging to speed things up
646 */
647 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
648 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
649 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
650 ret = reada_pick_zone(dev);
651 if (!ret) {
652 spin_unlock(&fs_info->reada_lock);
653 return 0;
654 }
655 re = NULL;
656 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
657 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
658 }
659 if (ret == 0) {
660 spin_unlock(&fs_info->reada_lock);
661 return 0;
662 }
663 dev->reada_next = re->logical + re->blocksize;
664 kref_get(&re->refcnt);
665
666 spin_unlock(&fs_info->reada_lock);
667
668 /*
669 * find mirror num
670 */
671 for (i = 0; i < re->nzones; ++i) {
672 if (re->zones[i]->device == dev) {
673 mirror_num = i + 1;
674 break;
675 }
676 }
677 logical = re->logical;
678 blocksize = re->blocksize;
679
680 spin_lock(&re->lock);
681 if (re->scheduled_for == NULL) {
682 re->scheduled_for = dev;
683 need_kick = 1;
684 }
685 spin_unlock(&re->lock);
686
687 reada_extent_put(fs_info, re);
688
689 if (!need_kick)
690 return 0;
691
692 atomic_inc(&dev->reada_in_flight);
693 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
694 mirror_num, &eb);
695 if (ret)
696 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
697 else if (eb)
698 __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
699
700 if (eb)
701 free_extent_buffer(eb);
702
703 return 1;
704
705}
706
707static void reada_start_machine_worker(struct btrfs_work *work)
708{
709 struct reada_machine_work *rmw;
710 struct btrfs_fs_info *fs_info;
711
712 rmw = container_of(work, struct reada_machine_work, work);
713 fs_info = rmw->fs_info;
714
715 kfree(rmw);
716
717 __reada_start_machine(fs_info);
718}
719
720static void __reada_start_machine(struct btrfs_fs_info *fs_info)
721{
722 struct btrfs_device *device;
723 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
724 u64 enqueued;
725 u64 total = 0;
726 int i;
727
728 do {
729 enqueued = 0;
730 list_for_each_entry(device, &fs_devices->devices, dev_list) {
731 if (atomic_read(&device->reada_in_flight) <
732 MAX_IN_FLIGHT)
733 enqueued += reada_start_machine_dev(fs_info,
734 device);
735 }
736 total += enqueued;
737 } while (enqueued && total < 10000);
738
739 if (enqueued == 0)
740 return;
741
742 /*
743 * If everything is already in the cache, this is effectively single
744 * threaded. To a) not hold the caller for too long and b) to utilize
745 * more cores, we broke the loop above after 10000 iterations and now
746 * enqueue to workers to finish it. This will distribute the load to
747 * the cores.
748 */
749 for (i = 0; i < 2; ++i)
750 reada_start_machine(fs_info);
751}
752
753static void reada_start_machine(struct btrfs_fs_info *fs_info)
754{
755 struct reada_machine_work *rmw;
756
757 rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
758 if (!rmw) {
759 /* FIXME we cannot handle this properly right now */
760 BUG();
761 }
762 rmw->work.func = reada_start_machine_worker;
763 rmw->fs_info = fs_info;
764
765 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
766}
767
#ifdef DEBUG
/*
 * Debug helper: print the readahead state of every device (in-flight
 * count and zones) while holding fs_info->reada_lock.  When @all is
 * non-zero, also print up to 16 extents per device and every extent in
 * fs_info->reada_tree that is currently scheduled for a device.
 */
static void dump_devs(struct btrfs_fs_info *fs_info, int all)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	unsigned long index;
	int ret;
	int i;
	int j;
	int cnt;

	spin_lock(&fs_info->reada_lock);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
			atomic_read(&device->reada_in_flight));
		index = 0;
		while (1) {
			struct reada_zone *zone;
			ret = radix_tree_gang_lookup(&device->reada_zones,
						     (void **)&zone, index, 1);
			if (ret == 0)
				break;
			printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
				"%d devs", zone->start, zone->end, zone->elems,
				zone->locked);
			for (j = 0; j < zone->ndevs; ++j) {
				printk(KERN_CONT " %lld",
					zone->devs[j]->devid);
			}
			if (device->reada_curr_zone == zone)
				printk(KERN_CONT " curr off %llu",
					device->reada_next - zone->start);
			printk(KERN_CONT "\n");
			index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
		}
		cnt = 0;
		index = 0;
		while (all) {
			struct reada_extent *re = NULL;

			ret = radix_tree_gang_lookup(&device->reada_extents,
						     (void **)&re, index, 1);
			if (ret == 0)
				break;
			printk(KERN_DEBUG
				" re: logical %llu size %u empty %d for %lld",
				re->logical, re->blocksize,
				list_empty(&re->extctl), re->scheduled_for ?
				re->scheduled_for->devid : -1);

			for (i = 0; i < re->nzones; ++i) {
				printk(KERN_CONT " zone %llu-%llu devs",
					re->zones[i]->start,
					re->zones[i]->end);
				for (j = 0; j < re->zones[i]->ndevs; ++j) {
					printk(KERN_CONT " %lld",
						re->zones[i]->devs[j]->devid);
				}
			}
			printk(KERN_CONT "\n");
			index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
			if (++cnt > 15)
				break;
		}
	}

	index = 0;
	cnt = 0;
	while (all) {
		struct reada_extent *re = NULL;

		ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
					     index, 1);
		if (ret == 0)
			break;
		/* only extents that are scheduled for a device are listed */
		if (!re->scheduled_for) {
			index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
			continue;
		}
		printk(KERN_DEBUG
			"re: logical %llu size %u list empty %d for %lld",
			re->logical, re->blocksize, list_empty(&re->extctl),
			re->scheduled_for ? re->scheduled_for->devid : -1);
		/*
		 * one header per zone followed by that zone's devices; the
		 * zone index (i) and the device index (j) must stay distinct
		 */
		for (i = 0; i < re->nzones; ++i) {
			printk(KERN_CONT " zone %llu-%llu devs",
				re->zones[i]->start,
				re->zones[i]->end);
			for (j = 0; j < re->zones[i]->ndevs; ++j) {
				printk(KERN_CONT " %lld",
					re->zones[i]->devs[j]->devid);
			}
		}
		printk(KERN_CONT "\n");
		index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
	}
	spin_unlock(&fs_info->reada_lock);
}
#endif
871
872/*
873 * interface
874 */
875struct reada_control *btrfs_reada_add(struct btrfs_root *root,
876 struct btrfs_key *key_start, struct btrfs_key *key_end)
877{
878 struct reada_control *rc;
879 u64 start;
880 u64 generation;
881 int level;
882 struct extent_buffer *node;
883 static struct btrfs_key max_key = {
884 .objectid = (u64)-1,
885 .type = (u8)-1,
886 .offset = (u64)-1
887 };
888
889 rc = kzalloc(sizeof(*rc), GFP_NOFS);
890 if (!rc)
891 return ERR_PTR(-ENOMEM);
892
893 rc->root = root;
894 rc->key_start = *key_start;
895 rc->key_end = *key_end;
896 atomic_set(&rc->elems, 0);
897 init_waitqueue_head(&rc->wait);
898 kref_init(&rc->refcnt);
899 kref_get(&rc->refcnt); /* one ref for having elements */
900
901 node = btrfs_root_node(root);
902 start = node->start;
903 level = btrfs_header_level(node);
904 generation = btrfs_header_generation(node);
905 free_extent_buffer(node);
906
907 reada_add_block(rc, start, &max_key, level, generation);
908
909 reada_start_machine(root->fs_info);
910
911 return rc;
912}
913
914#ifdef DEBUG
915int btrfs_reada_wait(void *handle)
916{
917 struct reada_control *rc = handle;
918
919 while (atomic_read(&rc->elems)) {
920 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
921 5 * HZ);
922 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
923 }
924
925 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
926
927 kref_put(&rc->refcnt, reada_control_release);
928
929 return 0;
930}
931#else
932int btrfs_reada_wait(void *handle)
933{
934 struct reada_control *rc = handle;
935
936 while (atomic_read(&rc->elems)) {
937 wait_event(rc->wait, atomic_read(&rc->elems) == 0);
938 }
939
940 kref_put(&rc->refcnt, reada_control_release);
941
942 return 0;
943}
944#endif
945
946void btrfs_reada_detach(void *handle)
947{
948 struct reada_control *rc = handle;
949
950 kref_put(&rc->refcnt, reada_control_release);
951}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273d..24d654ce7a06 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2041,8 +2041,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2041 BUG_ON(IS_ERR(trans)); 2041 BUG_ON(IS_ERR(trans));
2042 trans->block_rsv = rc->block_rsv; 2042 trans->block_rsv = rc->block_rsv;
2043 2043
2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2044 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2045 min_reserved, 0);
2046 if (ret) { 2045 if (ret) {
2047 BUG_ON(ret != -EAGAIN); 2046 BUG_ON(ret != -EAGAIN);
2048 ret = btrfs_commit_transaction(trans, root); 2047 ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2151,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2152again: 2151again:
2153 if (!err) { 2152 if (!err) {
2154 num_bytes = rc->merging_rsv_size; 2153 num_bytes = rc->merging_rsv_size;
2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2154 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2156 num_bytes);
2157 if (ret) 2155 if (ret)
2158 err = ret; 2156 err = ret;
2159 } 2157 }
@@ -2427,7 +2425,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2425 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2428 2426
2429 trans->block_rsv = rc->block_rsv; 2427 trans->block_rsv = rc->block_rsv;
2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2428 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2431 if (ret) { 2429 if (ret) {
2432 if (ret == -EAGAIN) 2430 if (ret == -EAGAIN)
2433 rc->commit_transaction = 1; 2431 rc->commit_transaction = 1;
@@ -2922,6 +2920,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2922 unsigned long last_index; 2920 unsigned long last_index;
2923 struct page *page; 2921 struct page *page;
2924 struct file_ra_state *ra; 2922 struct file_ra_state *ra;
2923 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2925 int nr = 0; 2924 int nr = 0;
2926 int ret = 0; 2925 int ret = 0;
2927 2926
@@ -2956,7 +2955,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2956 ra, NULL, index, 2955 ra, NULL, index,
2957 last_index + 1 - index); 2956 last_index + 1 - index);
2958 page = find_or_create_page(inode->i_mapping, index, 2957 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS); 2958 mask);
2960 if (!page) { 2959 if (!page) {
2961 btrfs_delalloc_release_metadata(inode, 2960 btrfs_delalloc_release_metadata(inode,
2962 PAGE_CACHE_SIZE); 2961 PAGE_CACHE_SIZE);
@@ -3323,8 +3322,11 @@ static int find_data_references(struct reloc_control *rc,
3323 } 3322 }
3324 3323
3325 key.objectid = ref_objectid; 3324 key.objectid = ref_objectid;
3326 key.offset = ref_offset;
3327 key.type = BTRFS_EXTENT_DATA_KEY; 3325 key.type = BTRFS_EXTENT_DATA_KEY;
3326 if (ref_offset > ((u64)-1 << 32))
3327 key.offset = 0;
3328 else
3329 key.offset = ref_offset;
3328 3330
3329 path->search_commit_root = 1; 3331 path->search_commit_root = 1;
3330 path->skip_locking = 1; 3332 path->skip_locking = 1;
@@ -3645,14 +3647,11 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 * btrfs_init_reloc_root will use them when there 3647 * btrfs_init_reloc_root will use them when there
3646 * is no reservation in transaction handle. 3648 * is no reservation in transaction handle.
3647 */ 3649 */
3648 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3650 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3649 rc->extent_root->nodesize * 256); 3651 rc->extent_root->nodesize * 256);
3650 if (ret) 3652 if (ret)
3651 return ret; 3653 return ret;
3652 3654
3653 rc->block_rsv->refill_used = 1;
3654 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3655
3656 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3655 memset(&rc->cluster, 0, sizeof(rc->cluster));
3657 rc->search_start = rc->block_group->key.objectid; 3656 rc->search_start = rc->block_group->key.objectid;
3658 rc->extents_found = 0; 3657 rc->extents_found = 0;
@@ -3777,8 +3776,7 @@ restart:
3777 } 3776 }
3778 } 3777 }
3779 3778
3780 ret = btrfs_block_rsv_check(trans, rc->extent_root, 3779 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3781 rc->block_rsv, 0, 5);
3782 if (ret < 0) { 3780 if (ret < 0) {
3783 if (ret != -EAGAIN) { 3781 if (ret != -EAGAIN) {
3784 err = ret; 3782 err = ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..ed11d3866afd 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
17 */ 17 */
18 18
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "volumes.h" 22#include "volumes.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "ordered-data.h" 24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
24 28
25/* 29/*
26 * This is only the first step towards a full-features scrub. It reads all 30 * This is only the first step towards a full-features scrub. It reads all
@@ -29,15 +33,12 @@
29 * any can be found. 33 * any can be found.
30 * 34 *
31 * Future enhancements: 35 * Future enhancements:
32 * - To enhance the performance, better read-ahead strategies for the
33 * extent-tree can be employed.
34 * - In case an unrepairable extent is encountered, track which files are 36 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them 37 * affected and report them
36 * - In case of a read error on files with nodatasum, map the file and read 38 * - In case of a read error on files with nodatasum, map the file and read
37 * the extent to trigger a writeback of the good copy 39 * the extent to trigger a writeback of the good copy
38 * - track and record media errors, throw out bad devices 40 * - track and record media errors, throw out bad devices
39 * - add a mode to also read unallocated space 41 * - add a mode to also read unallocated space
40 * - make the prefetch cancellable
41 */ 42 */
42 43
43struct scrub_bio; 44struct scrub_bio;
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
63struct scrub_page { 64struct scrub_page {
64 u64 flags; /* extent flags */ 65 u64 flags; /* extent flags */
65 u64 generation; 66 u64 generation;
66 u64 mirror_num; 67 int mirror_num;
67 int have_csum; 68 int have_csum;
68 u8 csum[BTRFS_CSUM_SIZE]; 69 u8 csum[BTRFS_CSUM_SIZE];
69}; 70};
@@ -87,6 +88,7 @@ struct scrub_dev {
87 int first_free; 88 int first_free;
88 int curr; 89 int curr;
89 atomic_t in_flight; 90 atomic_t in_flight;
91 atomic_t fixup_cnt;
90 spinlock_t list_lock; 92 spinlock_t list_lock;
91 wait_queue_head_t list_wait; 93 wait_queue_head_t list_wait;
92 u16 csum_size; 94 u16 csum_size;
@@ -100,6 +102,27 @@ struct scrub_dev {
100 spinlock_t stat_lock; 102 spinlock_t stat_lock;
101}; 103};
102 104
105struct scrub_fixup_nodatasum {
106 struct scrub_dev *sdev;
107 u64 logical;
108 struct btrfs_root *root;
109 struct btrfs_work work;
110 int mirror_num;
111};
112
113struct scrub_warning {
114 struct btrfs_path *path;
115 u64 extent_item_size;
116 char *scratch_buf;
117 char *msg_buf;
118 const char *errstr;
119 sector_t sector;
120 u64 logical;
121 struct btrfs_device *dev;
122 int msg_bufsize;
123 int scratch_bufsize;
124};
125
103static void scrub_free_csums(struct scrub_dev *sdev) 126static void scrub_free_csums(struct scrub_dev *sdev)
104{ 127{
105 while (!list_empty(&sdev->csum_list)) { 128 while (!list_empty(&sdev->csum_list)) {
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
175 198
176 if (i != SCRUB_BIOS_PER_DEV-1) 199 if (i != SCRUB_BIOS_PER_DEV-1)
177 sdev->bios[i]->next_free = i + 1; 200 sdev->bios[i]->next_free = i + 1;
178 else 201 else
179 sdev->bios[i]->next_free = -1; 202 sdev->bios[i]->next_free = -1;
180 } 203 }
181 sdev->first_free = 0; 204 sdev->first_free = 0;
182 sdev->curr = -1; 205 sdev->curr = -1;
183 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
207 atomic_set(&sdev->fixup_cnt, 0);
184 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
185 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
186 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
187 211
188 spin_lock_init(&sdev->list_lock); 212 spin_lock_init(&sdev->list_lock);
@@ -195,24 +219,361 @@ nomem:
195 return ERR_PTR(-ENOMEM); 219 return ERR_PTR(-ENOMEM);
196} 220}
197 221
222static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
223{
224 u64 isize;
225 u32 nlink;
226 int ret;
227 int i;
228 struct extent_buffer *eb;
229 struct btrfs_inode_item *inode_item;
230 struct scrub_warning *swarn = ctx;
231 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
232 struct inode_fs_paths *ipath = NULL;
233 struct btrfs_root *local_root;
234 struct btrfs_key root_key;
235
236 root_key.objectid = root;
237 root_key.type = BTRFS_ROOT_ITEM_KEY;
238 root_key.offset = (u64)-1;
239 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
240 if (IS_ERR(local_root)) {
241 ret = PTR_ERR(local_root);
242 goto err;
243 }
244
245 ret = inode_item_info(inum, 0, local_root, swarn->path);
246 if (ret) {
247 btrfs_release_path(swarn->path);
248 goto err;
249 }
250
251 eb = swarn->path->nodes[0];
252 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
253 struct btrfs_inode_item);
254 isize = btrfs_inode_size(eb, inode_item);
255 nlink = btrfs_inode_nlink(eb, inode_item);
256 btrfs_release_path(swarn->path);
257
258 ipath = init_ipath(4096, local_root, swarn->path);
259 ret = paths_from_inode(inum, ipath);
260
261 if (ret < 0)
262 goto err;
263
264 /*
265 * we deliberately ignore the bit ipath might have been too small to
266 * hold all of the paths here
267 */
268 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
269 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
270 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
271 "length %llu, links %u (path: %s)\n", swarn->errstr,
272 swarn->logical, swarn->dev->name,
273 (unsigned long long)swarn->sector, root, inum, offset,
274 min(isize - offset, (u64)PAGE_SIZE), nlink,
275 (char *)ipath->fspath->val[i]);
276
277 free_ipath(ipath);
278 return 0;
279
280err:
281 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
282 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
283 "resolving failed with ret=%d\n", swarn->errstr,
284 swarn->logical, swarn->dev->name,
285 (unsigned long long)swarn->sector, root, inum, offset, ret);
286
287 free_ipath(ipath);
288 return 0;
289}
290
291static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
292 int ix)
293{
294 struct btrfs_device *dev = sbio->sdev->dev;
295 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
296 struct btrfs_path *path;
297 struct btrfs_key found_key;
298 struct extent_buffer *eb;
299 struct btrfs_extent_item *ei;
300 struct scrub_warning swarn;
301 u32 item_size;
302 int ret;
303 u64 ref_root;
304 u8 ref_level;
305 unsigned long ptr = 0;
306 const int bufsize = 4096;
307 u64 extent_offset;
308
309 path = btrfs_alloc_path();
310
311 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
312 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
313 swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
314 swarn.logical = sbio->logical + ix * PAGE_SIZE;
315 swarn.errstr = errstr;
316 swarn.dev = dev;
317 swarn.msg_bufsize = bufsize;
318 swarn.scratch_bufsize = bufsize;
319
320 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
321 goto out;
322
323 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
324 if (ret < 0)
325 goto out;
326
327 extent_offset = swarn.logical - found_key.objectid;
328 swarn.extent_item_size = found_key.offset;
329
330 eb = path->nodes[0];
331 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
332 item_size = btrfs_item_size_nr(eb, path->slots[0]);
333
334 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
335 do {
336 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
337 &ref_root, &ref_level);
338 printk(KERN_WARNING "%s at logical %llu on dev %s, "
339 "sector %llu: metadata %s (level %d) in tree "
340 "%llu\n", errstr, swarn.logical, dev->name,
341 (unsigned long long)swarn.sector,
342 ref_level ? "node" : "leaf",
343 ret < 0 ? -1 : ref_level,
344 ret < 0 ? -1 : ref_root);
345 } while (ret != 1);
346 } else {
347 swarn.path = path;
348 iterate_extent_inodes(fs_info, path, found_key.objectid,
349 extent_offset,
350 scrub_print_warning_inode, &swarn);
351 }
352
353out:
354 btrfs_free_path(path);
355 kfree(swarn.scratch_buf);
356 kfree(swarn.msg_buf);
357}
358
359static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
360{
361 struct page *page = NULL;
362 unsigned long index;
363 struct scrub_fixup_nodatasum *fixup = ctx;
364 int ret;
365 int corrected = 0;
366 struct btrfs_key key;
367 struct inode *inode = NULL;
368 u64 end = offset + PAGE_SIZE - 1;
369 struct btrfs_root *local_root;
370
371 key.objectid = root;
372 key.type = BTRFS_ROOT_ITEM_KEY;
373 key.offset = (u64)-1;
374 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
375 if (IS_ERR(local_root))
376 return PTR_ERR(local_root);
377
378 key.type = BTRFS_INODE_ITEM_KEY;
379 key.objectid = inum;
380 key.offset = 0;
381 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
382 if (IS_ERR(inode))
383 return PTR_ERR(inode);
384
385 index = offset >> PAGE_CACHE_SHIFT;
386
387 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
388 if (!page) {
389 ret = -ENOMEM;
390 goto out;
391 }
392
393 if (PageUptodate(page)) {
394 struct btrfs_mapping_tree *map_tree;
395 if (PageDirty(page)) {
396 /*
397 * we need to write the data to the defect sector. the
398 * data that was in that sector is not in memory,
399 * because the page was modified. we must not write the
400 * modified page to that sector.
401 *
402 * TODO: what could be done here: wait for the delalloc
403 * runner to write out that page (might involve
404 * COW) and see whether the sector is still
405 * referenced afterwards.
406 *
407 * For the meantime, we'll treat this error
408 * incorrectable, although there is a chance that a
409 * later scrub will find the bad sector again and that
410 * there's no dirty page in memory, then.
411 */
412 ret = -EIO;
413 goto out;
414 }
415 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
416 ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
417 fixup->logical, page,
418 fixup->mirror_num);
419 unlock_page(page);
420 corrected = !ret;
421 } else {
422 /*
423 * we need to get good data first. the general readpage path
424 * will call repair_io_failure for us, we just have to make
425 * sure we read the bad mirror.
426 */
427 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
428 EXTENT_DAMAGED, GFP_NOFS);
429 if (ret) {
430 /* set_extent_bits should give proper error */
431 WARN_ON(ret > 0);
432 if (ret > 0)
433 ret = -EFAULT;
434 goto out;
435 }
436
437 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
438 btrfs_get_extent,
439 fixup->mirror_num);
440 wait_on_page_locked(page);
441
442 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
443 end, EXTENT_DAMAGED, 0, NULL);
444 if (!corrected)
445 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
446 EXTENT_DAMAGED, GFP_NOFS);
447 }
448
449out:
450 if (page)
451 put_page(page);
452 if (inode)
453 iput(inode);
454
455 if (ret < 0)
456 return ret;
457
458 if (ret == 0 && corrected) {
459 /*
460 * we only need to call readpage for one of the inodes belonging
461 * to this extent. so make iterate_extent_inodes stop
462 */
463 return 1;
464 }
465
466 return -EIO;
467}
468
469static void scrub_fixup_nodatasum(struct btrfs_work *work)
470{
471 int ret;
472 struct scrub_fixup_nodatasum *fixup;
473 struct scrub_dev *sdev;
474 struct btrfs_trans_handle *trans = NULL;
475 struct btrfs_fs_info *fs_info;
476 struct btrfs_path *path;
477 int uncorrectable = 0;
478
479 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
480 sdev = fixup->sdev;
481 fs_info = fixup->root->fs_info;
482
483 path = btrfs_alloc_path();
484 if (!path) {
485 spin_lock(&sdev->stat_lock);
486 ++sdev->stat.malloc_errors;
487 spin_unlock(&sdev->stat_lock);
488 uncorrectable = 1;
489 goto out;
490 }
491
492 trans = btrfs_join_transaction(fixup->root);
493 if (IS_ERR(trans)) {
494 uncorrectable = 1;
495 goto out;
496 }
497
498 /*
499 * the idea is to trigger a regular read through the standard path. we
500 * read a page from the (failed) logical address by specifying the
501 * corresponding copynum of the failed sector. thus, that readpage is
502 * expected to fail.
503 * that is the point where on-the-fly error correction will kick in
504 * (once it's finished) and rewrite the failed sector if a good copy
505 * can be found.
506 */
507 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
508 path, scrub_fixup_readpage,
509 fixup);
510 if (ret < 0) {
511 uncorrectable = 1;
512 goto out;
513 }
514 WARN_ON(ret != 1);
515
516 spin_lock(&sdev->stat_lock);
517 ++sdev->stat.corrected_errors;
518 spin_unlock(&sdev->stat_lock);
519
520out:
521 if (trans && !IS_ERR(trans))
522 btrfs_end_transaction(trans, fixup->root);
523 if (uncorrectable) {
524 spin_lock(&sdev->stat_lock);
525 ++sdev->stat.uncorrectable_errors;
526 spin_unlock(&sdev->stat_lock);
527 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
528 "(nodatasum) error at logical %llu\n",
529 fixup->logical);
530 }
531
532 btrfs_free_path(path);
533 kfree(fixup);
534
535 /* see caller why we're pretending to be paused in the scrub counters */
536 mutex_lock(&fs_info->scrub_lock);
537 atomic_dec(&fs_info->scrubs_running);
538 atomic_dec(&fs_info->scrubs_paused);
539 mutex_unlock(&fs_info->scrub_lock);
540 atomic_dec(&sdev->fixup_cnt);
541 wake_up(&fs_info->scrub_pause_wait);
542 wake_up(&sdev->list_wait);
543}
544
198/* 545/*
199 * scrub_recheck_error gets called when either verification of the page 546 * scrub_recheck_error gets called when either verification of the page
200 * failed or the bio failed to read, e.g. with EIO. In the latter case, 547 * failed or the bio failed to read, e.g. with EIO. In the latter case,
201 * recheck_error gets called for every page in the bio, even though only 548 * recheck_error gets called for every page in the bio, even though only
202 * one may be bad 549 * one may be bad
203 */ 550 */
204static void scrub_recheck_error(struct scrub_bio *sbio, int ix) 551static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
205{ 552{
553 struct scrub_dev *sdev = sbio->sdev;
554 u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
555 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
556 DEFAULT_RATELIMIT_BURST);
557
206 if (sbio->err) { 558 if (sbio->err) {
207 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, 559 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
208 (sbio->physical + ix * PAGE_SIZE) >> 9,
209 sbio->bio->bi_io_vec[ix].bv_page) == 0) { 560 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
210 if (scrub_fixup_check(sbio, ix) == 0) 561 if (scrub_fixup_check(sbio, ix) == 0)
211 return; 562 return 0;
212 } 563 }
564 if (__ratelimit(&_rs))
565 scrub_print_warning("i/o error", sbio, ix);
566 } else {
567 if (__ratelimit(&_rs))
568 scrub_print_warning("checksum error", sbio, ix);
213 } 569 }
214 570
571 spin_lock(&sdev->stat_lock);
572 ++sdev->stat.read_errors;
573 spin_unlock(&sdev->stat_lock);
574
215 scrub_fixup(sbio, ix); 575 scrub_fixup(sbio, ix);
576 return 1;
216} 577}
217 578
218static int scrub_fixup_check(struct scrub_bio *sbio, int ix) 579static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -250,7 +611,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
250 struct scrub_dev *sdev = sbio->sdev; 611 struct scrub_dev *sdev = sbio->sdev;
251 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 612 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
252 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 613 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
253 struct btrfs_multi_bio *multi = NULL; 614 struct btrfs_bio *bbio = NULL;
615 struct scrub_fixup_nodatasum *fixup;
254 u64 logical = sbio->logical + ix * PAGE_SIZE; 616 u64 logical = sbio->logical + ix * PAGE_SIZE;
255 u64 length; 617 u64 length;
256 int i; 618 int i;
@@ -259,38 +621,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
259 621
260 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && 622 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
261 (sbio->spag[ix].have_csum == 0)) { 623 (sbio->spag[ix].have_csum == 0)) {
624 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
625 if (!fixup)
626 goto uncorrectable;
627 fixup->sdev = sdev;
628 fixup->logical = logical;
629 fixup->root = fs_info->extent_root;
630 fixup->mirror_num = sbio->spag[ix].mirror_num;
262 /* 631 /*
263 * nodatasum, don't try to fix anything 632 * increment scrubs_running to prevent cancel requests from
264 * FIXME: we can do better, open the inode and trigger a 633 * completing as long as a fixup worker is running. we must also
265 * writeback 634 * increment scrubs_paused to prevent deadlocking on pause
635 * requests used for transactions commits (as the worker uses a
636 * transaction context). it is safe to regard the fixup worker
637 * as paused for all matters practical. effectively, we only
638 * avoid cancellation requests from completing.
266 */ 639 */
267 goto uncorrectable; 640 mutex_lock(&fs_info->scrub_lock);
641 atomic_inc(&fs_info->scrubs_running);
642 atomic_inc(&fs_info->scrubs_paused);
643 mutex_unlock(&fs_info->scrub_lock);
644 atomic_inc(&sdev->fixup_cnt);
645 fixup->work.func = scrub_fixup_nodatasum;
646 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
647 return;
268 } 648 }
269 649
270 length = PAGE_SIZE; 650 length = PAGE_SIZE;
271 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, 651 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
272 &multi, 0); 652 &bbio, 0);
273 if (ret || !multi || length < PAGE_SIZE) { 653 if (ret || !bbio || length < PAGE_SIZE) {
274 printk(KERN_ERR 654 printk(KERN_ERR
275 "scrub_fixup: btrfs_map_block failed us for %llu\n", 655 "scrub_fixup: btrfs_map_block failed us for %llu\n",
276 (unsigned long long)logical); 656 (unsigned long long)logical);
277 WARN_ON(1); 657 WARN_ON(1);
658 kfree(bbio);
278 return; 659 return;
279 } 660 }
280 661
281 if (multi->num_stripes == 1) 662 if (bbio->num_stripes == 1)
282 /* there aren't any replicas */ 663 /* there aren't any replicas */
283 goto uncorrectable; 664 goto uncorrectable;
284 665
285 /* 666 /*
286 * first find a good copy 667 * first find a good copy
287 */ 668 */
288 for (i = 0; i < multi->num_stripes; ++i) { 669 for (i = 0; i < bbio->num_stripes; ++i) {
289 if (i == sbio->spag[ix].mirror_num) 670 if (i + 1 == sbio->spag[ix].mirror_num)
290 continue; 671 continue;
291 672
292 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, 673 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
293 multi->stripes[i].physical >> 9, 674 bbio->stripes[i].physical >> 9,
294 sbio->bio->bi_io_vec[ix].bv_page)) { 675 sbio->bio->bi_io_vec[ix].bv_page)) {
295 /* I/O-error, this is not a good copy */ 676 /* I/O-error, this is not a good copy */
296 continue; 677 continue;
@@ -299,7 +680,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
299 if (scrub_fixup_check(sbio, ix) == 0) 680 if (scrub_fixup_check(sbio, ix) == 0)
300 break; 681 break;
301 } 682 }
302 if (i == multi->num_stripes) 683 if (i == bbio->num_stripes)
303 goto uncorrectable; 684 goto uncorrectable;
304 685
305 if (!sdev->readonly) { 686 if (!sdev->readonly) {
@@ -314,25 +695,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
314 } 695 }
315 } 696 }
316 697
317 kfree(multi); 698 kfree(bbio);
318 spin_lock(&sdev->stat_lock); 699 spin_lock(&sdev->stat_lock);
319 ++sdev->stat.corrected_errors; 700 ++sdev->stat.corrected_errors;
320 spin_unlock(&sdev->stat_lock); 701 spin_unlock(&sdev->stat_lock);
321 702
322 if (printk_ratelimit()) 703 printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
323 printk(KERN_ERR "btrfs: fixed up at %llu\n", 704 (unsigned long long)logical);
324 (unsigned long long)logical);
325 return; 705 return;
326 706
327uncorrectable: 707uncorrectable:
328 kfree(multi); 708 kfree(bbio);
329 spin_lock(&sdev->stat_lock); 709 spin_lock(&sdev->stat_lock);
330 ++sdev->stat.uncorrectable_errors; 710 ++sdev->stat.uncorrectable_errors;
331 spin_unlock(&sdev->stat_lock); 711 spin_unlock(&sdev->stat_lock);
332 712
333 if (printk_ratelimit()) 713 printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
334 printk(KERN_ERR "btrfs: unable to fixup at %llu\n", 714 "logical %llu\n", (unsigned long long)logical);
335 (unsigned long long)logical);
336} 715}
337 716
338static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, 717static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@ -382,8 +761,14 @@ static void scrub_checksum(struct btrfs_work *work)
382 int ret; 761 int ret;
383 762
384 if (sbio->err) { 763 if (sbio->err) {
764 ret = 0;
385 for (i = 0; i < sbio->count; ++i) 765 for (i = 0; i < sbio->count; ++i)
386 scrub_recheck_error(sbio, i); 766 ret |= scrub_recheck_error(sbio, i);
767 if (!ret) {
768 spin_lock(&sdev->stat_lock);
769 ++sdev->stat.unverified_errors;
770 spin_unlock(&sdev->stat_lock);
771 }
387 772
388 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 773 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
389 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 774 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -396,10 +781,6 @@ static void scrub_checksum(struct btrfs_work *work)
396 bi->bv_offset = 0; 781 bi->bv_offset = 0;
397 bi->bv_len = PAGE_SIZE; 782 bi->bv_len = PAGE_SIZE;
398 } 783 }
399
400 spin_lock(&sdev->stat_lock);
401 ++sdev->stat.read_errors;
402 spin_unlock(&sdev->stat_lock);
403 goto out; 784 goto out;
404 } 785 }
405 for (i = 0; i < sbio->count; ++i) { 786 for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +801,14 @@ static void scrub_checksum(struct btrfs_work *work)
420 WARN_ON(1); 801 WARN_ON(1);
421 } 802 }
422 kunmap_atomic(buffer, KM_USER0); 803 kunmap_atomic(buffer, KM_USER0);
423 if (ret) 804 if (ret) {
424 scrub_recheck_error(sbio, i); 805 ret = scrub_recheck_error(sbio, i);
806 if (!ret) {
807 spin_lock(&sdev->stat_lock);
808 ++sdev->stat.unverified_errors;
809 spin_unlock(&sdev->stat_lock);
810 }
811 }
425 } 812 }
426 813
427out: 814out:
@@ -604,7 +991,7 @@ nomem:
604} 991}
605 992
606static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 993static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
607 u64 physical, u64 flags, u64 gen, u64 mirror_num, 994 u64 physical, u64 flags, u64 gen, int mirror_num,
608 u8 *csum, int force) 995 u8 *csum, int force)
609{ 996{
610 struct scrub_bio *sbio; 997 struct scrub_bio *sbio;
@@ -701,7 +1088,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
701 1088
702/* scrub extent tries to collect up to 64 kB for each bio */ 1089/* scrub extent tries to collect up to 64 kB for each bio */
703static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 1090static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
704 u64 physical, u64 flags, u64 gen, u64 mirror_num) 1091 u64 physical, u64 flags, u64 gen, int mirror_num)
705{ 1092{
706 int ret; 1093 int ret;
707 u8 csum[BTRFS_CSUM_SIZE]; 1094 u8 csum[BTRFS_CSUM_SIZE];
@@ -741,13 +1128,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
741 int slot; 1128 int slot;
742 int i; 1129 int i;
743 u64 nstripes; 1130 u64 nstripes;
744 int start_stripe;
745 struct extent_buffer *l; 1131 struct extent_buffer *l;
746 struct btrfs_key key; 1132 struct btrfs_key key;
747 u64 physical; 1133 u64 physical;
748 u64 logical; 1134 u64 logical;
749 u64 generation; 1135 u64 generation;
750 u64 mirror_num; 1136 int mirror_num;
1137 struct reada_control *reada1;
1138 struct reada_control *reada2;
1139 struct btrfs_key key_start;
1140 struct btrfs_key key_end;
751 1141
752 u64 increment = map->stripe_len; 1142 u64 increment = map->stripe_len;
753 u64 offset; 1143 u64 offset;
@@ -758,102 +1148,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1148 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num; 1149 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes; 1150 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0; 1151 mirror_num = 1;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1152 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes; 1153 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes); 1154 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor; 1155 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes; 1156 mirror_num = num % map->sub_stripes + 1;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1157 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len; 1158 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes; 1159 mirror_num = num % map->num_stripes + 1;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1160 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len; 1161 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes; 1162 mirror_num = num % map->num_stripes + 1;
773 } else { 1163 } else {
774 increment = map->stripe_len; 1164 increment = map->stripe_len;
775 mirror_num = 0; 1165 mirror_num = 1;
776 } 1166 }
777 1167
778 path = btrfs_alloc_path(); 1168 path = btrfs_alloc_path();
779 if (!path) 1169 if (!path)
780 return -ENOMEM; 1170 return -ENOMEM;
781 1171
782 path->reada = 2;
783 path->search_commit_root = 1; 1172 path->search_commit_root = 1;
784 path->skip_locking = 1; 1173 path->skip_locking = 1;
785 1174
786 /* 1175 /*
787 * find all extents for each stripe and just read them to get 1176 * trigger the readahead for extent tree csum tree and wait for
788 * them into the page cache 1177 * completion. During readahead, the scrub is officially paused
789 * FIXME: we can do better. build a more intelligent prefetching 1178 * to not hold off transaction commits
790 */ 1179 */
791 logical = base + offset; 1180 logical = base + offset;
792 physical = map->stripes[num].physical;
793 ret = 0;
794 for (i = 0; i < nstripes; ++i) {
795 key.objectid = logical;
796 key.type = BTRFS_EXTENT_ITEM_KEY;
797 key.offset = (u64)0;
798
799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
800 if (ret < 0)
801 goto out_noplug;
802
803 /*
804 * we might miss half an extent here, but that doesn't matter,
805 * as it's only the prefetch
806 */
807 while (1) {
808 l = path->nodes[0];
809 slot = path->slots[0];
810 if (slot >= btrfs_header_nritems(l)) {
811 ret = btrfs_next_leaf(root, path);
812 if (ret == 0)
813 continue;
814 if (ret < 0)
815 goto out_noplug;
816 1181
817 break; 1182 wait_event(sdev->list_wait,
818 } 1183 atomic_read(&sdev->in_flight) == 0);
819 btrfs_item_key_to_cpu(l, &key, slot); 1184 atomic_inc(&fs_info->scrubs_paused);
1185 wake_up(&fs_info->scrub_pause_wait);
820 1186
821 if (key.objectid >= logical + map->stripe_len) 1187 /* FIXME it might be better to start readahead at commit root */
822 break; 1188 key_start.objectid = logical;
1189 key_start.type = BTRFS_EXTENT_ITEM_KEY;
1190 key_start.offset = (u64)0;
1191 key_end.objectid = base + offset + nstripes * increment;
1192 key_end.type = BTRFS_EXTENT_ITEM_KEY;
1193 key_end.offset = (u64)0;
1194 reada1 = btrfs_reada_add(root, &key_start, &key_end);
1195
1196 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1197 key_start.type = BTRFS_EXTENT_CSUM_KEY;
1198 key_start.offset = logical;
1199 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1200 key_end.type = BTRFS_EXTENT_CSUM_KEY;
1201 key_end.offset = base + offset + nstripes * increment;
1202 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1203
1204 if (!IS_ERR(reada1))
1205 btrfs_reada_wait(reada1);
1206 if (!IS_ERR(reada2))
1207 btrfs_reada_wait(reada2);
823 1208
824 path->slots[0]++; 1209 mutex_lock(&fs_info->scrub_lock);
825 } 1210 while (atomic_read(&fs_info->scrub_pause_req)) {
826 btrfs_release_path(path); 1211 mutex_unlock(&fs_info->scrub_lock);
827 logical += increment; 1212 wait_event(fs_info->scrub_pause_wait,
828 physical += map->stripe_len; 1213 atomic_read(&fs_info->scrub_pause_req) == 0);
829 cond_resched(); 1214 mutex_lock(&fs_info->scrub_lock);
830 } 1215 }
1216 atomic_dec(&fs_info->scrubs_paused);
1217 mutex_unlock(&fs_info->scrub_lock);
1218 wake_up(&fs_info->scrub_pause_wait);
831 1219
832 /* 1220 /*
833 * collect all data csums for the stripe to avoid seeking during 1221 * collect all data csums for the stripe to avoid seeking during
834 * the scrub. This might currently (crc32) end up to be about 1MB 1222 * the scrub. This might currently (crc32) end up to be about 1MB
835 */ 1223 */
836 start_stripe = 0;
837 blk_start_plug(&plug); 1224 blk_start_plug(&plug);
838again:
839 logical = base + offset + start_stripe * increment;
840 for (i = start_stripe; i < nstripes; ++i) {
841 ret = btrfs_lookup_csums_range(csum_root, logical,
842 logical + map->stripe_len - 1,
843 &sdev->csum_list, 1);
844 if (ret)
845 goto out;
846 1225
847 logical += increment;
848 cond_resched();
849 }
850 /* 1226 /*
851 * now find all extents for each stripe and scrub them 1227 * now find all extents for each stripe and scrub them
852 */ 1228 */
853 logical = base + offset + start_stripe * increment; 1229 logical = base + offset;
854 physical = map->stripes[num].physical + start_stripe * map->stripe_len; 1230 physical = map->stripes[num].physical;
855 ret = 0; 1231 ret = 0;
856 for (i = start_stripe; i < nstripes; ++i) { 1232 for (i = 0; i < nstripes; ++i) {
857 /* 1233 /*
858 * canceled? 1234 * canceled?
859 */ 1235 */
@@ -882,11 +1258,14 @@ again:
882 atomic_dec(&fs_info->scrubs_paused); 1258 atomic_dec(&fs_info->scrubs_paused);
883 mutex_unlock(&fs_info->scrub_lock); 1259 mutex_unlock(&fs_info->scrub_lock);
884 wake_up(&fs_info->scrub_pause_wait); 1260 wake_up(&fs_info->scrub_pause_wait);
885 scrub_free_csums(sdev);
886 start_stripe = i;
887 goto again;
888 } 1261 }
889 1262
1263 ret = btrfs_lookup_csums_range(csum_root, logical,
1264 logical + map->stripe_len - 1,
1265 &sdev->csum_list, 1);
1266 if (ret)
1267 goto out;
1268
890 key.objectid = logical; 1269 key.objectid = logical;
891 key.type = BTRFS_EXTENT_ITEM_KEY; 1270 key.type = BTRFS_EXTENT_ITEM_KEY;
892 key.offset = (u64)0; 1271 key.offset = (u64)0;
@@ -982,7 +1361,6 @@ next:
982 1361
983out: 1362out:
984 blk_finish_plug(&plug); 1363 blk_finish_plug(&plug);
985out_noplug:
986 btrfs_free_path(path); 1364 btrfs_free_path(path);
987 return ret < 0 ? ret : 0; 1365 return ret < 0 ? ret : 0;
988} 1366}
@@ -1253,10 +1631,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1253 ret = scrub_enumerate_chunks(sdev, start, end); 1631 ret = scrub_enumerate_chunks(sdev, start, end);
1254 1632
1255 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 1633 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1256
1257 atomic_dec(&fs_info->scrubs_running); 1634 atomic_dec(&fs_info->scrubs_running);
1258 wake_up(&fs_info->scrub_pause_wait); 1635 wake_up(&fs_info->scrub_pause_wait);
1259 1636
1637 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1638
1260 if (progress) 1639 if (progress)
1261 memcpy(progress, &sdev->stat, sizeof(*progress)); 1640 memcpy(progress, &sdev->stat, sizeof(*progress));
1262 1641
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d7..57080dffdfc6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
43#include "compat.h" 44#include "compat.h"
44#include "delayed-inode.h" 45#include "delayed-inode.h"
45#include "ctree.h" 46#include "ctree.h"
@@ -58,6 +59,7 @@
58#include <trace/events/btrfs.h> 59#include <trace/events/btrfs.h>
59 60
60static const struct super_operations btrfs_super_ops; 61static const struct super_operations btrfs_super_ops;
62static struct file_system_type btrfs_fs_type;
61 63
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 64static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16]) 65 char nbuf[16])
@@ -162,7 +164,7 @@ enum {
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err, 167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
166}; 168};
167 169
168static match_table_t tokens = { 170static match_table_t tokens = {
@@ -195,6 +197,8 @@ static match_table_t tokens = {
195 {Opt_subvolrootid, "subvolrootid=%d"}, 197 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"}, 198 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"}, 199 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "no_space_cache"},
201 {Opt_recovery, "recovery"},
198 {Opt_err, NULL}, 202 {Opt_err, NULL},
199}; 203};
200 204
@@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206{ 210{
207 struct btrfs_fs_info *info = root->fs_info; 211 struct btrfs_fs_info *info = root->fs_info;
208 substring_t args[MAX_OPT_ARGS]; 212 substring_t args[MAX_OPT_ARGS];
209 char *p, *num, *orig; 213 char *p, *num, *orig = NULL;
214 u64 cache_gen;
210 int intarg; 215 int intarg;
211 int ret = 0; 216 int ret = 0;
212 char *compress_type; 217 char *compress_type;
213 bool compress_force = false; 218 bool compress_force = false;
214 219
220 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
221 if (cache_gen)
222 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
223
215 if (!options) 224 if (!options)
216 return 0; 225 goto out;
217 226
218 /* 227 /*
219 * strsep changes the string, duplicate it because parse_options 228 * strsep changes the string, duplicate it because parse_options
@@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
360 btrfs_set_opt(info->mount_opt, DISCARD); 369 btrfs_set_opt(info->mount_opt, DISCARD);
361 break; 370 break;
362 case Opt_space_cache: 371 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 372 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break; 373 break;
374 case Opt_no_space_cache:
375 printk(KERN_INFO "btrfs: disabling disk space caching\n");
376 btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
377 break;
366 case Opt_inode_cache: 378 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 379 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 380 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
381 printk(KERN_INFO "btrfs: enabling auto defrag"); 393 printk(KERN_INFO "btrfs: enabling auto defrag");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 394 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break; 395 break;
396 case Opt_recovery:
397 printk(KERN_INFO "btrfs: enabling auto recovery");
398 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break;
384 case Opt_err: 400 case Opt_err:
385 printk(KERN_INFO "btrfs: unrecognized mount option " 401 printk(KERN_INFO "btrfs: unrecognized mount option "
386 "'%s'\n", p); 402 "'%s'\n", p);
@@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
391 } 407 }
392 } 408 }
393out: 409out:
410 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
411 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
394 kfree(orig); 412 kfree(orig);
395 return ret; 413 return ret;
396} 414}
@@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) 424 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
407{ 425{
408 substring_t args[MAX_OPT_ARGS]; 426 substring_t args[MAX_OPT_ARGS];
409 char *opts, *orig, *p; 427 char *device_name, *opts, *orig, *p;
410 int error = 0; 428 int error = 0;
411 int intarg; 429 int intarg;
412 430
413 if (!options) 431 if (!options)
414 goto out; 432 return 0;
415 433
416 /* 434 /*
417 * strsep changes the string, duplicate it because parse_options 435 * strsep changes the string, duplicate it because parse_options
@@ -457,29 +475,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
457 } 475 }
458 break; 476 break;
459 case Opt_device: 477 case Opt_device:
460 error = btrfs_scan_one_device(match_strdup(&args[0]), 478 device_name = match_strdup(&args[0]);
479 if (!device_name) {
480 error = -ENOMEM;
481 goto out;
482 }
483 error = btrfs_scan_one_device(device_name,
461 flags, holder, fs_devices); 484 flags, holder, fs_devices);
485 kfree(device_name);
462 if (error) 486 if (error)
463 goto out_free_opts; 487 goto out;
464 break; 488 break;
465 default: 489 default:
466 break; 490 break;
467 } 491 }
468 } 492 }
469 493
470 out_free_opts: 494out:
471 kfree(orig); 495 kfree(orig);
472 out:
473 /*
474 * If no subvolume name is specified we use the default one. Allocate
475 * a copy of the string "." here so that code later in the
476 * mount path doesn't care if it's the default volume or another one.
477 */
478 if (!*subvol_name) {
479 *subvol_name = kstrdup(".", GFP_KERNEL);
480 if (!*subvol_name)
481 return -ENOMEM;
482 }
483 return error; 496 return error;
484} 497}
485 498
@@ -492,7 +505,6 @@ static struct dentry *get_default_root(struct super_block *sb,
492 struct btrfs_path *path; 505 struct btrfs_path *path;
493 struct btrfs_key location; 506 struct btrfs_key location;
494 struct inode *inode; 507 struct inode *inode;
495 struct dentry *dentry;
496 u64 dir_id; 508 u64 dir_id;
497 int new = 0; 509 int new = 0;
498 510
@@ -517,7 +529,7 @@ static struct dentry *get_default_root(struct super_block *sb,
517 * will mount by default if we haven't been given a specific subvolume 529 * will mount by default if we haven't been given a specific subvolume
518 * to mount. 530 * to mount.
519 */ 531 */
520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 532 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 533 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
522 if (IS_ERR(di)) { 534 if (IS_ERR(di)) {
523 btrfs_free_path(path); 535 btrfs_free_path(path);
@@ -566,29 +578,7 @@ setup_root:
566 return dget(sb->s_root); 578 return dget(sb->s_root);
567 } 579 }
568 580
569 if (new) { 581 return d_obtain_alias(inode);
570 const struct qstr name = { .name = "/", .len = 1 };
571
572 /*
573 * New inode, we need to make the dentry a sibling of s_root so
574 * everything gets cleaned up properly on unmount.
575 */
576 dentry = d_alloc(sb->s_root, &name);
577 if (!dentry) {
578 iput(inode);
579 return ERR_PTR(-ENOMEM);
580 }
581 d_splice_alias(inode, dentry);
582 } else {
583 /*
584 * We found the inode in cache, just find a dentry for it and
585 * put the reference to the inode we just got.
586 */
587 dentry = d_find_alias(inode);
588 iput(inode);
589 }
590
591 return dentry;
592} 582}
593 583
594static int btrfs_fill_super(struct super_block *sb, 584static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +709,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
719 seq_puts(seq, ",noacl"); 709 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE)) 710 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache"); 711 seq_puts(seq, ",space_cache");
712 else
713 seq_puts(seq, ",no_space_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE)) 714 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache"); 715 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 716 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +745,137 @@ static int btrfs_set_super(struct super_block *s, void *data)
753 return set_anon_super(s, data); 745 return set_anon_super(s, data);
754} 746}
755 747
748/*
749 * subvolumes are identified by ino 256
750 */
751static inline int is_subvolume_inode(struct inode *inode)
752{
753 if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
754 return 1;
755 return 0;
756}
757
758/*
759 * This will strip out the subvol=%s argument for an argument string and add
760 * subvolid=0 to make sure we get the actual tree root for path walking to the
761 * subvol we want.
762 */
763static char *setup_root_args(char *args)
764{
765 unsigned copied = 0;
766 unsigned len = strlen(args) + 2;
767 char *pos;
768 char *ret;
769
770 /*
771 * We need the same args as before, but minus
772 *
773 * subvol=a
774 *
775 * and add
776 *
777 * subvolid=0
778 *
779 * which is a difference of 2 characters, so we allocate strlen(args) +
780 * 2 characters.
781 */
782 ret = kzalloc(len * sizeof(char), GFP_NOFS);
783 if (!ret)
784 return NULL;
785 pos = strstr(args, "subvol=");
786
787 /* This shouldn't happen, but just in case.. */
788 if (!pos) {
789 kfree(ret);
790 return NULL;
791 }
792
793 /*
794 * The subvol=<> arg is not at the front of the string, copy everybody
795 * up to that into ret.
796 */
797 if (pos != args) {
798 *pos = '\0';
799 strcpy(ret, args);
800 copied += strlen(args);
801 pos++;
802 }
803
804 strncpy(ret + copied, "subvolid=0", len - copied);
805
806 /* Length of subvolid=0 */
807 copied += 10;
808
809 /*
810 * If there is no , after the subvol= option then we know there's no
811 * other options and we can just return.
812 */
813 pos = strchr(pos, ',');
814 if (!pos)
815 return ret;
816
817 /* Copy the rest of the arguments into our buffer */
818 strncpy(ret + copied, pos, len - copied);
819 copied += strlen(pos);
820
821 return ret;
822}
823
824static struct dentry *mount_subvol(const char *subvol_name, int flags,
825 const char *device_name, char *data)
826{
827 struct super_block *s;
828 struct dentry *root;
829 struct vfsmount *mnt;
830 struct mnt_namespace *ns_private;
831 char *newargs;
832 struct path path;
833 int error;
834
835 newargs = setup_root_args(data);
836 if (!newargs)
837 return ERR_PTR(-ENOMEM);
838 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
839 newargs);
840 kfree(newargs);
841 if (IS_ERR(mnt))
842 return ERR_CAST(mnt);
843
844 ns_private = create_mnt_ns(mnt);
845 if (IS_ERR(ns_private)) {
846 mntput(mnt);
847 return ERR_CAST(ns_private);
848 }
849
850 /*
851 * This will trigger the automount of the subvol so we can just
852 * drop the mnt we have here and return the dentry that we
853 * found.
854 */
855 error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
856 LOOKUP_FOLLOW, &path);
857 put_mnt_ns(ns_private);
858 if (error)
859 return ERR_PTR(error);
860
861 if (!is_subvolume_inode(path.dentry->d_inode)) {
862 path_put(&path);
863 mntput(mnt);
864 error = -EINVAL;
865 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
866 subvol_name);
867 return ERR_PTR(-EINVAL);
868 }
869
870 /* Get a ref to the sb and the dentry we found and return it */
871 s = path.mnt->mnt_sb;
872 atomic_inc(&s->s_active);
873 root = dget(path.dentry);
874 path_put(&path);
875 down_write(&s->s_umount);
876
877 return root;
878}
756 879
757/* 880/*
758 * Find a superblock for the given device / mount point. 881 * Find a superblock for the given device / mount point.
@@ -784,13 +907,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
784 if (error) 907 if (error)
785 return ERR_PTR(error); 908 return ERR_PTR(error);
786 909
910 if (subvol_name) {
911 root = mount_subvol(subvol_name, flags, device_name, data);
912 kfree(subvol_name);
913 return root;
914 }
915
787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 916 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
788 if (error) 917 if (error)
789 goto error_free_subvol_name; 918 return ERR_PTR(error);
790 919
791 error = btrfs_open_devices(fs_devices, mode, fs_type); 920 error = btrfs_open_devices(fs_devices, mode, fs_type);
792 if (error) 921 if (error)
793 goto error_free_subvol_name; 922 return ERR_PTR(error);
794 923
795 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { 924 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
796 error = -EACCES; 925 error = -EACCES;
@@ -813,88 +942,57 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
813 fs_info->fs_devices = fs_devices; 942 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info; 943 tree_root->fs_info = fs_info;
815 944
945 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
946 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
947 if (!fs_info->super_copy || !fs_info->super_for_commit) {
948 error = -ENOMEM;
949 goto error_close_devices;
950 }
951
816 bdev = fs_devices->latest_bdev; 952 bdev = fs_devices->latest_bdev;
817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 953 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
818 if (IS_ERR(s)) 954 if (IS_ERR(s)) {
819 goto error_s; 955 error = PTR_ERR(s);
956 goto error_close_devices;
957 }
820 958
821 if (s->s_root) { 959 if (s->s_root) {
822 if ((flags ^ s->s_flags) & MS_RDONLY) { 960 if ((flags ^ s->s_flags) & MS_RDONLY) {
823 deactivate_locked_super(s); 961 deactivate_locked_super(s);
824 error = -EBUSY; 962 return ERR_PTR(-EBUSY);
825 goto error_close_devices;
826 } 963 }
827 964
828 btrfs_close_devices(fs_devices); 965 btrfs_close_devices(fs_devices);
829 kfree(fs_info); 966 free_fs_info(fs_info);
830 kfree(tree_root); 967 kfree(tree_root);
831 } else { 968 } else {
832 char b[BDEVNAME_SIZE]; 969 char b[BDEVNAME_SIZE];
833 970
834 s->s_flags = flags | MS_NOSEC; 971 s->s_flags = flags | MS_NOSEC;
835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 972 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
973 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
836 error = btrfs_fill_super(s, fs_devices, data, 974 error = btrfs_fill_super(s, fs_devices, data,
837 flags & MS_SILENT ? 1 : 0); 975 flags & MS_SILENT ? 1 : 0);
838 if (error) { 976 if (error) {
839 deactivate_locked_super(s); 977 deactivate_locked_super(s);
840 goto error_free_subvol_name; 978 return ERR_PTR(error);
841 } 979 }
842 980
843 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
844 s->s_flags |= MS_ACTIVE; 981 s->s_flags |= MS_ACTIVE;
845 } 982 }
846 983
847 /* if they gave us a subvolume name bind mount into that */ 984 root = get_default_root(s, subvol_objectid);
848 if (strcmp(subvol_name, ".")) { 985 if (IS_ERR(root)) {
849 struct dentry *new_root; 986 deactivate_locked_super(s);
850 987 return root;
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
858 mutex_lock(&root->d_inode->i_mutex);
859 new_root = lookup_one_len(subvol_name, root,
860 strlen(subvol_name));
861 mutex_unlock(&root->d_inode->i_mutex);
862
863 if (IS_ERR(new_root)) {
864 dput(root);
865 deactivate_locked_super(s);
866 error = PTR_ERR(new_root);
867 goto error_free_subvol_name;
868 }
869 if (!new_root->d_inode) {
870 dput(root);
871 dput(new_root);
872 deactivate_locked_super(s);
873 error = -ENXIO;
874 goto error_free_subvol_name;
875 }
876 dput(root);
877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
885 } 988 }
886 989
887 kfree(subvol_name);
888 return root; 990 return root;
889 991
890error_s:
891 error = PTR_ERR(s);
892error_close_devices: 992error_close_devices:
893 btrfs_close_devices(fs_devices); 993 btrfs_close_devices(fs_devices);
894 kfree(fs_info); 994 free_fs_info(fs_info);
895 kfree(tree_root); 995 kfree(tree_root);
896error_free_subvol_name:
897 kfree(subvol_name);
898 return ERR_PTR(error); 996 return ERR_PTR(error);
899} 997}
900 998
@@ -919,7 +1017,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
919 if (root->fs_info->fs_devices->rw_devices == 0) 1017 if (root->fs_info->fs_devices->rw_devices == 0)
920 return -EACCES; 1018 return -EACCES;
921 1019
922 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 1020 if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
923 return -EINVAL; 1021 return -EINVAL;
924 1022
925 ret = btrfs_cleanup_fs_roots(root->fs_info); 1023 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -1085,7 +1183,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1183static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1086{ 1184{
1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1185 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
1088 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1186 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1089 struct list_head *head = &root->fs_info->space_info; 1187 struct list_head *head = &root->fs_info->space_info;
1090 struct btrfs_space_info *found; 1188 struct btrfs_space_info *found;
1091 u64 total_used = 0; 1189 u64 total_used = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a155..960835eaf4da 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
55 struct btrfs_transaction *cur_trans; 55 struct btrfs_transaction *cur_trans;
56 56
57 spin_lock(&root->fs_info->trans_lock); 57 spin_lock(&root->fs_info->trans_lock);
58loop:
58 if (root->fs_info->trans_no_join) { 59 if (root->fs_info->trans_no_join) {
59 if (!nofail) { 60 if (!nofail) {
60 spin_unlock(&root->fs_info->trans_lock); 61 spin_unlock(&root->fs_info->trans_lock);
@@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
75 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 76 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
76 if (!cur_trans) 77 if (!cur_trans)
77 return -ENOMEM; 78 return -ENOMEM;
79
78 spin_lock(&root->fs_info->trans_lock); 80 spin_lock(&root->fs_info->trans_lock);
79 if (root->fs_info->running_transaction) { 81 if (root->fs_info->running_transaction) {
82 /*
83 * someone started a transaction after we unlocked. Make sure
84 * to redo the trans_no_join checks above
85 */
80 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 86 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
81 cur_trans = root->fs_info->running_transaction; 87 cur_trans = root->fs_info->running_transaction;
82 atomic_inc(&cur_trans->use_count); 88 goto loop;
83 atomic_inc(&cur_trans->num_writers);
84 cur_trans->num_joined++;
85 spin_unlock(&root->fs_info->trans_lock);
86 return 0;
87 } 89 }
90
88 atomic_set(&cur_trans->num_writers, 1); 91 atomic_set(&cur_trans->num_writers, 1);
89 cur_trans->num_joined = 0; 92 cur_trans->num_joined = 0;
90 init_waitqueue_head(&cur_trans->writer_wait); 93 init_waitqueue_head(&cur_trans->writer_wait);
@@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
275 */ 278 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) { 279 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 280 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root, 281 ret = btrfs_block_rsv_add(root,
279 &root->fs_info->trans_block_rsv, 282 &root->fs_info->trans_block_rsv,
280 num_bytes); 283 num_bytes);
281 if (ret) 284 if (ret)
@@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 421 struct btrfs_root *root)
419{ 422{
420 int ret; 423 int ret;
421 ret = btrfs_block_rsv_check(trans, root, 424
422 &root->fs_info->global_block_rsv, 0, 5); 425 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
423 return ret ? 1 : 0; 426 return ret ? 1 : 0;
424} 427}
425 428
@@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root) 430 struct btrfs_root *root)
428{ 431{
429 struct btrfs_transaction *cur_trans = trans->transaction; 432 struct btrfs_transaction *cur_trans = trans->transaction;
433 struct btrfs_block_rsv *rsv = trans->block_rsv;
430 int updates; 434 int updates;
431 435
432 smp_mb(); 436 smp_mb();
433 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 437 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
434 return 1; 438 return 1;
435 439
440 /*
441 * We need to do this in case we're deleting csums so the global block
442 * rsv get's used instead of the csum block rsv.
443 */
444 trans->block_rsv = NULL;
445
436 updates = trans->delayed_ref_updates; 446 updates = trans->delayed_ref_updates;
437 trans->delayed_ref_updates = 0; 447 trans->delayed_ref_updates = 0;
438 if (updates) 448 if (updates)
439 btrfs_run_delayed_refs(trans, root, updates); 449 btrfs_run_delayed_refs(trans, root, updates);
440 450
451 trans->block_rsv = rsv;
452
441 return should_end_transaction(trans, root); 453 return should_end_transaction(trans, root);
442} 454}
443 455
@@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
453 return 0; 465 return 0;
454 } 466 }
455 467
468 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL;
456 while (count < 4) { 470 while (count < 4) {
457 unsigned long cur = trans->delayed_ref_updates; 471 unsigned long cur = trans->delayed_ref_updates;
458 trans->delayed_ref_updates = 0; 472 trans->delayed_ref_updates = 0;
@@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
473 count++; 487 count++;
474 } 488 }
475 489
476 btrfs_trans_release_metadata(trans, root);
477
478 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 490 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
479 should_end_transaction(trans, root)) { 491 should_end_transaction(trans, root)) {
480 trans->transaction->blocked = 1; 492 trans->transaction->blocked = 1;
@@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562int btrfs_write_marked_extents(struct btrfs_root *root, 574int btrfs_write_marked_extents(struct btrfs_root *root,
563 struct extent_io_tree *dirty_pages, int mark) 575 struct extent_io_tree *dirty_pages, int mark)
564{ 576{
565 int ret;
566 int err = 0; 577 int err = 0;
567 int werr = 0; 578 int werr = 0;
568 struct page *page; 579 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
569 struct inode *btree_inode = root->fs_info->btree_inode;
570 u64 start = 0; 580 u64 start = 0;
571 u64 end; 581 u64 end;
572 unsigned long index;
573
574 while (1) {
575 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
576 mark);
577 if (ret)
578 break;
579 while (start <= end) {
580 cond_resched();
581
582 index = start >> PAGE_CACHE_SHIFT;
583 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
584 page = find_get_page(btree_inode->i_mapping, index);
585 if (!page)
586 continue;
587
588 btree_lock_page_hook(page);
589 if (!page->mapping) {
590 unlock_page(page);
591 page_cache_release(page);
592 continue;
593 }
594 582
595 if (PageWriteback(page)) { 583 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
596 if (PageDirty(page)) 584 mark)) {
597 wait_on_page_writeback(page); 585 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
598 else { 586 GFP_NOFS);
599 unlock_page(page); 587 err = filemap_fdatawrite_range(mapping, start, end);
600 page_cache_release(page); 588 if (err)
601 continue; 589 werr = err;
602 } 590 cond_resched();
603 } 591 start = end + 1;
604 err = write_one_page(page, 0);
605 if (err)
606 werr = err;
607 page_cache_release(page);
608 }
609 } 592 }
610 if (err) 593 if (err)
611 werr = err; 594 werr = err;
@@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
621int btrfs_wait_marked_extents(struct btrfs_root *root, 604int btrfs_wait_marked_extents(struct btrfs_root *root,
622 struct extent_io_tree *dirty_pages, int mark) 605 struct extent_io_tree *dirty_pages, int mark)
623{ 606{
624 int ret;
625 int err = 0; 607 int err = 0;
626 int werr = 0; 608 int werr = 0;
627 struct page *page; 609 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
628 struct inode *btree_inode = root->fs_info->btree_inode;
629 u64 start = 0; 610 u64 start = 0;
630 u64 end; 611 u64 end;
631 unsigned long index;
632
633 while (1) {
634 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
635 mark);
636 if (ret)
637 break;
638 612
639 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 613 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
640 while (start <= end) { 614 EXTENT_NEED_WAIT)) {
641 index = start >> PAGE_CACHE_SHIFT; 615 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
642 start = (u64)(index + 1) << PAGE_CACHE_SHIFT; 616 err = filemap_fdatawait_range(mapping, start, end);
643 page = find_get_page(btree_inode->i_mapping, index); 617 if (err)
644 if (!page) 618 werr = err;
645 continue; 619 cond_resched();
646 if (PageDirty(page)) { 620 start = end + 1;
647 btree_lock_page_hook(page);
648 wait_on_page_writeback(page);
649 err = write_one_page(page, 0);
650 if (err)
651 werr = err;
652 }
653 wait_on_page_writeback(page);
654 page_cache_release(page);
655 cond_resched();
656 }
657 } 621 }
658 if (err) 622 if (err)
659 werr = err; 623 werr = err;
@@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
673 637
674 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 638 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
675 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 639 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
676 return ret || ret2; 640
641 if (ret)
642 return ret;
643 if (ret2)
644 return ret2;
645 return 0;
677} 646}
678 647
679int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 648int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -911,10 +880,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
911 } 880 }
912 881
913 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 882 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
914 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
915 883
916 if (to_reserve > 0) { 884 if (to_reserve > 0) {
917 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 885 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
918 to_reserve); 886 to_reserve);
919 if (ret) { 887 if (ret) {
920 pending->error = ret; 888 pending->error = ret;
@@ -1002,7 +970,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 BUG_ON(IS_ERR(pending->snap)); 970 BUG_ON(IS_ERR(pending->snap));
1003 971
1004 btrfs_reloc_post_snapshot(trans, pending); 972 btrfs_reloc_post_snapshot(trans, pending);
1005 btrfs_orphan_post_snapshot(trans, pending);
1006fail: 973fail:
1007 kfree(new_root_item); 974 kfree(new_root_item);
1008 trans->block_rsv = rsv; 975 trans->block_rsv = rsv;
@@ -1032,7 +999,7 @@ static void update_super_roots(struct btrfs_root *root)
1032 struct btrfs_root_item *root_item; 999 struct btrfs_root_item *root_item;
1033 struct btrfs_super_block *super; 1000 struct btrfs_super_block *super;
1034 1001
1035 super = &root->fs_info->super_copy; 1002 super = root->fs_info->super_copy;
1036 1003
1037 root_item = &root->fs_info->chunk_root->root_item; 1004 root_item = &root->fs_info->chunk_root->root_item;
1038 super->chunk_root = root_item->bytenr; 1005 super->chunk_root = root_item->bytenr;
@@ -1043,7 +1010,7 @@ static void update_super_roots(struct btrfs_root *root)
1043 super->root = root_item->bytenr; 1010 super->root = root_item->bytenr;
1044 super->generation = root_item->generation; 1011 super->generation = root_item->generation;
1045 super->root_level = root_item->level; 1012 super->root_level = root_item->level;
1046 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 1013 if (btrfs_test_opt(root, SPACE_CACHE))
1047 super->cache_generation = root_item->generation; 1014 super->cache_generation = root_item->generation;
1048} 1015}
1049 1016
@@ -1168,14 +1135,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1168 1135
1169 btrfs_run_ordered_operations(root, 0); 1136 btrfs_run_ordered_operations(root, 0);
1170 1137
1138 btrfs_trans_release_metadata(trans, root);
1139 trans->block_rsv = NULL;
1140
1171 /* make a pass through all the delayed refs we have so far 1141 /* make a pass through all the delayed refs we have so far
1172 * any runnings procs may add more while we are here 1142 * any runnings procs may add more while we are here
1173 */ 1143 */
1174 ret = btrfs_run_delayed_refs(trans, root, 0); 1144 ret = btrfs_run_delayed_refs(trans, root, 0);
1175 BUG_ON(ret); 1145 BUG_ON(ret);
1176 1146
1177 btrfs_trans_release_metadata(trans, root);
1178
1179 cur_trans = trans->transaction; 1147 cur_trans = trans->transaction;
1180 /* 1148 /*
1181 * set the flushing flag so procs in this transaction have to 1149 * set the flushing flag so procs in this transaction have to
@@ -1341,12 +1309,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1341 update_super_roots(root); 1309 update_super_roots(root);
1342 1310
1343 if (!root->fs_info->log_root_recovering) { 1311 if (!root->fs_info->log_root_recovering) {
1344 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1312 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
1345 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); 1313 btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1346 } 1314 }
1347 1315
1348 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1316 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1349 sizeof(root->fs_info->super_copy)); 1317 sizeof(*root->fs_info->super_copy));
1350 1318
1351 trans->transaction->blocked = 0; 1319 trans->transaction->blocked = 0;
1352 spin_lock(&root->fs_info->trans_lock); 1320 spin_lock(&root->fs_info->trans_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 0618aa39740b..3568374d419d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
276 struct walk_control *wc, u64 gen) 276 struct walk_control *wc, u64 gen)
277{ 277{
278 if (wc->pin) 278 if (wc->pin)
279 btrfs_pin_extent(log->fs_info->extent_root, 279 btrfs_pin_extent_for_log_replay(wc->trans,
280 eb->start, eb->len, 0); 280 log->fs_info->extent_root,
281 eb->start, eb->len);
281 282
282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (btrfs_buffer_uptodate(eb, gen)) {
283 if (wc->write) 284 if (wc->write)
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1760 1761
1761 WARN_ON(root_owner != 1762 WARN_ON(root_owner !=
1762 BTRFS_TREE_LOG_OBJECTID); 1763 BTRFS_TREE_LOG_OBJECTID);
1763 ret = btrfs_free_reserved_extent(root, 1764 ret = btrfs_free_and_pin_reserved_extent(root,
1764 bytenr, blocksize); 1765 bytenr, blocksize);
1765 BUG_ON(ret); 1766 BUG_ON(ret);
1766 } 1767 }
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1828 btrfs_tree_unlock(next); 1829 btrfs_tree_unlock(next);
1829 1830
1830 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1831 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1831 ret = btrfs_free_reserved_extent(root, 1832 ret = btrfs_free_and_pin_reserved_extent(root,
1832 path->nodes[*level]->start, 1833 path->nodes[*level]->start,
1833 path->nodes[*level]->len); 1834 path->nodes[*level]->len);
1834 BUG_ON(ret); 1835 BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1897 1898
1898 WARN_ON(log->root_key.objectid != 1899 WARN_ON(log->root_key.objectid !=
1899 BTRFS_TREE_LOG_OBJECTID); 1900 BTRFS_TREE_LOG_OBJECTID);
1900 ret = btrfs_free_reserved_extent(log, next->start, 1901 ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1901 next->len); 1902 next->len);
1902 BUG_ON(ret); 1903 BUG_ON(ret);
1903 } 1904 }
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2013 /* wait for previous tree log sync to complete */ 2014 /* wait for previous tree log sync to complete */
2014 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2015 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2015 wait_log_commit(trans, root, root->log_transid - 1); 2016 wait_log_commit(trans, root, root->log_transid - 1);
2016
2017 while (1) { 2017 while (1) {
2018 unsigned long batch = root->log_batch; 2018 unsigned long batch = root->log_batch;
2019 if (root->log_multiple_pids) { 2019 /* when we're on an ssd, just kick the log commit out */
2020 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2020 mutex_unlock(&root->log_mutex); 2021 mutex_unlock(&root->log_mutex);
2021 schedule_timeout_uninterruptible(1); 2022 schedule_timeout_uninterruptible(1);
2022 mutex_lock(&root->log_mutex); 2023 mutex_lock(&root->log_mutex);
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2117 BUG_ON(ret); 2118 BUG_ON(ret);
2118 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2119 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2119 2120
2120 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2121 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2121 log_root_tree->node->start); 2122 log_root_tree->node->start);
2122 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2123 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2123 btrfs_header_level(log_root_tree->node)); 2124 btrfs_header_level(log_root_tree->node));
2124 2125
2125 log_root_tree->log_batch = 0; 2126 log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..f8e2943101a1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,
366 } 366 }
367 INIT_LIST_HEAD(&device->dev_alloc_list); 367 INIT_LIST_HEAD(&device->dev_alloc_list);
368 368
369 /* init readahead state */
370 spin_lock_init(&device->reada_lock);
371 device->reada_curr_zone = NULL;
372 atomic_set(&device->reada_in_flight, 0);
373 device->reada_next = 0;
374 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
375 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
376
369 mutex_lock(&fs_devices->device_list_mutex); 377 mutex_lock(&fs_devices->device_list_mutex);
370 list_add_rcu(&device->dev_list, &fs_devices->devices); 378 list_add_rcu(&device->dev_list, &fs_devices->devices);
371 mutex_unlock(&fs_devices->device_list_mutex); 379 mutex_unlock(&fs_devices->device_list_mutex);
@@ -597,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
597 set_blocksize(bdev, 4096); 605 set_blocksize(bdev, 4096);
598 606
599 bh = btrfs_read_dev_super(bdev); 607 bh = btrfs_read_dev_super(bdev);
600 if (!bh) { 608 if (!bh)
601 ret = -EINVAL;
602 goto error_close; 609 goto error_close;
603 }
604 610
605 disk_super = (struct btrfs_super_block *)bh->b_data; 611 disk_super = (struct btrfs_super_block *)bh->b_data;
606 devid = btrfs_stack_device_id(&disk_super->dev_item); 612 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +661,7 @@ error:
655 continue; 661 continue;
656 } 662 }
657 if (fs_devices->open_devices == 0) { 663 if (fs_devices->open_devices == 0) {
658 ret = -EIO; 664 ret = -EINVAL;
659 goto out; 665 goto out;
660 } 666 }
661 fs_devices->seeding = seeding; 667 fs_devices->seeding = seeding;
@@ -1013,8 +1019,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1019 }
1014 BUG_ON(ret); 1020 BUG_ON(ret);
1015 1021
1016 if (device->bytes_used > 0) 1022 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1023 u64 len = btrfs_dev_extent_length(leaf, extent);
1024 device->bytes_used -= len;
1025 spin_lock(&root->fs_info->free_chunk_lock);
1026 root->fs_info->free_chunk_space += len;
1027 spin_unlock(&root->fs_info->free_chunk_lock);
1028 }
1018 ret = btrfs_del_item(trans, root, path); 1029 ret = btrfs_del_item(trans, root, path);
1019 1030
1020out: 1031out:
@@ -1356,6 +1367,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1367 if (ret)
1357 goto error_undo; 1368 goto error_undo;
1358 1369
1370 spin_lock(&root->fs_info->free_chunk_lock);
1371 root->fs_info->free_chunk_space = device->total_bytes -
1372 device->bytes_used;
1373 spin_unlock(&root->fs_info->free_chunk_lock);
1374
1359 device->in_fs_metadata = 0; 1375 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1376 btrfs_scrub_cancel_dev(root, device);
1361 1377
@@ -1387,8 +1403,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1387 call_rcu(&device->rcu, free_device); 1403 call_rcu(&device->rcu, free_device);
1388 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1404 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1389 1405
1390 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1406 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1391 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1407 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1392 1408
1393 if (cur_devices->open_devices == 0) { 1409 if (cur_devices->open_devices == 0) {
1394 struct btrfs_fs_devices *fs_devices; 1410 struct btrfs_fs_devices *fs_devices;
@@ -1450,7 +1466,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1450 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1466 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1451 struct btrfs_fs_devices *old_devices; 1467 struct btrfs_fs_devices *old_devices;
1452 struct btrfs_fs_devices *seed_devices; 1468 struct btrfs_fs_devices *seed_devices;
1453 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1469 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1454 struct btrfs_device *device; 1470 struct btrfs_device *device;
1455 u64 super_flags; 1471 u64 super_flags;
1456 1472
@@ -1691,15 +1707,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1707 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1708 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1709
1710 spin_lock(&root->fs_info->free_chunk_lock);
1711 root->fs_info->free_chunk_space += device->total_bytes;
1712 spin_unlock(&root->fs_info->free_chunk_lock);
1713
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1714 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1715 root->fs_info->fs_devices->rotating = 1;
1696 1716
1697 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1717 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1698 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1718 btrfs_set_super_total_bytes(root->fs_info->super_copy,
1699 total_bytes + device->total_bytes); 1719 total_bytes + device->total_bytes);
1700 1720
1701 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1721 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1702 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1722 btrfs_set_super_num_devices(root->fs_info->super_copy,
1703 total_bytes + 1); 1723 total_bytes + 1);
1704 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1724 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1705 1725
@@ -1790,7 +1810,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1790 struct btrfs_device *device, u64 new_size) 1810 struct btrfs_device *device, u64 new_size)
1791{ 1811{
1792 struct btrfs_super_block *super_copy = 1812 struct btrfs_super_block *super_copy =
1793 &device->dev_root->fs_info->super_copy; 1813 device->dev_root->fs_info->super_copy;
1794 u64 old_total = btrfs_super_total_bytes(super_copy); 1814 u64 old_total = btrfs_super_total_bytes(super_copy);
1795 u64 diff = new_size - device->total_bytes; 1815 u64 diff = new_size - device->total_bytes;
1796 1816
@@ -1849,7 +1869,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1869static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1850 chunk_offset) 1870 chunk_offset)
1851{ 1871{
1852 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1872 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1853 struct btrfs_disk_key *disk_key; 1873 struct btrfs_disk_key *disk_key;
1854 struct btrfs_chunk *chunk; 1874 struct btrfs_chunk *chunk;
1855 u8 *ptr; 1875 u8 *ptr;
@@ -2175,7 +2195,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2175 bool retried = false; 2195 bool retried = false;
2176 struct extent_buffer *l; 2196 struct extent_buffer *l;
2177 struct btrfs_key key; 2197 struct btrfs_key key;
2178 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2198 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2179 u64 old_total = btrfs_super_total_bytes(super_copy); 2199 u64 old_total = btrfs_super_total_bytes(super_copy);
2180 u64 old_size = device->total_bytes; 2200 u64 old_size = device->total_bytes;
2181 u64 diff = device->total_bytes - new_size; 2201 u64 diff = device->total_bytes - new_size;
@@ -2192,8 +2212,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2212 lock_chunks(root);
2193 2213
2194 device->total_bytes = new_size; 2214 device->total_bytes = new_size;
2195 if (device->writeable) 2215 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2216 device->fs_devices->total_rw_bytes -= diff;
2217 spin_lock(&root->fs_info->free_chunk_lock);
2218 root->fs_info->free_chunk_space -= diff;
2219 spin_unlock(&root->fs_info->free_chunk_lock);
2220 }
2197 unlock_chunks(root); 2221 unlock_chunks(root);
2198 2222
2199again: 2223again:
@@ -2257,6 +2281,9 @@ again:
2257 device->total_bytes = old_size; 2281 device->total_bytes = old_size;
2258 if (device->writeable) 2282 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2283 device->fs_devices->total_rw_bytes += diff;
2284 spin_lock(&root->fs_info->free_chunk_lock);
2285 root->fs_info->free_chunk_space += diff;
2286 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2287 unlock_chunks(root);
2261 goto done; 2288 goto done;
2262 } 2289 }
@@ -2292,7 +2319,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2292 struct btrfs_key *key, 2319 struct btrfs_key *key,
2293 struct btrfs_chunk *chunk, int item_size) 2320 struct btrfs_chunk *chunk, int item_size)
2294{ 2321{
2295 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2322 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2296 struct btrfs_disk_key disk_key; 2323 struct btrfs_disk_key disk_key;
2297 u32 array_size; 2324 u32 array_size;
2298 u8 *ptr; 2325 u8 *ptr;
@@ -2615,6 +2642,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2642 index++;
2616 } 2643 }
2617 2644
2645 spin_lock(&extent_root->fs_info->free_chunk_lock);
2646 extent_root->fs_info->free_chunk_space -= (stripe_size *
2647 map->num_stripes);
2648 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2649
2618 index = 0; 2650 index = 0;
2619 stripe = &chunk->stripe; 2651 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2652 while (index < map->num_stripes) {
@@ -2848,7 +2880,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2848 2880
2849static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2881static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2850 u64 logical, u64 *length, 2882 u64 logical, u64 *length,
2851 struct btrfs_multi_bio **multi_ret, 2883 struct btrfs_bio **bbio_ret,
2852 int mirror_num) 2884 int mirror_num)
2853{ 2885{
2854 struct extent_map *em; 2886 struct extent_map *em;
@@ -2866,18 +2898,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2866 int i; 2898 int i;
2867 int num_stripes; 2899 int num_stripes;
2868 int max_errors = 0; 2900 int max_errors = 0;
2869 struct btrfs_multi_bio *multi = NULL; 2901 struct btrfs_bio *bbio = NULL;
2870 2902
2871 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2903 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2872 stripes_allocated = 1; 2904 stripes_allocated = 1;
2873again: 2905again:
2874 if (multi_ret) { 2906 if (bbio_ret) {
2875 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2907 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2876 GFP_NOFS); 2908 GFP_NOFS);
2877 if (!multi) 2909 if (!bbio)
2878 return -ENOMEM; 2910 return -ENOMEM;
2879 2911
2880 atomic_set(&multi->error, 0); 2912 atomic_set(&bbio->error, 0);
2881 } 2913 }
2882 2914
2883 read_lock(&em_tree->lock); 2915 read_lock(&em_tree->lock);
@@ -2898,7 +2930,7 @@ again:
2898 if (mirror_num > map->num_stripes) 2930 if (mirror_num > map->num_stripes)
2899 mirror_num = 0; 2931 mirror_num = 0;
2900 2932
2901 /* if our multi bio struct is too small, back off and try again */ 2933 /* if our btrfs_bio struct is too small, back off and try again */
2902 if (rw & REQ_WRITE) { 2934 if (rw & REQ_WRITE) {
2903 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2935 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2904 BTRFS_BLOCK_GROUP_DUP)) { 2936 BTRFS_BLOCK_GROUP_DUP)) {
@@ -2917,11 +2949,11 @@ again:
2917 stripes_required = map->num_stripes; 2949 stripes_required = map->num_stripes;
2918 } 2950 }
2919 } 2951 }
2920 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2952 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2921 stripes_allocated < stripes_required) { 2953 stripes_allocated < stripes_required) {
2922 stripes_allocated = map->num_stripes; 2954 stripes_allocated = map->num_stripes;
2923 free_extent_map(em); 2955 free_extent_map(em);
2924 kfree(multi); 2956 kfree(bbio);
2925 goto again; 2957 goto again;
2926 } 2958 }
2927 stripe_nr = offset; 2959 stripe_nr = offset;
@@ -2950,7 +2982,7 @@ again:
2950 *length = em->len - offset; 2982 *length = em->len - offset;
2951 } 2983 }
2952 2984
2953 if (!multi_ret) 2985 if (!bbio_ret)
2954 goto out; 2986 goto out;
2955 2987
2956 num_stripes = 1; 2988 num_stripes = 1;
@@ -2975,13 +3007,17 @@ again:
2975 stripe_index = find_live_mirror(map, 0, 3007 stripe_index = find_live_mirror(map, 0,
2976 map->num_stripes, 3008 map->num_stripes,
2977 current->pid % map->num_stripes); 3009 current->pid % map->num_stripes);
3010 mirror_num = stripe_index + 1;
2978 } 3011 }
2979 3012
2980 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3013 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2981 if (rw & (REQ_WRITE | REQ_DISCARD)) 3014 if (rw & (REQ_WRITE | REQ_DISCARD)) {
2982 num_stripes = map->num_stripes; 3015 num_stripes = map->num_stripes;
2983 else if (mirror_num) 3016 } else if (mirror_num) {
2984 stripe_index = mirror_num - 1; 3017 stripe_index = mirror_num - 1;
3018 } else {
3019 mirror_num = 1;
3020 }
2985 3021
2986 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3022 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2987 int factor = map->num_stripes / map->sub_stripes; 3023 int factor = map->num_stripes / map->sub_stripes;
@@ -3001,6 +3037,7 @@ again:
3001 stripe_index = find_live_mirror(map, stripe_index, 3037 stripe_index = find_live_mirror(map, stripe_index,
3002 map->sub_stripes, stripe_index + 3038 map->sub_stripes, stripe_index +
3003 current->pid % map->sub_stripes); 3039 current->pid % map->sub_stripes);
3040 mirror_num = stripe_index + 1;
3004 } 3041 }
3005 } else { 3042 } else {
3006 /* 3043 /*
@@ -3009,15 +3046,16 @@ again:
3009 * stripe_index is the number of our device in the stripe array 3046 * stripe_index is the number of our device in the stripe array
3010 */ 3047 */
3011 stripe_index = do_div(stripe_nr, map->num_stripes); 3048 stripe_index = do_div(stripe_nr, map->num_stripes);
3049 mirror_num = stripe_index + 1;
3012 } 3050 }
3013 BUG_ON(stripe_index >= map->num_stripes); 3051 BUG_ON(stripe_index >= map->num_stripes);
3014 3052
3015 if (rw & REQ_DISCARD) { 3053 if (rw & REQ_DISCARD) {
3016 for (i = 0; i < num_stripes; i++) { 3054 for (i = 0; i < num_stripes; i++) {
3017 multi->stripes[i].physical = 3055 bbio->stripes[i].physical =
3018 map->stripes[stripe_index].physical + 3056 map->stripes[stripe_index].physical +
3019 stripe_offset + stripe_nr * map->stripe_len; 3057 stripe_offset + stripe_nr * map->stripe_len;
3020 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3058 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3021 3059
3022 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3060 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3023 u64 stripes; 3061 u64 stripes;
@@ -3038,16 +3076,16 @@ again:
3038 } 3076 }
3039 stripes = stripe_nr_end - 1 - j; 3077 stripes = stripe_nr_end - 1 - j;
3040 do_div(stripes, map->num_stripes); 3078 do_div(stripes, map->num_stripes);
3041 multi->stripes[i].length = map->stripe_len * 3079 bbio->stripes[i].length = map->stripe_len *
3042 (stripes - stripe_nr + 1); 3080 (stripes - stripe_nr + 1);
3043 3081
3044 if (i == 0) { 3082 if (i == 0) {
3045 multi->stripes[i].length -= 3083 bbio->stripes[i].length -=
3046 stripe_offset; 3084 stripe_offset;
3047 stripe_offset = 0; 3085 stripe_offset = 0;
3048 } 3086 }
3049 if (stripe_index == last_stripe) 3087 if (stripe_index == last_stripe)
3050 multi->stripes[i].length -= 3088 bbio->stripes[i].length -=
3051 stripe_end_offset; 3089 stripe_end_offset;
3052 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3090 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3053 u64 stripes; 3091 u64 stripes;
@@ -3072,11 +3110,11 @@ again:
3072 } 3110 }
3073 stripes = stripe_nr_end - 1 - j; 3111 stripes = stripe_nr_end - 1 - j;
3074 do_div(stripes, factor); 3112 do_div(stripes, factor);
3075 multi->stripes[i].length = map->stripe_len * 3113 bbio->stripes[i].length = map->stripe_len *
3076 (stripes - stripe_nr + 1); 3114 (stripes - stripe_nr + 1);
3077 3115
3078 if (i < map->sub_stripes) { 3116 if (i < map->sub_stripes) {
3079 multi->stripes[i].length -= 3117 bbio->stripes[i].length -=
3080 stripe_offset; 3118 stripe_offset;
3081 if (i == map->sub_stripes - 1) 3119 if (i == map->sub_stripes - 1)
3082 stripe_offset = 0; 3120 stripe_offset = 0;
@@ -3084,11 +3122,11 @@ again:
3084 if (stripe_index >= last_stripe && 3122 if (stripe_index >= last_stripe &&
3085 stripe_index <= (last_stripe + 3123 stripe_index <= (last_stripe +
3086 map->sub_stripes - 1)) { 3124 map->sub_stripes - 1)) {
3087 multi->stripes[i].length -= 3125 bbio->stripes[i].length -=
3088 stripe_end_offset; 3126 stripe_end_offset;
3089 } 3127 }
3090 } else 3128 } else
3091 multi->stripes[i].length = *length; 3129 bbio->stripes[i].length = *length;
3092 3130
3093 stripe_index++; 3131 stripe_index++;
3094 if (stripe_index == map->num_stripes) { 3132 if (stripe_index == map->num_stripes) {
@@ -3099,19 +3137,20 @@ again:
3099 } 3137 }
3100 } else { 3138 } else {
3101 for (i = 0; i < num_stripes; i++) { 3139 for (i = 0; i < num_stripes; i++) {
3102 multi->stripes[i].physical = 3140 bbio->stripes[i].physical =
3103 map->stripes[stripe_index].physical + 3141 map->stripes[stripe_index].physical +
3104 stripe_offset + 3142 stripe_offset +
3105 stripe_nr * map->stripe_len; 3143 stripe_nr * map->stripe_len;
3106 multi->stripes[i].dev = 3144 bbio->stripes[i].dev =
3107 map->stripes[stripe_index].dev; 3145 map->stripes[stripe_index].dev;
3108 stripe_index++; 3146 stripe_index++;
3109 } 3147 }
3110 } 3148 }
3111 if (multi_ret) { 3149 if (bbio_ret) {
3112 *multi_ret = multi; 3150 *bbio_ret = bbio;
3113 multi->num_stripes = num_stripes; 3151 bbio->num_stripes = num_stripes;
3114 multi->max_errors = max_errors; 3152 bbio->max_errors = max_errors;
3153 bbio->mirror_num = mirror_num;
3115 } 3154 }
3116out: 3155out:
3117 free_extent_map(em); 3156 free_extent_map(em);
@@ -3120,9 +3159,9 @@ out:
3120 3159
3121int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3160int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3122 u64 logical, u64 *length, 3161 u64 logical, u64 *length,
3123 struct btrfs_multi_bio **multi_ret, int mirror_num) 3162 struct btrfs_bio **bbio_ret, int mirror_num)
3124{ 3163{
3125 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3164 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3126 mirror_num); 3165 mirror_num);
3127} 3166}
3128 3167
@@ -3191,28 +3230,30 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3191 return 0; 3230 return 0;
3192} 3231}
3193 3232
3194static void end_bio_multi_stripe(struct bio *bio, int err) 3233static void btrfs_end_bio(struct bio *bio, int err)
3195{ 3234{
3196 struct btrfs_multi_bio *multi = bio->bi_private; 3235 struct btrfs_bio *bbio = bio->bi_private;
3197 int is_orig_bio = 0; 3236 int is_orig_bio = 0;
3198 3237
3199 if (err) 3238 if (err)
3200 atomic_inc(&multi->error); 3239 atomic_inc(&bbio->error);
3201 3240
3202 if (bio == multi->orig_bio) 3241 if (bio == bbio->orig_bio)
3203 is_orig_bio = 1; 3242 is_orig_bio = 1;
3204 3243
3205 if (atomic_dec_and_test(&multi->stripes_pending)) { 3244 if (atomic_dec_and_test(&bbio->stripes_pending)) {
3206 if (!is_orig_bio) { 3245 if (!is_orig_bio) {
3207 bio_put(bio); 3246 bio_put(bio);
3208 bio = multi->orig_bio; 3247 bio = bbio->orig_bio;
3209 } 3248 }
3210 bio->bi_private = multi->private; 3249 bio->bi_private = bbio->private;
3211 bio->bi_end_io = multi->end_io; 3250 bio->bi_end_io = bbio->end_io;
3251 bio->bi_bdev = (struct block_device *)
3252 (unsigned long)bbio->mirror_num;
3212 /* only send an error to the higher layers if it is 3253 /* only send an error to the higher layers if it is
3213 * beyond the tolerance of the multi-bio 3254 * beyond the tolerance of the multi-bio
3214 */ 3255 */
3215 if (atomic_read(&multi->error) > multi->max_errors) { 3256 if (atomic_read(&bbio->error) > bbio->max_errors) {
3216 err = -EIO; 3257 err = -EIO;
3217 } else if (err) { 3258 } else if (err) {
3218 /* 3259 /*
@@ -3222,7 +3263,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
3222 set_bit(BIO_UPTODATE, &bio->bi_flags); 3263 set_bit(BIO_UPTODATE, &bio->bi_flags);
3223 err = 0; 3264 err = 0;
3224 } 3265 }
3225 kfree(multi); 3266 kfree(bbio);
3226 3267
3227 bio_endio(bio, err); 3268 bio_endio(bio, err);
3228 } else if (!is_orig_bio) { 3269 } else if (!is_orig_bio) {
@@ -3302,20 +3343,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3302 u64 logical = (u64)bio->bi_sector << 9; 3343 u64 logical = (u64)bio->bi_sector << 9;
3303 u64 length = 0; 3344 u64 length = 0;
3304 u64 map_length; 3345 u64 map_length;
3305 struct btrfs_multi_bio *multi = NULL;
3306 int ret; 3346 int ret;
3307 int dev_nr = 0; 3347 int dev_nr = 0;
3308 int total_devs = 1; 3348 int total_devs = 1;
3349 struct btrfs_bio *bbio = NULL;
3309 3350
3310 length = bio->bi_size; 3351 length = bio->bi_size;
3311 map_tree = &root->fs_info->mapping_tree; 3352 map_tree = &root->fs_info->mapping_tree;
3312 map_length = length; 3353 map_length = length;
3313 3354
3314 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3355 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
3315 mirror_num); 3356 mirror_num);
3316 BUG_ON(ret); 3357 BUG_ON(ret);
3317 3358
3318 total_devs = multi->num_stripes; 3359 total_devs = bbio->num_stripes;
3319 if (map_length < length) { 3360 if (map_length < length) {
3320 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3361 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3321 "len %llu\n", (unsigned long long)logical, 3362 "len %llu\n", (unsigned long long)logical,
@@ -3323,25 +3364,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3323 (unsigned long long)map_length); 3364 (unsigned long long)map_length);
3324 BUG(); 3365 BUG();
3325 } 3366 }
3326 multi->end_io = first_bio->bi_end_io; 3367
3327 multi->private = first_bio->bi_private; 3368 bbio->orig_bio = first_bio;
3328 multi->orig_bio = first_bio; 3369 bbio->private = first_bio->bi_private;
3329 atomic_set(&multi->stripes_pending, multi->num_stripes); 3370 bbio->end_io = first_bio->bi_end_io;
3371 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
3330 3372
3331 while (dev_nr < total_devs) { 3373 while (dev_nr < total_devs) {
3332 if (total_devs > 1) { 3374 if (dev_nr < total_devs - 1) {
3333 if (dev_nr < total_devs - 1) { 3375 bio = bio_clone(first_bio, GFP_NOFS);
3334 bio = bio_clone(first_bio, GFP_NOFS); 3376 BUG_ON(!bio);
3335 BUG_ON(!bio); 3377 } else {
3336 } else { 3378 bio = first_bio;
3337 bio = first_bio;
3338 }
3339 bio->bi_private = multi;
3340 bio->bi_end_io = end_bio_multi_stripe;
3341 } 3379 }
3342 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3380 bio->bi_private = bbio;
3343 dev = multi->stripes[dev_nr].dev; 3381 bio->bi_end_io = btrfs_end_bio;
3382 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
3383 dev = bbio->stripes[dev_nr].dev;
3344 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3384 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3385 pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
3386 "(%s id %llu), size=%u\n", rw,
3387 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
3388 dev->name, dev->devid, bio->bi_size);
3345 bio->bi_bdev = dev->bdev; 3389 bio->bi_bdev = dev->bdev;
3346 if (async_submit) 3390 if (async_submit)
3347 schedule_bio(root, dev, rw, bio); 3391 schedule_bio(root, dev, rw, bio);
@@ -3354,8 +3398,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3354 } 3398 }
3355 dev_nr++; 3399 dev_nr++;
3356 } 3400 }
3357 if (total_devs == 1)
3358 kfree(multi);
3359 return 0; 3401 return 0;
3360} 3402}
3361 3403
@@ -3616,15 +3658,20 @@ static int read_one_dev(struct btrfs_root *root,
3616 fill_device_from_item(leaf, dev_item, device); 3658 fill_device_from_item(leaf, dev_item, device);
3617 device->dev_root = root->fs_info->dev_root; 3659 device->dev_root = root->fs_info->dev_root;
3618 device->in_fs_metadata = 1; 3660 device->in_fs_metadata = 1;
3619 if (device->writeable) 3661 if (device->writeable) {
3620 device->fs_devices->total_rw_bytes += device->total_bytes; 3662 device->fs_devices->total_rw_bytes += device->total_bytes;
3663 spin_lock(&root->fs_info->free_chunk_lock);
3664 root->fs_info->free_chunk_space += device->total_bytes -
3665 device->bytes_used;
3666 spin_unlock(&root->fs_info->free_chunk_lock);
3667 }
3621 ret = 0; 3668 ret = 0;
3622 return ret; 3669 return ret;
3623} 3670}
3624 3671
3625int btrfs_read_sys_array(struct btrfs_root *root) 3672int btrfs_read_sys_array(struct btrfs_root *root)
3626{ 3673{
3627 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3674 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3628 struct extent_buffer *sb; 3675 struct extent_buffer *sb;
3629 struct btrfs_disk_key *disk_key; 3676 struct btrfs_disk_key *disk_key;
3630 struct btrfs_chunk *chunk; 3677 struct btrfs_chunk *chunk;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db4e177..ab5b1c49f352 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,14 @@ struct btrfs_device {
92 struct btrfs_work work; 92 struct btrfs_work work;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 struct work_struct rcu_work; 94 struct work_struct rcu_work;
95
96 /* readahead state */
97 spinlock_t reada_lock;
98 atomic_t reada_in_flight;
99 u64 reada_next;
100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents;
95}; 103};
96 104
97struct btrfs_fs_devices { 105struct btrfs_fs_devices {
@@ -136,7 +144,10 @@ struct btrfs_bio_stripe {
136 u64 length; /* only used for discard mappings */ 144 u64 length; /* only used for discard mappings */
137}; 145};
138 146
139struct btrfs_multi_bio { 147struct btrfs_bio;
148typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
149
150struct btrfs_bio {
140 atomic_t stripes_pending; 151 atomic_t stripes_pending;
141 bio_end_io_t *end_io; 152 bio_end_io_t *end_io;
142 struct bio *orig_bio; 153 struct bio *orig_bio;
@@ -144,6 +155,7 @@ struct btrfs_multi_bio {
144 atomic_t error; 155 atomic_t error;
145 int max_errors; 156 int max_errors;
146 int num_stripes; 157 int num_stripes;
158 int mirror_num;
147 struct btrfs_bio_stripe stripes[]; 159 struct btrfs_bio_stripe stripes[];
148}; 160};
149 161
@@ -171,7 +183,7 @@ struct map_lookup {
171int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 183int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
172 u64 end, u64 *length); 184 u64 end, u64 *length);
173 185
174#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 186#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
175 (sizeof(struct btrfs_bio_stripe) * (n))) 187 (sizeof(struct btrfs_bio_stripe) * (n)))
176 188
177int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 189int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -180,7 +192,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
180 u64 chunk_offset, u64 start, u64 num_bytes); 192 u64 chunk_offset, u64 start, u64 num_bytes);
181int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 193int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
182 u64 logical, u64 *length, 194 u64 logical, u64 *length,
183 struct btrfs_multi_bio **multi_ret, int mirror_num); 195 struct btrfs_bio **bbio_ret, int mirror_num);
184int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 196int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
185 u64 chunk_start, u64 physical, u64 devid, 197 u64 chunk_start, u64 physical, u64 devid,
186 u64 **logical, int *naddrs, int *stripe_len); 198 u64 **logical, int *naddrs, int *stripe_len);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 426aa464f1af..3848b04e310e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
127again: 127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), 128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size); 129 name, name_len, value, size);
130 /*
131 * If we're setting an xattr to a new value but the new value is say
132 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
133 * back from split_leaf. This is because it thinks we'll be extending
134 * the existing item size, but we're asking for enough space to add the
135 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
136 * the rest of the function figure it out.
137 */
138 if (ret == -EOVERFLOW)
139 ret = -EEXIST;
140
130 if (ret == -EEXIST) { 141 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE) 142 if (flags & XATTR_CREATE)
132 goto out; 143 goto out;