aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Makefile3
-rw-r--r--fs/btrfs/acl.c17
-rw-r--r--fs/btrfs/backref.c776
-rw-r--r--fs/btrfs/backref.h62
-rw-r--r--fs/btrfs/btrfs_inode.h21
-rw-r--r--fs/btrfs/compression.c3
-rw-r--r--fs/btrfs/ctree.c27
-rw-r--r--fs/btrfs/ctree.h206
-rw-r--r--fs/btrfs/delayed-inode.c110
-rw-r--r--fs/btrfs/disk-io.c632
-rw-r--r--fs/btrfs/disk-io.h4
-rw-r--r--fs/btrfs/extent-tree.c1018
-rw-r--r--fs/btrfs/extent_io.c632
-rw-r--r--fs/btrfs/extent_io.h23
-rw-r--r--fs/btrfs/file-item.c17
-rw-r--r--fs/btrfs/file.c27
-rw-r--r--fs/btrfs/free-space-cache.c994
-rw-r--r--fs/btrfs/inode-map.c34
-rw-r--r--fs/btrfs/inode.c549
-rw-r--r--fs/btrfs/ioctl.c238
-rw-r--r--fs/btrfs/ioctl.h29
-rw-r--r--fs/btrfs/print-tree.c8
-rw-r--r--fs/btrfs/reada.c951
-rw-r--r--fs/btrfs/relocation.c26
-rw-r--r--fs/btrfs/scrub.c660
-rw-r--r--fs/btrfs/super.c315
-rw-r--r--fs/btrfs/transaction.c156
-rw-r--r--fs/btrfs/tree-log.c21
-rw-r--r--fs/btrfs/volumes.c212
-rw-r--r--fs/btrfs/volumes.h24
-rw-r--r--fs/btrfs/xattr.c11
31 files changed, 5966 insertions, 1840 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21f..c0ddfd29c5e5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ 7 extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o backref.o
11 12
12btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a11..89b156d85d63 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
59 if (!value) 59 if (!value)
60 return ERR_PTR(-ENOMEM); 60 return ERR_PTR(-ENOMEM);
61 size = __btrfs_getxattr(inode, name, value, size); 61 size = __btrfs_getxattr(inode, name, value, size);
62 if (size > 0) { 62 }
63 acl = posix_acl_from_xattr(value, size); 63 if (size > 0) {
64 if (IS_ERR(acl)) { 64 acl = posix_acl_from_xattr(value, size);
65 kfree(value);
66 return acl;
67 }
68 set_cached_acl(inode, type, acl);
69 }
70 kfree(value);
71 } else if (size == -ENOENT || size == -ENODATA || size == 0) { 65 } else if (size == -ENOENT || size == -ENODATA || size == 0) {
72 /* FIXME, who returns -ENOENT? I think nobody */ 66 /* FIXME, who returns -ENOENT? I think nobody */
73 acl = NULL; 67 acl = NULL;
74 set_cached_acl(inode, type, acl);
75 } else { 68 } else {
76 acl = ERR_PTR(-EIO); 69 acl = ERR_PTR(-EIO);
77 } 70 }
71 kfree(value);
72
73 if (!IS_ERR(acl))
74 set_cached_acl(inode, type, acl);
78 75
79 return acl; 76 return acl;
80} 77}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 000000000000..22c64fff1bd5
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,776 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "backref.h"
22
/*
 * one queued data reference. entries are created by __data_list_add and
 * consumed later by the loops that call the iterate() callback with
 * (inum, extent_offset + extent_data_item_offset, root).
 */
23struct __data_ref {
24 struct list_head list;
25 u64 inum;
26 u64 root;
27 u64 extent_data_item_offset;
28};
29
/*
 * one queued shared data reference. disk_byte is filled by __shared_list_add
 * and later passed to __iter_shared_inline_ref as the logical address to read.
 */
30struct __shared_ref {
31 struct list_head list;
32 u64 disk_byte;
33};
34
35static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path,
37 struct btrfs_key *found_key)
38{
39 int ret;
40 struct btrfs_key key;
41 struct extent_buffer *eb;
42
43 key.type = key_type;
44 key.objectid = inum;
45 key.offset = ioff;
46
47 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
48 if (ret < 0)
49 return ret;
50
51 eb = path->nodes[0];
52 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
53 ret = btrfs_next_leaf(fs_root, path);
54 if (ret)
55 return ret;
56 eb = path->nodes[0];
57 }
58
59 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
60 if (found_key->type != key.type || found_key->objectid != key.objectid)
61 return 1;
62
63 return 0;
64}
65
66/*
67 * this makes the path point to (inum INODE_ITEM ioff)
68 */
69int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
70 struct btrfs_path *path)
71{
72 struct btrfs_key key;
73 return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
74 &key);
75}
76
77static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
78 struct btrfs_path *path,
79 struct btrfs_key *found_key)
80{
81 return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
82 found_key);
83}
84
85/*
86 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
87 * of the path are separated by '/' and the path is guaranteed to be
88 * 0-terminated. the path is only given within the current file system.
89 * Therefore, it never starts with a '/'. the caller is responsible to provide
90 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
91 * the start point of the resulting string is returned. this pointer is within
92 * dest, normally.
93 * in case the path buffer would overflow, the pointer is decremented further
94 * as if output was written to the buffer, though no more output is actually
95 * generated. that way, the caller can determine how much space would be
96 * required for the path to fit into the buffer. in that case, the returned
97 * value will be smaller than dest. callers must check this!
98 */
99static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
100 struct btrfs_inode_ref *iref,
101 struct extent_buffer *eb_in, u64 parent,
102 char *dest, u32 size)
103{
104 u32 len;
105 int slot;
106 u64 next_inum;
107 int ret;
108 s64 bytes_left = size - 1;
109 struct extent_buffer *eb = eb_in;
110 struct btrfs_key found_key;
111
112 if (bytes_left >= 0)
113 dest[bytes_left] = '\0';
114
115 while (1) {
116 len = btrfs_inode_ref_name_len(eb, iref);
117 bytes_left -= len;
118 if (bytes_left >= 0)
119 read_extent_buffer(eb, dest + bytes_left,
120 (unsigned long)(iref + 1), len);
121 if (eb != eb_in)
122 free_extent_buffer(eb);
123 ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
124 if (ret)
125 break;
126 next_inum = found_key.offset;
127
128 /* regular exit ahead */
129 if (parent == next_inum)
130 break;
131
132 slot = path->slots[0];
133 eb = path->nodes[0];
134 /* make sure we can use eb after releasing the path */
135 if (eb != eb_in)
136 atomic_inc(&eb->refs);
137 btrfs_release_path(path);
138
139 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
140 parent = next_inum;
141 --bytes_left;
142 if (bytes_left >= 0)
143 dest[bytes_left] = '/';
144 }
145
146 btrfs_release_path(path);
147
148 if (ret)
149 return ERR_PTR(ret);
150
151 return dest + bytes_left;
152}
153
154/*
155 * this makes the path point to (logical EXTENT_ITEM *)
156 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
157 * tree blocks and <0 on error.
158 */
159int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
160 struct btrfs_path *path, struct btrfs_key *found_key)
161{
162 int ret;
163 u64 flags;
164 u32 item_size;
165 struct extent_buffer *eb;
166 struct btrfs_extent_item *ei;
167 struct btrfs_key key;
168
169 key.type = BTRFS_EXTENT_ITEM_KEY;
170 key.objectid = logical;
171 key.offset = (u64)-1;
172
173 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
174 if (ret < 0)
175 return ret;
176 ret = btrfs_previous_item(fs_info->extent_root, path,
177 0, BTRFS_EXTENT_ITEM_KEY);
178 if (ret < 0)
179 return ret;
180
181 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
182 if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
183 found_key->objectid > logical ||
184 found_key->objectid + found_key->offset <= logical)
185 return -ENOENT;
186
187 eb = path->nodes[0];
188 item_size = btrfs_item_size_nr(eb, path->slots[0]);
189 BUG_ON(item_size < sizeof(*ei));
190
191 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
192 flags = btrfs_extent_flags(eb, ei);
193
194 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
195 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
196 if (flags & BTRFS_EXTENT_FLAG_DATA)
197 return BTRFS_EXTENT_FLAG_DATA;
198
199 return -EIO;
200}
201
202/*
203 * helper function to iterate extent inline refs. ptr must point to a 0 value
204 * for the first call and may be modified. it is used to track state.
205 * if more refs exist, 0 is returned and the next call to
206 * __get_extent_inline_ref must pass the modified ptr parameter to get the
207 * next ref. after the last ref was processed, 1 is returned.
208 * returns <0 on error
209 */
210static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
211 struct btrfs_extent_item *ei, u32 item_size,
212 struct btrfs_extent_inline_ref **out_eiref,
213 int *out_type)
214{
215 unsigned long end;
216 u64 flags;
217 struct btrfs_tree_block_info *info;
218
219 if (!*ptr) {
220 /* first call */
221 flags = btrfs_extent_flags(eb, ei);
222 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
223 info = (struct btrfs_tree_block_info *)(ei + 1);
224 *out_eiref =
225 (struct btrfs_extent_inline_ref *)(info + 1);
226 } else {
227 *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
228 }
229 *ptr = (unsigned long)*out_eiref;
230 if ((void *)*ptr >= (void *)ei + item_size)
231 return -ENOENT;
232 }
233
234 end = (unsigned long)ei + item_size;
235 *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
236 *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
237
238 *ptr += btrfs_extent_inline_ref_size(*out_type);
239 WARN_ON(*ptr > end);
240 if (*ptr == end)
241 return 1; /* last */
242
243 return 0;
244}
245
246/*
247 * reads the tree block backref for an extent. tree level and root are returned
248 * through out_level and out_root. ptr must point to a 0 value for the first
249 * call and may be modified (see __get_extent_inline_ref comment).
250 * returns 0 if data was provided, 1 if there was no more data to provide or
251 * <0 on error.
252 */
253int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
254 struct btrfs_extent_item *ei, u32 item_size,
255 u64 *out_root, u8 *out_level)
256{
257 int ret;
258 int type;
259 struct btrfs_tree_block_info *info;
260 struct btrfs_extent_inline_ref *eiref;
261
262 if (*ptr == (unsigned long)-1)
263 return 1;
264
265 while (1) {
266 ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
267 &eiref, &type);
268 if (ret < 0)
269 return ret;
270
271 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
272 type == BTRFS_SHARED_BLOCK_REF_KEY)
273 break;
274
275 if (ret == 1)
276 return 1;
277 }
278
279 /* we can treat both ref types equally here */
280 info = (struct btrfs_tree_block_info *)(ei + 1);
281 *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
282 *out_level = btrfs_tree_block_level(eb, info);
283
284 if (ret == 1)
285 *ptr = (unsigned long)-1;
286
287 return 0;
288}
289
290static int __data_list_add(struct list_head *head, u64 inum,
291 u64 extent_data_item_offset, u64 root)
292{
293 struct __data_ref *ref;
294
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{
413 u64 disk_byte;
414 struct btrfs_key key;
415 struct btrfs_file_extent_item *fi;
416 struct extent_buffer *eb;
417 int slot;
418 int nritems;
419 int ret;
420 int found = 0;
421
422 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0);
424 if (!eb)
425 return -EIO;
426
427 /*
428 * from the shared data ref, we only have the leaf but we need
429 * the key. thus, we must look into all items and see that we
430 * find one (some) with a reference to our extent item.
431 */
432 nritems = btrfs_header_nritems(eb);
433 for (slot = 0; slot < nritems; ++slot) {
434 btrfs_item_key_to_cpu(eb, &key, slot);
435 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) {
439 free_extent_buffer(eb);
440 return -EIO;
441 }
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) {
444 if (found)
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459
460 if (!found) {
461 printk(KERN_ERR "btrfs: failed to follow shared data backref "
462 "to parent %llu\n", logical);
463 WARN_ON(1);
464 ret = -EIO;
465 }
466
467 free_extent_buffer(eb);
468 return ret;
469}
470
471/*
472 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops.
476 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path,
479 u64 extent_item_objectid,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx)
482{
483 unsigned long ptr = 0;
484 int last;
485 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d;
497 struct __shared_ref *ref_s;
498
499 eb = path->nodes[0];
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
501 item_size = btrfs_item_size_nr(eb, path->slots[0]);
502
503 /* first we iterate the inline refs, ... */
504 do {
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
506 &eiref, &type);
507 if (last == -ENOENT) {
508 ret = 0;
509 break;
510 }
511 if (last < 0) {
512 ret = last;
513 break;
514 }
515
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524
525 /* ... then we proceed to in-tree references and ... */
526 while (!ret) {
527 ++path->slots[0];
528 if (path->slots[0] > btrfs_header_nritems(eb)) {
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
541 dref = btrfs_item_ptr(eb, path->slots[0],
542 struct btrfs_extent_data_ref);
543 ret = __data_list_add_eb(&data_refs, eb, dref);
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
545 ret = __shared_list_add(&shared_refs, key.offset);
546 }
547 }
548
549 btrfs_release_path(path);
550
551 /*
552 * ... only at the very end we can process the refs we found. this is
553 * because the iterator function we call is allowed to make tree lookups
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret;
582}
583
584int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
585 struct btrfs_path *path,
586 iterate_extent_inodes_t *iterate, void *ctx)
587{
588 int ret;
589 u64 offset;
590 struct btrfs_key found_key;
591
592 ret = extent_from_logical(fs_info, logical, path,
593 &found_key);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL;
596 if (ret < 0)
597 return ret;
598
599 offset = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx);
602
603 return ret;
604}
605
606static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
607 struct btrfs_path *path,
608 iterate_irefs_t *iterate, void *ctx)
609{
610 int ret;
611 int slot;
612 u32 cur;
613 u32 len;
614 u32 name_len;
615 u64 parent = 0;
616 int found = 0;
617 struct extent_buffer *eb;
618 struct btrfs_item *item;
619 struct btrfs_inode_ref *iref;
620 struct btrfs_key found_key;
621
622 while (1) {
623 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
624 &found_key);
625 if (ret < 0)
626 break;
627 if (ret) {
628 ret = found ? 0 : -ENOENT;
629 break;
630 }
631 ++found;
632
633 parent = found_key.offset;
634 slot = path->slots[0];
635 eb = path->nodes[0];
636 /* make sure we can use eb after releasing the path */
637 atomic_inc(&eb->refs);
638 btrfs_release_path(path);
639
640 item = btrfs_item_nr(eb, slot);
641 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
642
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */
646 ret = iterate(parent, iref, eb, ctx);
647 if (ret) {
648 free_extent_buffer(eb);
649 break;
650 }
651 len = sizeof(*iref) + name_len;
652 iref = (struct btrfs_inode_ref *)((char *)iref + len);
653 }
654 free_extent_buffer(eb);
655 }
656
657 btrfs_release_path(path);
658
659 return ret;
660}
661
662/*
663 * returns 0 if the path could be dumped (probably truncated)
664 * returns <0 in case of an error
665 */
666static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
667 struct extent_buffer *eb, void *ctx)
668{
669 struct inode_fs_paths *ipath = ctx;
670 char *fspath;
671 char *fspath_min;
672 int i = ipath->fspath->elem_cnt;
673 const int s_ptr = sizeof(char *);
674 u32 bytes_left;
675
676 bytes_left = ipath->fspath->bytes_left > s_ptr ?
677 ipath->fspath->bytes_left - s_ptr : 0;
678
679 fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
680 fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
681 inum, fspath_min, bytes_left);
682 if (IS_ERR(fspath))
683 return PTR_ERR(fspath);
684
685 if (fspath > fspath_min) {
686 ipath->fspath->val[i] = (u64)(unsigned long)fspath;
687 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else {
690 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0;
693 }
694
695 return 0;
696}
697
698/*
699 * this dumps all file system paths to the inode into the ipath struct, provided
700 * is has been created large enough. each path is zero-terminated and accessed
701 * from ipath->fspath->val[i].
702 * when it returns, there are ipath->fspath->elem_cnt number of paths available
703 * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
704 * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
705 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
706 * have been needed to return all paths.
707 */
708int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
709{
710 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
711 inode_to_path, ipath);
712}
713
714/*
715 * allocates space to return multiple file system paths for an inode.
716 * total_bytes to allocate are passed, note that space usable for actual path
717 * information will be total_bytes - sizeof(struct inode_fs_paths).
718 * the returned pointer must be freed with free_ipath() in the end.
719 */
720struct btrfs_data_container *init_data_container(u32 total_bytes)
721{
722 struct btrfs_data_container *data;
723 size_t alloc_bytes;
724
725 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
726 data = kmalloc(alloc_bytes, GFP_NOFS);
727 if (!data)
728 return ERR_PTR(-ENOMEM);
729
730 if (total_bytes >= sizeof(*data)) {
731 data->bytes_left = total_bytes - sizeof(*data);
732 data->bytes_missing = 0;
733 } else {
734 data->bytes_missing = sizeof(*data) - total_bytes;
735 data->bytes_left = 0;
736 }
737
738 data->elem_cnt = 0;
739 data->elem_missed = 0;
740
741 return data;
742}
743
744/*
745 * allocates space to return multiple file system paths for an inode.
746 * total_bytes to allocate are passed, note that space usable for actual path
747 * information will be total_bytes - sizeof(struct inode_fs_paths).
748 * the returned pointer must be freed with free_ipath() in the end.
749 */
750struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
751 struct btrfs_path *path)
752{
753 struct inode_fs_paths *ifp;
754 struct btrfs_data_container *fspath;
755
756 fspath = init_data_container(total_bytes);
757 if (IS_ERR(fspath))
758 return (void *)fspath;
759
760 ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
761 if (!ifp) {
762 kfree(fspath);
763 return ERR_PTR(-ENOMEM);
764 }
765
766 ifp->btrfs_path = path;
767 ifp->fspath = fspath;
768 ifp->fs_root = fs_root;
769
770 return ifp;
771}
772
773void free_ipath(struct inode_fs_paths *ipath)
774{
775 kfree(ipath);
776}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 000000000000..92618837cb8f
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#ifndef __BTRFS_BACKREF__
20#define __BTRFS_BACKREF__
21
22#include "ioctl.h"
23
/*
 * context for resolving an inode to its paths, set up by init_ipath().
 * btrfs_path and fs_root are the values passed to init_ipath(); fspath is the
 * data container the resulting paths are stored in.
 */
24struct inode_fs_paths {
25 struct btrfs_path *btrfs_path;
26 struct btrfs_root *fs_root;
27 struct btrfs_data_container *fspath;
28};
29
30typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
31 void *ctx);
32typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
33 struct extent_buffer *eb, void *ctx);
34
35int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36 struct btrfs_path *path);
37
38int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39 struct btrfs_path *path, struct btrfs_key *found_key);
40
41int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
42 struct btrfs_extent_item *ei, u32 item_size,
43 u64 *out_root, u8 *out_level);
44
45int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
46 struct btrfs_path *path,
47 u64 extent_item_objectid,
48 u64 extent_offset,
49 iterate_extent_inodes_t *iterate, void *ctx);
50
51int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
52 struct btrfs_path *path,
53 iterate_extent_inodes_t *iterate, void *ctx);
54
55int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56
57struct btrfs_data_container *init_data_container(u32 total_bytes);
58struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
59 struct btrfs_path *path);
60void free_ipath(struct inode_fs_paths *ipath);
61
62#endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd6..634608d2a6d0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
103 */ 103 */
104 u64 delalloc_bytes; 104 u64 delalloc_bytes;
105 105
106 /* total number of bytes that may be used for this inode for
107 * delalloc
108 */
109 u64 reserved_bytes;
110
111 /* 106 /*
112 * the size of the file stored in the metadata on disk. data=ordered 107 * the size of the file stored in the metadata on disk. data=ordered
113 * means the in-memory i_size might be larger than the size on disk 108 * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
115 */ 110 */
116 u64 disk_i_size; 111 u64 disk_i_size;
117 112
118 /* flags field from the on disk inode */
119 u32 flags;
120
121 /* 113 /*
122 * if this is a directory then index_cnt is the counter for the index 114 * if this is a directory then index_cnt is the counter for the index
123 * number for new files that are created 115 * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
132 u64 last_unlink_trans; 124 u64 last_unlink_trans;
133 125
134 /* 126 /*
127 * Number of bytes outstanding that are going to need csums. This is
128 * used in ENOSPC accounting.
129 */
130 u64 csum_bytes;
131
132 /* flags field from the on disk inode */
133 u32 flags;
134
135 /*
135 * Counters to keep track of the number of extent item's we may use due 136 * Counters to keep track of the number of extent item's we may use due
136 * to delalloc and such. outstanding_extents is the number of extent 137 * to delalloc and such. outstanding_extents is the number of extent
137 * items we think we'll end up using, and reserved_extents is the number 138 * items we think we'll end up using, and reserved_extents is the number
@@ -146,14 +147,12 @@ struct btrfs_inode {
146 * the btrfs file release call will add this inode to the 147 * the btrfs file release call will add this inode to the
147 * ordered operations list so that we make sure to flush out any 148 * ordered operations list so that we make sure to flush out any
148 * new data the application may have written before commit. 149 * new data the application may have written before commit.
149 *
150 * yes, its silly to have a single bitflag, but we might grow more
151 * of these.
152 */ 150 */
153 unsigned ordered_data_close:1; 151 unsigned ordered_data_close:1;
154 unsigned orphan_meta_reserved:1; 152 unsigned orphan_meta_reserved:1;
155 unsigned dummy_inode:1; 153 unsigned dummy_inode:1;
156 unsigned in_defrag:1; 154 unsigned in_defrag:1;
155 unsigned delalloc_meta_reserved:1;
157 156
158 /* 157 /*
159 * always compress this one file 158 * always compress this one file
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f1734..14f1c5a0b2d2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
85static inline int compressed_bio_size(struct btrfs_root *root, 85static inline int compressed_bio_size(struct btrfs_root *root,
86 unsigned long disk_size) 86 unsigned long disk_size)
87{ 87{
88 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 88 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
89
89 return sizeof(struct compressed_bio) + 90 return sizeof(struct compressed_bio) +
90 ((disk_size + root->sectorsize - 1) / root->sectorsize) * 91 ((disk_size + root->sectorsize - 1) / root->sectorsize) *
91 csum_size; 92 csum_size;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8d..dede441bdeee 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -514,10 +514,25 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans,
514 struct btrfs_root *root, 514 struct btrfs_root *root,
515 struct extent_buffer *buf) 515 struct extent_buffer *buf)
516{ 516{
517 /* ensure we can see the force_cow */
518 smp_rmb();
519
520 /*
521 * We do not need to cow a block if
522 * 1) this block is not created or changed in this transaction;
523 * 2) this block does not belong to TREE_RELOC tree;
524 * 3) the root is not forced COW.
525 *
526 * What is forced COW:
527 * when we create snapshot during commiting the transaction,
528 * after we've finished coping src root, we must COW the shared
529 * block to ensure the metadata consistency.
530 */
517 if (btrfs_header_generation(buf) == trans->transid && 531 if (btrfs_header_generation(buf) == trans->transid &&
518 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) && 532 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
519 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && 533 !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
520 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) 534 btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
535 !root->force_cow)
521 return 0; 536 return 0;
522 return 1; 537 return 1;
523} 538}
@@ -902,9 +917,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
902 917
903 orig_ptr = btrfs_node_blockptr(mid, orig_slot); 918 orig_ptr = btrfs_node_blockptr(mid, orig_slot);
904 919
905 if (level < BTRFS_MAX_LEVEL - 1) 920 if (level < BTRFS_MAX_LEVEL - 1) {
906 parent = path->nodes[level + 1]; 921 parent = path->nodes[level + 1];
907 pslot = path->slots[level + 1]; 922 pslot = path->slots[level + 1];
923 }
908 924
909 /* 925 /*
910 * deal with the case where there is only one pointer in the root 926 * deal with the case where there is only one pointer in the root
@@ -1107,9 +1123,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1107 mid = path->nodes[level]; 1123 mid = path->nodes[level];
1108 WARN_ON(btrfs_header_generation(mid) != trans->transid); 1124 WARN_ON(btrfs_header_generation(mid) != trans->transid);
1109 1125
1110 if (level < BTRFS_MAX_LEVEL - 1) 1126 if (level < BTRFS_MAX_LEVEL - 1) {
1111 parent = path->nodes[level + 1]; 1127 parent = path->nodes[level + 1];
1112 pslot = path->slots[level + 1]; 1128 pslot = path->slots[level + 1];
1129 }
1113 1130
1114 if (!parent) 1131 if (!parent)
1115 return 1; 1132 return 1;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..50634abef9b4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
30#include <linux/kobject.h> 30#include <linux/kobject.h>
31#include <trace/events/btrfs.h> 31#include <trace/events/btrfs.h>
32#include <asm/kmap_types.h> 32#include <asm/kmap_types.h>
33#include <linux/pagemap.h>
33#include "extent_io.h" 34#include "extent_io.h"
34#include "extent_map.h" 35#include "extent_map.h"
35#include "async-thread.h" 36#include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
360#define BTRFS_LABEL_SIZE 256 361#define BTRFS_LABEL_SIZE 256
361 362
362/* 363/*
364 * just in case we somehow lose the roots and are not able to mount,
365 * we store an array of the roots from previous transactions
366 * in the super.
367 */
368#define BTRFS_NUM_BACKUP_ROOTS 4
369struct btrfs_root_backup {
370 __le64 tree_root;
371 __le64 tree_root_gen;
372
373 __le64 chunk_root;
374 __le64 chunk_root_gen;
375
376 __le64 extent_root;
377 __le64 extent_root_gen;
378
379 __le64 fs_root;
380 __le64 fs_root_gen;
381
382 __le64 dev_root;
383 __le64 dev_root_gen;
384
385 __le64 csum_root;
386 __le64 csum_root_gen;
387
388 __le64 total_bytes;
389 __le64 bytes_used;
390 __le64 num_devices;
391 /* future */
392 __le64 unsed_64[4];
393
394 u8 tree_root_level;
395 u8 chunk_root_level;
396 u8 extent_root_level;
397 u8 fs_root_level;
398 u8 dev_root_level;
399 u8 csum_root_level;
400 /* future and to align */
401 u8 unused_8[10];
402} __attribute__ ((__packed__));
403
404/*
363 * the super block basically lists the main trees of the FS 405 * the super block basically lists the main trees of the FS
364 * it currently lacks any block count etc etc 406 * it currently lacks any block count etc etc
365 */ 407 */
@@ -405,6 +447,7 @@ struct btrfs_super_block {
405 /* future expansion */ 447 /* future expansion */
406 __le64 reserved[31]; 448 __le64 reserved[31];
407 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; 449 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
450 struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
408} __attribute__ ((__packed__)); 451} __attribute__ ((__packed__));
409 452
410/* 453/*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
772struct btrfs_block_rsv { 815struct btrfs_block_rsv {
773 u64 size; 816 u64 size;
774 u64 reserved; 817 u64 reserved;
775 u64 freed[2];
776 struct btrfs_space_info *space_info; 818 struct btrfs_space_info *space_info;
777 struct list_head list;
778 spinlock_t lock; 819 spinlock_t lock;
779 atomic_t usage;
780 unsigned int priority:8;
781 unsigned int durable:1;
782 unsigned int refill_used:1;
783 unsigned int full:1; 820 unsigned int full:1;
784}; 821};
785 822
@@ -811,7 +848,8 @@ struct btrfs_free_cluster {
811enum btrfs_caching_type { 848enum btrfs_caching_type {
812 BTRFS_CACHE_NO = 0, 849 BTRFS_CACHE_NO = 0,
813 BTRFS_CACHE_STARTED = 1, 850 BTRFS_CACHE_STARTED = 1,
814 BTRFS_CACHE_FINISHED = 2, 851 BTRFS_CACHE_FAST = 2,
852 BTRFS_CACHE_FINISHED = 3,
815}; 853};
816 854
817enum btrfs_disk_cache_state { 855enum btrfs_disk_cache_state {
@@ -840,10 +878,10 @@ struct btrfs_block_group_cache {
840 spinlock_t lock; 878 spinlock_t lock;
841 u64 pinned; 879 u64 pinned;
842 u64 reserved; 880 u64 reserved;
843 u64 reserved_pinned;
844 u64 bytes_super; 881 u64 bytes_super;
845 u64 flags; 882 u64 flags;
846 u64 sectorsize; 883 u64 sectorsize;
884 u64 cache_generation;
847 unsigned int ro:1; 885 unsigned int ro:1;
848 unsigned int dirty:1; 886 unsigned int dirty:1;
849 unsigned int iref:1; 887 unsigned int iref:1;
@@ -899,6 +937,10 @@ struct btrfs_fs_info {
899 spinlock_t block_group_cache_lock; 937 spinlock_t block_group_cache_lock;
900 struct rb_root block_group_cache_tree; 938 struct rb_root block_group_cache_tree;
901 939
940 /* keep track of unallocated space */
941 spinlock_t free_chunk_lock;
942 u64 free_chunk_space;
943
902 struct extent_io_tree freed_extents[2]; 944 struct extent_io_tree freed_extents[2];
903 struct extent_io_tree *pinned_extents; 945 struct extent_io_tree *pinned_extents;
904 946
@@ -916,14 +958,11 @@ struct btrfs_fs_info {
916 struct btrfs_block_rsv trans_block_rsv; 958 struct btrfs_block_rsv trans_block_rsv;
917 /* block reservation for chunk tree */ 959 /* block reservation for chunk tree */
918 struct btrfs_block_rsv chunk_block_rsv; 960 struct btrfs_block_rsv chunk_block_rsv;
961 /* block reservation for delayed operations */
962 struct btrfs_block_rsv delayed_block_rsv;
919 963
920 struct btrfs_block_rsv empty_block_rsv; 964 struct btrfs_block_rsv empty_block_rsv;
921 965
922 /* list of block reservations that cross multiple transactions */
923 struct list_head durable_block_rsv_list;
924
925 struct mutex durable_block_rsv_mutex;
926
927 u64 generation; 966 u64 generation;
928 u64 last_trans_committed; 967 u64 last_trans_committed;
929 968
@@ -942,8 +981,8 @@ struct btrfs_fs_info {
942 wait_queue_head_t transaction_blocked_wait; 981 wait_queue_head_t transaction_blocked_wait;
943 wait_queue_head_t async_submit_wait; 982 wait_queue_head_t async_submit_wait;
944 983
945 struct btrfs_super_block super_copy; 984 struct btrfs_super_block *super_copy;
946 struct btrfs_super_block super_for_commit; 985 struct btrfs_super_block *super_for_commit;
947 struct block_device *__bdev; 986 struct block_device *__bdev;
948 struct super_block *sb; 987 struct super_block *sb;
949 struct inode *btree_inode; 988 struct inode *btree_inode;
@@ -1036,6 +1075,7 @@ struct btrfs_fs_info {
1036 struct btrfs_workers endio_freespace_worker; 1075 struct btrfs_workers endio_freespace_worker;
1037 struct btrfs_workers submit_workers; 1076 struct btrfs_workers submit_workers;
1038 struct btrfs_workers caching_workers; 1077 struct btrfs_workers caching_workers;
1078 struct btrfs_workers readahead_workers;
1039 1079
1040 /* 1080 /*
1041 * fixup workers take dirty pages that didn't properly go through 1081 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1159,13 @@ struct btrfs_fs_info {
1119 u64 fs_state; 1159 u64 fs_state;
1120 1160
1121 struct btrfs_delayed_root *delayed_root; 1161 struct btrfs_delayed_root *delayed_root;
1162
1163 /* readahead tree */
1164 spinlock_t reada_lock;
1165 struct radix_tree_root reada_tree;
1166
1167 /* next backup root to be overwritten */
1168 int backup_root_index;
1122}; 1169};
1123 1170
1124/* 1171/*
@@ -1225,6 +1272,8 @@ struct btrfs_root {
1225 * for stat. It may be used for more later 1272 * for stat. It may be used for more later
1226 */ 1273 */
1227 dev_t anon_dev; 1274 dev_t anon_dev;
1275
1276 int force_cow;
1228}; 1277};
1229 1278
1230struct btrfs_ioctl_defrag_range_args { 1279struct btrfs_ioctl_defrag_range_args {
@@ -1363,6 +1412,7 @@ struct btrfs_ioctl_defrag_range_args {
1363#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15) 1412#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1364#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16) 1413#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1365#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17) 1414#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1415#define BTRFS_MOUNT_RECOVERY (1 << 18)
1366 1416
1367#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1417#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1368#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1418#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
@@ -1978,6 +2028,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
1978 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY; 2028 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
1979} 2029}
1980 2030
2031/* struct btrfs_root_backup */
2032BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
2033 tree_root, 64);
2034BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
2035 tree_root_gen, 64);
2036BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
2037 tree_root_level, 8);
2038
2039BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
2040 chunk_root, 64);
2041BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
2042 chunk_root_gen, 64);
2043BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
2044 chunk_root_level, 8);
2045
2046BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
2047 extent_root, 64);
2048BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
2049 extent_root_gen, 64);
2050BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
2051 extent_root_level, 8);
2052
2053BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
2054 fs_root, 64);
2055BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
2056 fs_root_gen, 64);
2057BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
2058 fs_root_level, 8);
2059
2060BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
2061 dev_root, 64);
2062BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
2063 dev_root_gen, 64);
2064BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
2065 dev_root_level, 8);
2066
2067BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
2068 csum_root, 64);
2069BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
2070 csum_root_gen, 64);
2071BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
2072 csum_root_level, 8);
2073BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
2074 total_bytes, 64);
2075BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
2076 bytes_used, 64);
2077BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
2078 num_devices, 64);
2079
1981/* struct btrfs_super_block */ 2080/* struct btrfs_super_block */
1982 2081
1983BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 2082BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2129,6 +2228,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2129 (space_info->flags & BTRFS_BLOCK_GROUP_DATA)); 2228 (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
2130} 2229}
2131 2230
2231static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
2232{
2233 return mapping_gfp_mask(mapping) & ~__GFP_FS;
2234}
2235
2132/* extent-tree.c */ 2236/* extent-tree.c */
2133static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, 2237static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2134 unsigned num_items) 2238 unsigned num_items)
@@ -2137,6 +2241,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2137 3 * num_items; 2241 3 * num_items;
2138} 2242}
2139 2243
2244/*
2245 * Doing a truncate won't result in new nodes or leaves, just what we need for
2246 * COW.
2247 */
2248static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
2249 unsigned num_items)
2250{
2251 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2252 num_items;
2253}
2254
2140void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 2255void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
2141int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2256int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2142 struct btrfs_root *root, unsigned long count); 2257 struct btrfs_root *root, unsigned long count);
@@ -2146,6 +2261,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
2146 u64 num_bytes, u64 *refs, u64 *flags); 2261 u64 num_bytes, u64 *refs, u64 *flags);
2147int btrfs_pin_extent(struct btrfs_root *root, 2262int btrfs_pin_extent(struct btrfs_root *root,
2148 u64 bytenr, u64 num, int reserved); 2263 u64 bytenr, u64 num, int reserved);
2264int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
2265 struct btrfs_root *root,
2266 u64 bytenr, u64 num_bytes);
2149int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2267int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2150 struct btrfs_root *root, 2268 struct btrfs_root *root,
2151 u64 objectid, u64 offset, u64 bytenr); 2269 u64 objectid, u64 offset, u64 bytenr);
@@ -2196,8 +2314,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2196 u64 root_objectid, u64 owner, u64 offset); 2314 u64 root_objectid, u64 owner, u64 offset);
2197 2315
2198int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2316int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2199int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 2317int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
2200 u64 num_bytes, int reserve, int sinfo); 2318 u64 start, u64 len);
2201int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 2319int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2202 struct btrfs_root *root); 2320 struct btrfs_root *root);
2203int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2321int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2358,26 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
2240struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root); 2358struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
2241void btrfs_free_block_rsv(struct btrfs_root *root, 2359void btrfs_free_block_rsv(struct btrfs_root *root,
2242 struct btrfs_block_rsv *rsv); 2360 struct btrfs_block_rsv *rsv);
2243void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info, 2361int btrfs_block_rsv_add(struct btrfs_root *root,
2244 struct btrfs_block_rsv *rsv);
2245int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2246 struct btrfs_root *root,
2247 struct btrfs_block_rsv *block_rsv, 2362 struct btrfs_block_rsv *block_rsv,
2248 u64 num_bytes); 2363 u64 num_bytes);
2249int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 2364int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
2250 struct btrfs_root *root, 2365 struct btrfs_block_rsv *block_rsv,
2366 u64 num_bytes);
2367int btrfs_block_rsv_check(struct btrfs_root *root,
2368 struct btrfs_block_rsv *block_rsv, int min_factor);
2369int btrfs_block_rsv_refill(struct btrfs_root *root,
2251 struct btrfs_block_rsv *block_rsv, 2370 struct btrfs_block_rsv *block_rsv,
2252 u64 min_reserved, int min_factor); 2371 u64 min_reserved);
2372int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
2373 struct btrfs_block_rsv *block_rsv,
2374 u64 min_reserved);
2253int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 2375int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2254 struct btrfs_block_rsv *dst_rsv, 2376 struct btrfs_block_rsv *dst_rsv,
2255 u64 num_bytes); 2377 u64 num_bytes);
2256void btrfs_block_rsv_release(struct btrfs_root *root, 2378void btrfs_block_rsv_release(struct btrfs_root *root,
2257 struct btrfs_block_rsv *block_rsv, 2379 struct btrfs_block_rsv *block_rsv,
2258 u64 num_bytes); 2380 u64 num_bytes);
2259int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2260 struct btrfs_root *root,
2261 struct btrfs_block_rsv *rsv);
2262int btrfs_set_block_group_ro(struct btrfs_root *root, 2381int btrfs_set_block_group_ro(struct btrfs_root *root,
2263 struct btrfs_block_group_cache *cache); 2382 struct btrfs_block_group_cache *cache);
2264int btrfs_set_block_group_rw(struct btrfs_root *root, 2383int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2379,6 +2498,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2379 smp_mb(); 2498 smp_mb();
2380 return fs_info->closing; 2499 return fs_info->closing;
2381} 2500}
2501static inline void free_fs_info(struct btrfs_fs_info *fs_info)
2502{
2503 kfree(fs_info->delayed_root);
2504 kfree(fs_info->extent_root);
2505 kfree(fs_info->tree_root);
2506 kfree(fs_info->chunk_root);
2507 kfree(fs_info->dev_root);
2508 kfree(fs_info->csum_root);
2509 kfree(fs_info->super_copy);
2510 kfree(fs_info->super_for_commit);
2511 kfree(fs_info);
2512}
2382 2513
2383/* root-item.c */ 2514/* root-item.c */
2384int btrfs_find_root_ref(struct btrfs_root *tree_root, 2515int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2579,11 +2710,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
2579int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2710int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2580int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2711int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2581int btrfs_orphan_cleanup(struct btrfs_root *root); 2712int btrfs_orphan_cleanup(struct btrfs_root *root);
2582void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2583 struct btrfs_pending_snapshot *pending,
2584 u64 *bytes_to_reserve);
2585void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2586 struct btrfs_pending_snapshot *pending);
2587void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2713void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2588 struct btrfs_root *root); 2714 struct btrfs_root *root);
2589int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size); 2715int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2697,4 +2823,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
2697int btrfs_scrub_progress(struct btrfs_root *root, u64 devid, 2823int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2698 struct btrfs_scrub_progress *progress); 2824 struct btrfs_scrub_progress *progress);
2699 2825
2826/* reada.c */
2827struct reada_control {
2828 struct btrfs_root *root; /* tree to prefetch */
2829 struct btrfs_key key_start;
2830 struct btrfs_key key_end; /* exclusive */
2831 atomic_t elems;
2832 struct kref refcnt;
2833 wait_queue_head_t wait;
2834};
2835struct reada_control *btrfs_reada_add(struct btrfs_root *root,
2836 struct btrfs_key *start, struct btrfs_key *end);
2837int btrfs_reada_wait(void *handle);
2838void btrfs_reada_detach(void *handle);
2839int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
2840 u64 start, int err);
2841
2700#endif 2842#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b52c672f4c18..5b163572e0ca 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
591 return 0; 591 return 0;
592 592
593 src_rsv = trans->block_rsv; 593 src_rsv = trans->block_rsv;
594 dst_rsv = &root->fs_info->global_block_rsv; 594 dst_rsv = &root->fs_info->delayed_block_rsv;
595 595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
609 if (!item->bytes_reserved) 609 if (!item->bytes_reserved)
610 return; 610 return;
611 611
612 rsv = &root->fs_info->global_block_rsv; 612 rsv = &root->fs_info->delayed_block_rsv;
613 btrfs_block_rsv_release(root, rsv, 613 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved); 614 item->bytes_reserved);
615} 615}
@@ -617,24 +617,102 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
617static int btrfs_delayed_inode_reserve_metadata( 617static int btrfs_delayed_inode_reserve_metadata(
618 struct btrfs_trans_handle *trans, 618 struct btrfs_trans_handle *trans,
619 struct btrfs_root *root, 619 struct btrfs_root *root,
620 struct inode *inode,
620 struct btrfs_delayed_node *node) 621 struct btrfs_delayed_node *node)
621{ 622{
622 struct btrfs_block_rsv *src_rsv; 623 struct btrfs_block_rsv *src_rsv;
623 struct btrfs_block_rsv *dst_rsv; 624 struct btrfs_block_rsv *dst_rsv;
624 u64 num_bytes; 625 u64 num_bytes;
625 int ret; 626 int ret;
626 627 int release = false;
627 if (!trans->bytes_reserved)
628 return 0;
629 628
630 src_rsv = trans->block_rsv; 629 src_rsv = trans->block_rsv;
631 dst_rsv = &root->fs_info->global_block_rsv; 630 dst_rsv = &root->fs_info->delayed_block_rsv;
632 631
633 num_bytes = btrfs_calc_trans_metadata_size(root, 1); 632 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
633
634 /*
635 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
636 * which doesn't reserve space for speed. This is a problem since we
637 * still need to reserve space for this update, so try to reserve the
638 * space.
639 *
640 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
641 * we're accounted for.
642 */
643 if (!trans->bytes_reserved &&
644 src_rsv != &root->fs_info->delalloc_block_rsv) {
645 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
646 /*
647 * Since we're under a transaction reserve_metadata_bytes could
648 * try to commit the transaction which will make it return
649 * EAGAIN to make us stop the transaction we have, so return
650 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
651 */
652 if (ret == -EAGAIN)
653 ret = -ENOSPC;
654 if (!ret)
655 node->bytes_reserved = num_bytes;
656 return ret;
657 } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
658 spin_lock(&BTRFS_I(inode)->lock);
659 if (BTRFS_I(inode)->delalloc_meta_reserved) {
660 BTRFS_I(inode)->delalloc_meta_reserved = 0;
661 spin_unlock(&BTRFS_I(inode)->lock);
662 release = true;
663 goto migrate;
664 }
665 spin_unlock(&BTRFS_I(inode)->lock);
666
667 /* Ok we didn't have space pre-reserved. This shouldn't happen
668 * too often but it can happen if we do delalloc to an existing
669 * inode which gets dirtied because of the time update, and then
670 * isn't touched again until after the transaction commits and
671 * then we try to write out the data. First try to be nice and
672 * reserve something strictly for us. If not be a pain and try
673 * to steal from the delalloc block rsv.
674 */
675 ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
676 if (!ret)
677 goto out;
678
679 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
680 if (!ret)
681 goto out;
682
683 /*
684 * Ok this is a problem, let's just steal from the global rsv
685 * since this really shouldn't happen that often.
686 */
687 WARN_ON(1);
688 ret = btrfs_block_rsv_migrate(&root->fs_info->global_block_rsv,
689 dst_rsv, num_bytes);
690 goto out;
691 }
692
693migrate:
634 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes); 694 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
695
696out:
697 /*
698 * Migrate only takes a reservation, it doesn't touch the size of the
699 * block_rsv. This is to simplify people who don't normally have things
700 * migrated from their block rsv. If they go to release their
701 * reservation, that will decrease the size as well, so if migrate
702 * reduced size we'd end up with a negative size. But for the
703 * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
704 * but we could in fact do this reserve/migrate dance several times
705 * between the time we did the original reservation and we'd clean it
706 * up. So to take care of this, release the space for the meta
707 * reservation here. I think it may be time for a documentation page on
708 * how block rsvs. work.
709 */
635 if (!ret) 710 if (!ret)
636 node->bytes_reserved = num_bytes; 711 node->bytes_reserved = num_bytes;
637 712
713 if (release)
714 btrfs_block_rsv_release(root, src_rsv, num_bytes);
715
638 return ret; 716 return ret;
639} 717}
640 718
@@ -646,7 +724,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
646 if (!node->bytes_reserved) 724 if (!node->bytes_reserved)
647 return; 725 return;
648 726
649 rsv = &root->fs_info->global_block_rsv; 727 rsv = &root->fs_info->delayed_block_rsv;
650 btrfs_block_rsv_release(root, rsv, 728 btrfs_block_rsv_release(root, rsv,
651 node->bytes_reserved); 729 node->bytes_reserved);
652 node->bytes_reserved = 0; 730 node->bytes_reserved = 0;
@@ -1026,7 +1104,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1026 path->leave_spinning = 1; 1104 path->leave_spinning = 1;
1027 1105
1028 block_rsv = trans->block_rsv; 1106 block_rsv = trans->block_rsv;
1029 trans->block_rsv = &root->fs_info->global_block_rsv; 1107 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1030 1108
1031 delayed_root = btrfs_get_delayed_root(root); 1109 delayed_root = btrfs_get_delayed_root(root);
1032 1110
@@ -1069,7 +1147,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1069 path->leave_spinning = 1; 1147 path->leave_spinning = 1;
1070 1148
1071 block_rsv = trans->block_rsv; 1149 block_rsv = trans->block_rsv;
1072 trans->block_rsv = &node->root->fs_info->global_block_rsv; 1150 trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
1073 1151
1074 ret = btrfs_insert_delayed_items(trans, path, node->root, node); 1152 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1075 if (!ret) 1153 if (!ret)
@@ -1149,7 +1227,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1149 goto free_path; 1227 goto free_path;
1150 1228
1151 block_rsv = trans->block_rsv; 1229 block_rsv = trans->block_rsv;
1152 trans->block_rsv = &root->fs_info->global_block_rsv; 1230 trans->block_rsv = &root->fs_info->delayed_block_rsv;
1153 1231
1154 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node); 1232 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
1155 if (!ret) 1233 if (!ret)
@@ -1641,7 +1719,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1641 inode->i_gid = btrfs_stack_inode_gid(inode_item); 1719 inode->i_gid = btrfs_stack_inode_gid(inode_item);
1642 btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); 1720 btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
1643 inode->i_mode = btrfs_stack_inode_mode(inode_item); 1721 inode->i_mode = btrfs_stack_inode_mode(inode_item);
1644 inode->i_nlink = btrfs_stack_inode_nlink(inode_item); 1722 set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
1645 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); 1723 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
1646 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); 1724 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
1647 BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item); 1725 BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
@@ -1685,12 +1763,10 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1685 goto release_node; 1763 goto release_node;
1686 } 1764 }
1687 1765
1688 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); 1766 ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
1689 /* 1767 delayed_node);
1690 * we must reserve enough space when we start a new transaction, 1768 if (ret)
1691 * so reserving metadata failure is impossible 1769 goto release_node;
1692 */
1693 BUG_ON(ret);
1694 1770
1695 fill_stack_inode_item(trans, &delayed_node->inode_item, inode); 1771 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1696 delayed_node->inode_dirty = 1; 1772 delayed_node->inode_dirty = 1;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac662e19..632f8f3cc9db 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, 256static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
257 int verify) 257 int verify)
258{ 258{
259 u16 csum_size = 259 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
260 btrfs_super_csum_size(&root->fs_info->super_copy);
261 char *result = NULL; 260 char *result = NULL;
262 unsigned long len; 261 unsigned long len;
263 unsigned long cur_len; 262 unsigned long cur_len;
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
367 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); 366 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
368 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 367 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
369 while (1) { 368 while (1) {
370 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 369 ret = read_extent_buffer_pages(io_tree, eb, start,
370 WAIT_COMPLETE,
371 btree_get_extent, mirror_num); 371 btree_get_extent, mirror_num);
372 if (!ret && 372 if (!ret &&
373 !verify_parent_transid(io_tree, eb, parent_transid)) 373 !verify_parent_transid(io_tree, eb, parent_transid))
@@ -608,11 +608,48 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 608 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
609 end = eb->start + end - 1; 609 end = eb->start + end - 1;
610err: 610err:
611 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
612 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
613 btree_readahead_hook(root, eb, eb->start, ret);
614 }
615
611 free_extent_buffer(eb); 616 free_extent_buffer(eb);
612out: 617out:
613 return ret; 618 return ret;
614} 619}
615 620
621static int btree_io_failed_hook(struct bio *failed_bio,
622 struct page *page, u64 start, u64 end,
623 int mirror_num, struct extent_state *state)
624{
625 struct extent_io_tree *tree;
626 unsigned long len;
627 struct extent_buffer *eb;
628 struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
629
630 tree = &BTRFS_I(page->mapping->host)->io_tree;
631 if (page->private == EXTENT_PAGE_PRIVATE)
632 goto out;
633 if (!page->private)
634 goto out;
635
636 len = page->private >> 2;
637 WARN_ON(len == 0);
638
639 eb = alloc_extent_buffer(tree, start, len, page);
640 if (eb == NULL)
641 goto out;
642
643 if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
644 clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
645 btree_readahead_hook(root, eb, eb->start, -EIO);
646 }
647 free_extent_buffer(eb);
648
649out:
650 return -EIO; /* we fixed nothing */
651}
652
616static void end_workqueue_bio(struct bio *bio, int err) 653static void end_workqueue_bio(struct bio *bio, int err)
617{ 654{
618 struct end_io_wq *end_io_wq = bio->bi_private; 655 struct end_io_wq *end_io_wq = bio->bi_private;
@@ -908,7 +945,7 @@ static int btree_readpage(struct file *file, struct page *page)
908{ 945{
909 struct extent_io_tree *tree; 946 struct extent_io_tree *tree;
910 tree = &BTRFS_I(page->mapping->host)->io_tree; 947 tree = &BTRFS_I(page->mapping->host)->io_tree;
911 return extent_read_full_page(tree, page, btree_get_extent); 948 return extent_read_full_page(tree, page, btree_get_extent, 0);
912} 949}
913 950
914static int btree_releasepage(struct page *page, gfp_t gfp_flags) 951static int btree_releasepage(struct page *page, gfp_t gfp_flags)
@@ -974,11 +1011,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
974 if (!buf) 1011 if (!buf)
975 return 0; 1012 return 0;
976 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1013 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
977 buf, 0, 0, btree_get_extent, 0); 1014 buf, 0, WAIT_NONE, btree_get_extent, 0);
978 free_extent_buffer(buf); 1015 free_extent_buffer(buf);
979 return ret; 1016 return ret;
980} 1017}
981 1018
1019int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1020 int mirror_num, struct extent_buffer **eb)
1021{
1022 struct extent_buffer *buf = NULL;
1023 struct inode *btree_inode = root->fs_info->btree_inode;
1024 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1025 int ret;
1026
1027 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
1028 if (!buf)
1029 return 0;
1030
1031 set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
1032
1033 ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
1034 btree_get_extent, mirror_num);
1035 if (ret) {
1036 free_extent_buffer(buf);
1037 return ret;
1038 }
1039
1040 if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
1041 free_extent_buffer(buf);
1042 return -EIO;
1043 } else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
1044 *eb = buf;
1045 } else {
1046 free_extent_buffer(buf);
1047 }
1048 return 0;
1049}
1050
982struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 1051struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
983 u64 bytenr, u32 blocksize) 1052 u64 bytenr, u32 blocksize)
984{ 1053{
@@ -1135,10 +1204,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
1135 1204
1136 generation = btrfs_root_generation(&root->root_item); 1205 generation = btrfs_root_generation(&root->root_item);
1137 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1206 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
1207 root->commit_root = NULL;
1138 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1208 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
1139 blocksize, generation); 1209 blocksize, generation);
1140 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) { 1210 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1141 free_extent_buffer(root->node); 1211 free_extent_buffer(root->node);
1212 root->node = NULL;
1142 return -EIO; 1213 return -EIO;
1143 } 1214 }
1144 root->commit_root = btrfs_root_node(root); 1215 root->commit_root = btrfs_root_node(root);
@@ -1577,6 +1648,235 @@ sleep:
1577 return 0; 1648 return 0;
1578} 1649}
1579 1650
1651/*
1652 * this will find the highest generation in the array of
1653 * root backups. The index of the newest backup is returned,
1654 * or -1 if we can't find anything.
1655 *
1656 * We check to make sure the array is valid by comparing the
1657 * generation of the latest root in the array with the generation
1658 * in the super block. If they don't match we pitch it.
1659 */
1660static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
1661{
1662 u64 cur;
1663 int newest_index = -1;
1664 struct btrfs_root_backup *root_backup;
1665 int i;
1666
1667 for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
1668 root_backup = info->super_copy->super_roots + i;
1669 cur = btrfs_backup_tree_root_gen(root_backup);
1670 if (cur == newest_gen)
1671 newest_index = i;
1672 }
1673
1674 /* check to see if we actually wrapped around */
1675 if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
1676 root_backup = info->super_copy->super_roots;
1677 cur = btrfs_backup_tree_root_gen(root_backup);
1678 if (cur == newest_gen)
1679 newest_index = 0;
1680 }
1681 return newest_index;
1682}
1683
1684
1685/*
1686 * find the oldest backup so we know where to store new entries
1687 * in the backup array. This will set the backup_root_index
1688 * field in the fs_info struct
1689 */
1690static void find_oldest_super_backup(struct btrfs_fs_info *info,
1691 u64 newest_gen)
1692{
1693 int newest_index = -1;
1694
1695 newest_index = find_newest_super_backup(info, newest_gen);
1696 /* if there was garbage in there, just move along */
1697 if (newest_index == -1) {
1698 info->backup_root_index = 0;
1699 } else {
1700 info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
1701 }
1702}
1703
1704/*
1705 * copy all the root pointers into the super backup array.
1706 * this will bump the backup pointer by one when it is
1707 * done
1708 */
1709static void backup_super_roots(struct btrfs_fs_info *info)
1710{
1711 int next_backup;
1712 struct btrfs_root_backup *root_backup;
1713 int last_backup;
1714
1715 next_backup = info->backup_root_index;
1716 last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
1717 BTRFS_NUM_BACKUP_ROOTS;
1718
1719 /*
1720 * just overwrite the last backup if we're at the same generation
1721 * this happens only at umount
1722 */
1723 root_backup = info->super_for_commit->super_roots + last_backup;
1724 if (btrfs_backup_tree_root_gen(root_backup) ==
1725 btrfs_header_generation(info->tree_root->node))
1726 next_backup = last_backup;
1727
1728 root_backup = info->super_for_commit->super_roots + next_backup;
1729
1730 /*
1731 * make sure all of our padding and empty slots get zero filled
1732 * regardless of which ones we use today
1733 */
1734 memset(root_backup, 0, sizeof(*root_backup));
1735
1736 info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
1737
1738 btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
1739 btrfs_set_backup_tree_root_gen(root_backup,
1740 btrfs_header_generation(info->tree_root->node));
1741
1742 btrfs_set_backup_tree_root_level(root_backup,
1743 btrfs_header_level(info->tree_root->node));
1744
1745 btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
1746 btrfs_set_backup_chunk_root_gen(root_backup,
1747 btrfs_header_generation(info->chunk_root->node));
1748 btrfs_set_backup_chunk_root_level(root_backup,
1749 btrfs_header_level(info->chunk_root->node));
1750
1751 btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
1752 btrfs_set_backup_extent_root_gen(root_backup,
1753 btrfs_header_generation(info->extent_root->node));
1754 btrfs_set_backup_extent_root_level(root_backup,
1755 btrfs_header_level(info->extent_root->node));
1756
1757 /*
1758 * we might commit during log recovery, which happens before we set
1759 * the fs_root. Make sure it is valid before we fill it in.
1760 */
1761 if (info->fs_root && info->fs_root->node) {
1762 btrfs_set_backup_fs_root(root_backup,
1763 info->fs_root->node->start);
1764 btrfs_set_backup_fs_root_gen(root_backup,
1765 btrfs_header_generation(info->fs_root->node));
1766 btrfs_set_backup_fs_root_level(root_backup,
1767 btrfs_header_level(info->fs_root->node));
1768 }
1769
1770 btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
1771 btrfs_set_backup_dev_root_gen(root_backup,
1772 btrfs_header_generation(info->dev_root->node));
1773 btrfs_set_backup_dev_root_level(root_backup,
1774 btrfs_header_level(info->dev_root->node));
1775
1776 btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
1777 btrfs_set_backup_csum_root_gen(root_backup,
1778 btrfs_header_generation(info->csum_root->node));
1779 btrfs_set_backup_csum_root_level(root_backup,
1780 btrfs_header_level(info->csum_root->node));
1781
1782 btrfs_set_backup_total_bytes(root_backup,
1783 btrfs_super_total_bytes(info->super_copy));
1784 btrfs_set_backup_bytes_used(root_backup,
1785 btrfs_super_bytes_used(info->super_copy));
1786 btrfs_set_backup_num_devices(root_backup,
1787 btrfs_super_num_devices(info->super_copy));
1788
1789 /*
1790 * if we don't copy this out to the super_copy, it won't get remembered
1791 * for the next commit
1792 */
1793 memcpy(&info->super_copy->super_roots,
1794 &info->super_for_commit->super_roots,
1795 sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
1796}
1797
1798/*
1799 * this copies info out of the root backup array and back into
1800 * the in-memory super block. It is meant to help iterate through
1801 * the array, so you send it the number of backups you've already
1802 * tried and the last backup index you used.
1803 *
1804 * this returns -1 when it has tried all the backups
1805 */
1806static noinline int next_root_backup(struct btrfs_fs_info *info,
1807 struct btrfs_super_block *super,
1808 int *num_backups_tried, int *backup_index)
1809{
1810 struct btrfs_root_backup *root_backup;
1811 int newest = *backup_index;
1812
1813 if (*num_backups_tried == 0) {
1814 u64 gen = btrfs_super_generation(super);
1815
1816 newest = find_newest_super_backup(info, gen);
1817 if (newest == -1)
1818 return -1;
1819
1820 *backup_index = newest;
1821 *num_backups_tried = 1;
1822 } else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
1823 /* we've tried all the backups, all done */
1824 return -1;
1825 } else {
1826 /* jump to the next oldest backup */
1827 newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
1828 BTRFS_NUM_BACKUP_ROOTS;
1829 *backup_index = newest;
1830 *num_backups_tried += 1;
1831 }
1832 root_backup = super->super_roots + newest;
1833
1834 btrfs_set_super_generation(super,
1835 btrfs_backup_tree_root_gen(root_backup));
1836 btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
1837 btrfs_set_super_root_level(super,
1838 btrfs_backup_tree_root_level(root_backup));
1839 btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
1840
1841 /*
1842 * fixme: the total bytes and num_devices need to match, otherwise we
1843 * need a fsck
1844 */
1845 btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
1846 btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
1847 return 0;
1848}
1849
1850/* helper to cleanup tree roots */
1851static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
1852{
1853 free_extent_buffer(info->tree_root->node);
1854 free_extent_buffer(info->tree_root->commit_root);
1855 free_extent_buffer(info->dev_root->node);
1856 free_extent_buffer(info->dev_root->commit_root);
1857 free_extent_buffer(info->extent_root->node);
1858 free_extent_buffer(info->extent_root->commit_root);
1859 free_extent_buffer(info->csum_root->node);
1860 free_extent_buffer(info->csum_root->commit_root);
1861
1862 info->tree_root->node = NULL;
1863 info->tree_root->commit_root = NULL;
1864 info->dev_root->node = NULL;
1865 info->dev_root->commit_root = NULL;
1866 info->extent_root->node = NULL;
1867 info->extent_root->commit_root = NULL;
1868 info->csum_root->node = NULL;
1869 info->csum_root->commit_root = NULL;
1870
1871 if (chunk_root) {
1872 free_extent_buffer(info->chunk_root->node);
1873 free_extent_buffer(info->chunk_root->commit_root);
1874 info->chunk_root->node = NULL;
1875 info->chunk_root->commit_root = NULL;
1876 }
1877}
1878
1879
1580struct btrfs_root *open_ctree(struct super_block *sb, 1880struct btrfs_root *open_ctree(struct super_block *sb,
1581 struct btrfs_fs_devices *fs_devices, 1881 struct btrfs_fs_devices *fs_devices,
1582 char *options) 1882 char *options)
@@ -1590,29 +1890,32 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1590 u64 features; 1890 u64 features;
1591 struct btrfs_key location; 1891 struct btrfs_key location;
1592 struct buffer_head *bh; 1892 struct buffer_head *bh;
1593 struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), 1893 struct btrfs_super_block *disk_super;
1594 GFP_NOFS);
1595 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1596 GFP_NOFS);
1597 struct btrfs_root *tree_root = btrfs_sb(sb); 1894 struct btrfs_root *tree_root = btrfs_sb(sb);
1598 struct btrfs_fs_info *fs_info = NULL; 1895 struct btrfs_fs_info *fs_info = tree_root->fs_info;
1599 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1896 struct btrfs_root *extent_root;
1600 GFP_NOFS); 1897 struct btrfs_root *csum_root;
1601 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1898 struct btrfs_root *chunk_root;
1602 GFP_NOFS); 1899 struct btrfs_root *dev_root;
1603 struct btrfs_root *log_tree_root; 1900 struct btrfs_root *log_tree_root;
1604
1605 int ret; 1901 int ret;
1606 int err = -EINVAL; 1902 int err = -EINVAL;
1607 1903 int num_backups_tried = 0;
1608 struct btrfs_super_block *disk_super; 1904 int backup_index = 0;
1609 1905
1610 if (!extent_root || !tree_root || !tree_root->fs_info || 1906 extent_root = fs_info->extent_root =
1611 !chunk_root || !dev_root || !csum_root) { 1907 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1908 csum_root = fs_info->csum_root =
1909 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1910 chunk_root = fs_info->chunk_root =
1911 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1912 dev_root = fs_info->dev_root =
1913 kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
1914
1915 if (!extent_root || !csum_root || !chunk_root || !dev_root) {
1612 err = -ENOMEM; 1916 err = -ENOMEM;
1613 goto fail; 1917 goto fail;
1614 } 1918 }
1615 fs_info = tree_root->fs_info;
1616 1919
1617 ret = init_srcu_struct(&fs_info->subvol_srcu); 1920 ret = init_srcu_struct(&fs_info->subvol_srcu);
1618 if (ret) { 1921 if (ret) {
@@ -1648,15 +1951,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1648 spin_lock_init(&fs_info->fs_roots_radix_lock); 1951 spin_lock_init(&fs_info->fs_roots_radix_lock);
1649 spin_lock_init(&fs_info->delayed_iput_lock); 1952 spin_lock_init(&fs_info->delayed_iput_lock);
1650 spin_lock_init(&fs_info->defrag_inodes_lock); 1953 spin_lock_init(&fs_info->defrag_inodes_lock);
1954 spin_lock_init(&fs_info->free_chunk_lock);
1651 mutex_init(&fs_info->reloc_mutex); 1955 mutex_init(&fs_info->reloc_mutex);
1652 1956
1653 init_completion(&fs_info->kobj_unregister); 1957 init_completion(&fs_info->kobj_unregister);
1654 fs_info->tree_root = tree_root;
1655 fs_info->extent_root = extent_root;
1656 fs_info->csum_root = csum_root;
1657 fs_info->chunk_root = chunk_root;
1658 fs_info->dev_root = dev_root;
1659 fs_info->fs_devices = fs_devices;
1660 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); 1958 INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
1661 INIT_LIST_HEAD(&fs_info->space_info); 1959 INIT_LIST_HEAD(&fs_info->space_info);
1662 btrfs_mapping_init(&fs_info->mapping_tree); 1960 btrfs_mapping_init(&fs_info->mapping_tree);
@@ -1665,8 +1963,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1665 btrfs_init_block_rsv(&fs_info->trans_block_rsv); 1963 btrfs_init_block_rsv(&fs_info->trans_block_rsv);
1666 btrfs_init_block_rsv(&fs_info->chunk_block_rsv); 1964 btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
1667 btrfs_init_block_rsv(&fs_info->empty_block_rsv); 1965 btrfs_init_block_rsv(&fs_info->empty_block_rsv);
1668 INIT_LIST_HEAD(&fs_info->durable_block_rsv_list); 1966 btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
1669 mutex_init(&fs_info->durable_block_rsv_mutex);
1670 atomic_set(&fs_info->nr_async_submits, 0); 1967 atomic_set(&fs_info->nr_async_submits, 0);
1671 atomic_set(&fs_info->async_delalloc_pages, 0); 1968 atomic_set(&fs_info->async_delalloc_pages, 0);
1672 atomic_set(&fs_info->async_submit_draining, 0); 1969 atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1974,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1677 fs_info->metadata_ratio = 0; 1974 fs_info->metadata_ratio = 0;
1678 fs_info->defrag_inodes = RB_ROOT; 1975 fs_info->defrag_inodes = RB_ROOT;
1679 fs_info->trans_no_join = 0; 1976 fs_info->trans_no_join = 0;
1977 fs_info->free_chunk_space = 0;
1978
1979 /* readahead state */
1980 INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
1981 spin_lock_init(&fs_info->reada_lock);
1680 1982
1681 fs_info->thread_pool_size = min_t(unsigned long, 1983 fs_info->thread_pool_size = min_t(unsigned long,
1682 num_online_cpus() + 2, 8); 1984 num_online_cpus() + 2, 8);
@@ -1705,7 +2007,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1705 sb->s_bdi = &fs_info->bdi; 2007 sb->s_bdi = &fs_info->bdi;
1706 2008
1707 fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID; 2009 fs_info->btree_inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
1708 fs_info->btree_inode->i_nlink = 1; 2010 set_nlink(fs_info->btree_inode, 1);
1709 /* 2011 /*
1710 * we set the i_size on the btree inode to the max possible int. 2012 * we set the i_size on the btree inode to the max possible int.
1711 * the real end of the address space is determined by all of 2013 * the real end of the address space is determined by all of
@@ -1766,14 +2068,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1766 goto fail_alloc; 2068 goto fail_alloc;
1767 } 2069 }
1768 2070
1769 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 2071 memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
1770 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 2072 memcpy(fs_info->super_for_commit, fs_info->super_copy,
1771 sizeof(fs_info->super_for_commit)); 2073 sizeof(*fs_info->super_for_commit));
1772 brelse(bh); 2074 brelse(bh);
1773 2075
1774 memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); 2076 memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
1775 2077
1776 disk_super = &fs_info->super_copy; 2078 disk_super = fs_info->super_copy;
1777 if (!btrfs_super_root(disk_super)) 2079 if (!btrfs_super_root(disk_super))
1778 goto fail_alloc; 2080 goto fail_alloc;
1779 2081
@@ -1783,6 +2085,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1783 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); 2085 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1784 2086
1785 /* 2087 /*
2088 * run through our array of backup supers and setup
2089 * our ring pointer to the oldest one
2090 */
2091 generation = btrfs_super_generation(disk_super);
2092 find_oldest_super_backup(fs_info, generation);
2093
2094 /*
1786 * In the long term, we'll store the compression type in the super 2095 * In the long term, we'll store the compression type in the super
1787 * block, and it'll be used for per file compression control. 2096 * block, and it'll be used for per file compression control.
1788 */ 2097 */
@@ -1870,6 +2179,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1870 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta", 2179 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1871 fs_info->thread_pool_size, 2180 fs_info->thread_pool_size,
1872 &fs_info->generic_worker); 2181 &fs_info->generic_worker);
2182 btrfs_init_workers(&fs_info->readahead_workers, "readahead",
2183 fs_info->thread_pool_size,
2184 &fs_info->generic_worker);
1873 2185
1874 /* 2186 /*
1875 * endios are largely parallel and should have a very 2187 * endios are largely parallel and should have a very
@@ -1880,6 +2192,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1880 2192
1881 fs_info->endio_write_workers.idle_thresh = 2; 2193 fs_info->endio_write_workers.idle_thresh = 2;
1882 fs_info->endio_meta_write_workers.idle_thresh = 2; 2194 fs_info->endio_meta_write_workers.idle_thresh = 2;
2195 fs_info->readahead_workers.idle_thresh = 2;
1883 2196
1884 btrfs_start_workers(&fs_info->workers, 1); 2197 btrfs_start_workers(&fs_info->workers, 1);
1885 btrfs_start_workers(&fs_info->generic_worker, 1); 2198 btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1893,6 +2206,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1893 btrfs_start_workers(&fs_info->endio_freespace_worker, 1); 2206 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1894 btrfs_start_workers(&fs_info->delayed_workers, 1); 2207 btrfs_start_workers(&fs_info->delayed_workers, 1);
1895 btrfs_start_workers(&fs_info->caching_workers, 1); 2208 btrfs_start_workers(&fs_info->caching_workers, 1);
2209 btrfs_start_workers(&fs_info->readahead_workers, 1);
1896 2210
1897 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 2211 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1898 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 2212 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1939,7 +2253,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1939 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2253 if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
1940 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", 2254 printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
1941 sb->s_id); 2255 sb->s_id);
1942 goto fail_chunk_root; 2256 goto fail_tree_roots;
1943 } 2257 }
1944 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); 2258 btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
1945 chunk_root->commit_root = btrfs_root_node(chunk_root); 2259 chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1954,11 +2268,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1954 if (ret) { 2268 if (ret) {
1955 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", 2269 printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
1956 sb->s_id); 2270 sb->s_id);
1957 goto fail_chunk_root; 2271 goto fail_tree_roots;
1958 } 2272 }
1959 2273
1960 btrfs_close_extra_devices(fs_devices); 2274 btrfs_close_extra_devices(fs_devices);
1961 2275
2276retry_root_backup:
1962 blocksize = btrfs_level_size(tree_root, 2277 blocksize = btrfs_level_size(tree_root,
1963 btrfs_super_root_level(disk_super)); 2278 btrfs_super_root_level(disk_super));
1964 generation = btrfs_super_generation(disk_super); 2279 generation = btrfs_super_generation(disk_super);
@@ -1966,32 +2281,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1966 tree_root->node = read_tree_block(tree_root, 2281 tree_root->node = read_tree_block(tree_root,
1967 btrfs_super_root(disk_super), 2282 btrfs_super_root(disk_super),
1968 blocksize, generation); 2283 blocksize, generation);
1969 if (!tree_root->node) 2284 if (!tree_root->node ||
1970 goto fail_chunk_root; 2285 !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1971 if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
1972 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", 2286 printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
1973 sb->s_id); 2287 sb->s_id);
1974 goto fail_tree_root; 2288
2289 goto recovery_tree_root;
1975 } 2290 }
2291
1976 btrfs_set_root_node(&tree_root->root_item, tree_root->node); 2292 btrfs_set_root_node(&tree_root->root_item, tree_root->node);
1977 tree_root->commit_root = btrfs_root_node(tree_root); 2293 tree_root->commit_root = btrfs_root_node(tree_root);
1978 2294
1979 ret = find_and_setup_root(tree_root, fs_info, 2295 ret = find_and_setup_root(tree_root, fs_info,
1980 BTRFS_EXTENT_TREE_OBJECTID, extent_root); 2296 BTRFS_EXTENT_TREE_OBJECTID, extent_root);
1981 if (ret) 2297 if (ret)
1982 goto fail_tree_root; 2298 goto recovery_tree_root;
1983 extent_root->track_dirty = 1; 2299 extent_root->track_dirty = 1;
1984 2300
1985 ret = find_and_setup_root(tree_root, fs_info, 2301 ret = find_and_setup_root(tree_root, fs_info,
1986 BTRFS_DEV_TREE_OBJECTID, dev_root); 2302 BTRFS_DEV_TREE_OBJECTID, dev_root);
1987 if (ret) 2303 if (ret)
1988 goto fail_extent_root; 2304 goto recovery_tree_root;
1989 dev_root->track_dirty = 1; 2305 dev_root->track_dirty = 1;
1990 2306
1991 ret = find_and_setup_root(tree_root, fs_info, 2307 ret = find_and_setup_root(tree_root, fs_info,
1992 BTRFS_CSUM_TREE_OBJECTID, csum_root); 2308 BTRFS_CSUM_TREE_OBJECTID, csum_root);
1993 if (ret) 2309 if (ret)
1994 goto fail_dev_root; 2310 goto recovery_tree_root;
1995 2311
1996 csum_root->track_dirty = 1; 2312 csum_root->track_dirty = 1;
1997 2313
@@ -2124,22 +2440,13 @@ fail_cleaner:
2124 2440
2125fail_block_groups: 2441fail_block_groups:
2126 btrfs_free_block_groups(fs_info); 2442 btrfs_free_block_groups(fs_info);
2127 free_extent_buffer(csum_root->node); 2443
2128 free_extent_buffer(csum_root->commit_root); 2444fail_tree_roots:
2129fail_dev_root: 2445 free_root_pointers(fs_info, 1);
2130 free_extent_buffer(dev_root->node); 2446
2131 free_extent_buffer(dev_root->commit_root);
2132fail_extent_root:
2133 free_extent_buffer(extent_root->node);
2134 free_extent_buffer(extent_root->commit_root);
2135fail_tree_root:
2136 free_extent_buffer(tree_root->node);
2137 free_extent_buffer(tree_root->commit_root);
2138fail_chunk_root:
2139 free_extent_buffer(chunk_root->node);
2140 free_extent_buffer(chunk_root->commit_root);
2141fail_sb_buffer: 2447fail_sb_buffer:
2142 btrfs_stop_workers(&fs_info->generic_worker); 2448 btrfs_stop_workers(&fs_info->generic_worker);
2449 btrfs_stop_workers(&fs_info->readahead_workers);
2143 btrfs_stop_workers(&fs_info->fixup_workers); 2450 btrfs_stop_workers(&fs_info->fixup_workers);
2144 btrfs_stop_workers(&fs_info->delalloc_workers); 2451 btrfs_stop_workers(&fs_info->delalloc_workers);
2145 btrfs_stop_workers(&fs_info->workers); 2452 btrfs_stop_workers(&fs_info->workers);
@@ -2152,25 +2459,37 @@ fail_sb_buffer:
2152 btrfs_stop_workers(&fs_info->delayed_workers); 2459 btrfs_stop_workers(&fs_info->delayed_workers);
2153 btrfs_stop_workers(&fs_info->caching_workers); 2460 btrfs_stop_workers(&fs_info->caching_workers);
2154fail_alloc: 2461fail_alloc:
2155 kfree(fs_info->delayed_root);
2156fail_iput: 2462fail_iput:
2463 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2464
2157 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2465 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2158 iput(fs_info->btree_inode); 2466 iput(fs_info->btree_inode);
2159
2160 btrfs_close_devices(fs_info->fs_devices);
2161 btrfs_mapping_tree_free(&fs_info->mapping_tree);
2162fail_bdi: 2467fail_bdi:
2163 bdi_destroy(&fs_info->bdi); 2468 bdi_destroy(&fs_info->bdi);
2164fail_srcu: 2469fail_srcu:
2165 cleanup_srcu_struct(&fs_info->subvol_srcu); 2470 cleanup_srcu_struct(&fs_info->subvol_srcu);
2166fail: 2471fail:
2167 kfree(extent_root); 2472 btrfs_close_devices(fs_info->fs_devices);
2168 kfree(tree_root); 2473 free_fs_info(fs_info);
2169 kfree(fs_info);
2170 kfree(chunk_root);
2171 kfree(dev_root);
2172 kfree(csum_root);
2173 return ERR_PTR(err); 2474 return ERR_PTR(err);
2475
2476recovery_tree_root:
2477 if (!btrfs_test_opt(tree_root, RECOVERY))
2478 goto fail_tree_roots;
2479
2480 free_root_pointers(fs_info, 0);
2481
2482 /* don't use the log in recovery mode, it won't be valid */
2483 btrfs_set_super_log_root(disk_super, 0);
2484
2485 /* we can't trust the free space cache either */
2486 btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2487
2488 ret = next_root_backup(fs_info, fs_info->super_copy,
2489 &num_backups_tried, &backup_index);
2490 if (ret == -1)
2491 goto fail_block_groups;
2492 goto retry_root_backup;
2174} 2493}
2175 2494
2176static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2495static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2254,22 +2573,10 @@ static int write_dev_supers(struct btrfs_device *device,
2254 int errors = 0; 2573 int errors = 0;
2255 u32 crc; 2574 u32 crc;
2256 u64 bytenr; 2575 u64 bytenr;
2257 int last_barrier = 0;
2258 2576
2259 if (max_mirrors == 0) 2577 if (max_mirrors == 0)
2260 max_mirrors = BTRFS_SUPER_MIRROR_MAX; 2578 max_mirrors = BTRFS_SUPER_MIRROR_MAX;
2261 2579
2262 /* make sure only the last submit_bh does a barrier */
2263 if (do_barriers) {
2264 for (i = 0; i < max_mirrors; i++) {
2265 bytenr = btrfs_sb_offset(i);
2266 if (bytenr + BTRFS_SUPER_INFO_SIZE >=
2267 device->total_bytes)
2268 break;
2269 last_barrier = i;
2270 }
2271 }
2272
2273 for (i = 0; i < max_mirrors; i++) { 2580 for (i = 0; i < max_mirrors; i++) {
2274 bytenr = btrfs_sb_offset(i); 2581 bytenr = btrfs_sb_offset(i);
2275 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) 2582 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
@@ -2315,17 +2622,136 @@ static int write_dev_supers(struct btrfs_device *device,
2315 bh->b_end_io = btrfs_end_buffer_write_sync; 2622 bh->b_end_io = btrfs_end_buffer_write_sync;
2316 } 2623 }
2317 2624
2318 if (i == last_barrier && do_barriers) 2625 /*
2319 ret = submit_bh(WRITE_FLUSH_FUA, bh); 2626 * we fua the first super. The others we allow
2320 else 2627 * to go down lazy.
2321 ret = submit_bh(WRITE_SYNC, bh); 2628 */
2322 2629 ret = submit_bh(WRITE_FUA, bh);
2323 if (ret) 2630 if (ret)
2324 errors++; 2631 errors++;
2325 } 2632 }
2326 return errors < i ? 0 : -1; 2633 return errors < i ? 0 : -1;
2327} 2634}
2328 2635
2636/*
2637 * endio for the write_dev_flush, this will wake anyone waiting
2638 * for the barrier when it is done
2639 */
2640static void btrfs_end_empty_barrier(struct bio *bio, int err)
2641{
2642 if (err) {
2643 if (err == -EOPNOTSUPP)
2644 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2645 clear_bit(BIO_UPTODATE, &bio->bi_flags);
2646 }
2647 if (bio->bi_private)
2648 complete(bio->bi_private);
2649 bio_put(bio);
2650}
2651
2652/*
2653 * trigger flushes for one of the devices. If you pass wait == 0, the flushes are
2654 * sent down. With wait == 1, it waits for the previous flush.
2655 *
2656 * any device where the flush fails with eopnotsupp is flagged as not-barrier
2657 * capable
2658 */
2659static int write_dev_flush(struct btrfs_device *device, int wait)
2660{
2661 struct bio *bio;
2662 int ret = 0;
2663
2664 if (device->nobarriers)
2665 return 0;
2666
2667 if (wait) {
2668 bio = device->flush_bio;
2669 if (!bio)
2670 return 0;
2671
2672 wait_for_completion(&device->flush_wait);
2673
2674 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
2675 printk("btrfs: disabling barriers on dev %s\n",
2676 device->name);
2677 device->nobarriers = 1;
2678 }
2679 if (!bio_flagged(bio, BIO_UPTODATE)) {
2680 ret = -EIO;
2681 }
2682
2683 /* drop the reference from the wait == 0 run */
2684 bio_put(bio);
2685 device->flush_bio = NULL;
2686
2687 return ret;
2688 }
2689
2690 /*
2691 * one reference for us, and we leave it for the
2692 * caller
2693 */
2694 device->flush_bio = NULL;;
2695 bio = bio_alloc(GFP_NOFS, 0);
2696 if (!bio)
2697 return -ENOMEM;
2698
2699 bio->bi_end_io = btrfs_end_empty_barrier;
2700 bio->bi_bdev = device->bdev;
2701 init_completion(&device->flush_wait);
2702 bio->bi_private = &device->flush_wait;
2703 device->flush_bio = bio;
2704
2705 bio_get(bio);
2706 submit_bio(WRITE_FLUSH, bio);
2707
2708 return 0;
2709}
2710
2711/*
2712 * send an empty flush down to each device in parallel,
2713 * then wait for them
2714 */
2715static int barrier_all_devices(struct btrfs_fs_info *info)
2716{
2717 struct list_head *head;
2718 struct btrfs_device *dev;
2719 int errors = 0;
2720 int ret;
2721
2722 /* send down all the barriers */
2723 head = &info->fs_devices->devices;
2724 list_for_each_entry_rcu(dev, head, dev_list) {
2725 if (!dev->bdev) {
2726 errors++;
2727 continue;
2728 }
2729 if (!dev->in_fs_metadata || !dev->writeable)
2730 continue;
2731
2732 ret = write_dev_flush(dev, 0);
2733 if (ret)
2734 errors++;
2735 }
2736
2737 /* wait for all the barriers */
2738 list_for_each_entry_rcu(dev, head, dev_list) {
2739 if (!dev->bdev) {
2740 errors++;
2741 continue;
2742 }
2743 if (!dev->in_fs_metadata || !dev->writeable)
2744 continue;
2745
2746 ret = write_dev_flush(dev, 1);
2747 if (ret)
2748 errors++;
2749 }
2750 if (errors)
2751 return -EIO;
2752 return 0;
2753}
2754
2329int write_all_supers(struct btrfs_root *root, int max_mirrors) 2755int write_all_supers(struct btrfs_root *root, int max_mirrors)
2330{ 2756{
2331 struct list_head *head; 2757 struct list_head *head;
@@ -2338,14 +2764,19 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2338 int total_errors = 0; 2764 int total_errors = 0;
2339 u64 flags; 2765 u64 flags;
2340 2766
2341 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 2767 max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
2342 do_barriers = !btrfs_test_opt(root, NOBARRIER); 2768 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2769 backup_super_roots(root->fs_info);
2343 2770
2344 sb = &root->fs_info->super_for_commit; 2771 sb = root->fs_info->super_for_commit;
2345 dev_item = &sb->dev_item; 2772 dev_item = &sb->dev_item;
2346 2773
2347 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2774 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2348 head = &root->fs_info->fs_devices->devices; 2775 head = &root->fs_info->fs_devices->devices;
2776
2777 if (do_barriers)
2778 barrier_all_devices(root->fs_info);
2779
2349 list_for_each_entry_rcu(dev, head, dev_list) { 2780 list_for_each_entry_rcu(dev, head, dev_list) {
2350 if (!dev->bdev) { 2781 if (!dev->bdev) {
2351 total_errors++; 2782 total_errors++;
@@ -2545,8 +2976,6 @@ int close_ctree(struct btrfs_root *root)
2545 /* clear out the rbtree of defraggable inodes */ 2976 /* clear out the rbtree of defraggable inodes */
2546 btrfs_run_defrag_inodes(root->fs_info); 2977 btrfs_run_defrag_inodes(root->fs_info);
2547 2978
2548 btrfs_put_block_group_cache(fs_info);
2549
2550 /* 2979 /*
2551 * Here come 2 situations when btrfs is broken to flip readonly: 2980 * Here come 2 situations when btrfs is broken to flip readonly:
2552 * 2981 *
@@ -2572,6 +3001,8 @@ int close_ctree(struct btrfs_root *root)
2572 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 3001 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2573 } 3002 }
2574 3003
3004 btrfs_put_block_group_cache(fs_info);
3005
2575 kthread_stop(root->fs_info->transaction_kthread); 3006 kthread_stop(root->fs_info->transaction_kthread);
2576 kthread_stop(root->fs_info->cleaner_kthread); 3007 kthread_stop(root->fs_info->cleaner_kthread);
2577 3008
@@ -2603,7 +3034,6 @@ int close_ctree(struct btrfs_root *root)
2603 del_fs_roots(fs_info); 3034 del_fs_roots(fs_info);
2604 3035
2605 iput(fs_info->btree_inode); 3036 iput(fs_info->btree_inode);
2606 kfree(fs_info->delayed_root);
2607 3037
2608 btrfs_stop_workers(&fs_info->generic_worker); 3038 btrfs_stop_workers(&fs_info->generic_worker);
2609 btrfs_stop_workers(&fs_info->fixup_workers); 3039 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2617,6 +3047,7 @@ int close_ctree(struct btrfs_root *root)
2617 btrfs_stop_workers(&fs_info->submit_workers); 3047 btrfs_stop_workers(&fs_info->submit_workers);
2618 btrfs_stop_workers(&fs_info->delayed_workers); 3048 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers); 3049 btrfs_stop_workers(&fs_info->caching_workers);
3050 btrfs_stop_workers(&fs_info->readahead_workers);
2620 3051
2621 btrfs_close_devices(fs_info->fs_devices); 3052 btrfs_close_devices(fs_info->fs_devices);
2622 btrfs_mapping_tree_free(&fs_info->mapping_tree); 3053 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2624,12 +3055,7 @@ int close_ctree(struct btrfs_root *root)
2624 bdi_destroy(&fs_info->bdi); 3055 bdi_destroy(&fs_info->bdi);
2625 cleanup_srcu_struct(&fs_info->subvol_srcu); 3056 cleanup_srcu_struct(&fs_info->subvol_srcu);
2626 3057
2627 kfree(fs_info->extent_root); 3058 free_fs_info(fs_info);
2628 kfree(fs_info->tree_root);
2629 kfree(fs_info->chunk_root);
2630 kfree(fs_info->dev_root);
2631 kfree(fs_info->csum_root);
2632 kfree(fs_info);
2633 3059
2634 return 0; 3060 return 0;
2635} 3061}
@@ -2735,7 +3161,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2735 return ret; 3161 return ret;
2736} 3162}
2737 3163
2738int btree_lock_page_hook(struct page *page) 3164static int btree_lock_page_hook(struct page *page, void *data,
3165 void (*flush_fn)(void *))
2739{ 3166{
2740 struct inode *inode = page->mapping->host; 3167 struct inode *inode = page->mapping->host;
2741 struct btrfs_root *root = BTRFS_I(inode)->root; 3168 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +3179,10 @@ int btree_lock_page_hook(struct page *page)
2752 if (!eb) 3179 if (!eb)
2753 goto out; 3180 goto out;
2754 3181
2755 btrfs_tree_lock(eb); 3182 if (!btrfs_try_tree_write_lock(eb)) {
3183 flush_fn(data);
3184 btrfs_tree_lock(eb);
3185 }
2756 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3186 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2757 3187
2758 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3188 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +3197,10 @@ int btree_lock_page_hook(struct page *page)
2767 btrfs_tree_unlock(eb); 3197 btrfs_tree_unlock(eb);
2768 free_extent_buffer(eb); 3198 free_extent_buffer(eb);
2769out: 3199out:
2770 lock_page(page); 3200 if (!trylock_page(page)) {
3201 flush_fn(data);
3202 lock_page(page);
3203 }
2771 return 0; 3204 return 0;
2772} 3205}
2773 3206
@@ -3123,6 +3556,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3123static struct extent_io_ops btree_extent_io_ops = { 3556static struct extent_io_ops btree_extent_io_ops = {
3124 .write_cache_pages_lock_hook = btree_lock_page_hook, 3557 .write_cache_pages_lock_hook = btree_lock_page_hook,
3125 .readpage_end_io_hook = btree_readpage_end_io_hook, 3558 .readpage_end_io_hook = btree_readpage_end_io_hook,
3559 .readpage_io_failed_hook = btree_io_failed_hook,
3126 .submit_bio_hook = btree_submit_bio_hook, 3560 .submit_bio_hook = btree_submit_bio_hook,
3127 /* note we're sharing with inode.c for the merge bio hook */ 3561 /* note we're sharing with inode.c for the merge bio hook */
3128 .merge_bio_hook = btrfs_merge_bio_hook, 3562 .merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..c99d0a8f13fa 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid); 40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid); 42 u64 parent_transid);
43int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
44 int mirror_num, struct extent_buffer **eb);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 45struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info); 85 struct btrfs_fs_info *fs_info);
84int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
85 struct btrfs_root *root); 87 struct btrfs_root *root);
86int btree_lock_page_hook(struct page *page);
87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_init_lockdep(void); 90void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462f..f0d5718d2587 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h>
26#include "compat.h" 27#include "compat.h"
27#include "hash.h" 28#include "hash.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
52 CHUNK_ALLOC_LIMITED = 2, 53 CHUNK_ALLOC_LIMITED = 2,
53}; 54};
54 55
56/*
57 * Control how reservations are dealt with.
58 *
59 * RESERVE_FREE - freeing a reservation.
60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61 * ENOSPC accounting
62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63 * bytes_may_use as the ENOSPC accounting is done elsewhere
64 */
65enum {
66 RESERVE_FREE = 0,
67 RESERVE_ALLOC = 1,
68 RESERVE_ALLOC_NO_ACCOUNT = 2,
69};
70
55static int update_block_group(struct btrfs_trans_handle *trans, 71static int update_block_group(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root, 72 struct btrfs_root *root,
57 u64 bytenr, u64 num_bytes, int alloc); 73 u64 bytenr, u64 num_bytes, int alloc);
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
81 struct btrfs_key *key); 97 struct btrfs_key *key);
82static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
83 int dump_block_groups); 99 int dump_block_groups);
100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 u64 num_bytes, int reserve);
84 102
85static noinline int 103static noinline int
86block_group_cache_done(struct btrfs_block_group_cache *cache) 104block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104 if (atomic_dec_and_test(&cache->count)) { 122 if (atomic_dec_and_test(&cache->count)) {
105 WARN_ON(cache->pinned > 0); 123 WARN_ON(cache->pinned > 0);
106 WARN_ON(cache->reserved > 0); 124 WARN_ON(cache->reserved > 0);
107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl); 125 kfree(cache->free_space_ctl);
109 kfree(cache); 126 kfree(cache);
110 } 127 }
@@ -450,13 +467,59 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
450 struct btrfs_root *root, 467 struct btrfs_root *root,
451 int load_cache_only) 468 int load_cache_only)
452{ 469{
470 DEFINE_WAIT(wait);
453 struct btrfs_fs_info *fs_info = cache->fs_info; 471 struct btrfs_fs_info *fs_info = cache->fs_info;
454 struct btrfs_caching_control *caching_ctl; 472 struct btrfs_caching_control *caching_ctl;
455 int ret = 0; 473 int ret = 0;
456 474
457 smp_mb(); 475 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
458 if (cache->cached != BTRFS_CACHE_NO) 476 BUG_ON(!caching_ctl);
477
478 INIT_LIST_HEAD(&caching_ctl->list);
479 mutex_init(&caching_ctl->mutex);
480 init_waitqueue_head(&caching_ctl->wait);
481 caching_ctl->block_group = cache;
482 caching_ctl->progress = cache->key.objectid;
483 atomic_set(&caching_ctl->count, 1);
484 caching_ctl->work.func = caching_thread;
485
486 spin_lock(&cache->lock);
487 /*
488 * This should be a rare occasion, but this could happen I think in the
489 * case where one thread starts to load the space cache info, and then
490 * some other thread starts a transaction commit which tries to do an
491 * allocation while the other thread is still loading the space cache
492 * info. The previous loop should have kept us from choosing this block
493 * group, but if we've moved to the state where we will wait on caching
494 * block groups we need to first check if we're doing a fast load here,
495 * so we can wait for it to finish, otherwise we could end up allocating
496 * from a block group whose cache gets evicted for one reason or
497 * another.
498 */
499 while (cache->cached == BTRFS_CACHE_FAST) {
500 struct btrfs_caching_control *ctl;
501
502 ctl = cache->caching_ctl;
503 atomic_inc(&ctl->count);
504 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
505 spin_unlock(&cache->lock);
506
507 schedule();
508
509 finish_wait(&ctl->wait, &wait);
510 put_caching_control(ctl);
511 spin_lock(&cache->lock);
512 }
513
514 if (cache->cached != BTRFS_CACHE_NO) {
515 spin_unlock(&cache->lock);
516 kfree(caching_ctl);
459 return 0; 517 return 0;
518 }
519 WARN_ON(cache->caching_ctl);
520 cache->caching_ctl = caching_ctl;
521 cache->cached = BTRFS_CACHE_FAST;
522 spin_unlock(&cache->lock);
460 523
461 /* 524 /*
462 * We can't do the read from on-disk cache during a commit since we need 525 * We can't do the read from on-disk cache during a commit since we need
@@ -465,57 +528,53 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
465 * we likely hold important locks. 528 * we likely hold important locks.
466 */ 529 */
467 if (trans && (!trans->transaction->in_commit) && 530 if (trans && (!trans->transaction->in_commit) &&
468 (root && root != root->fs_info->tree_root)) { 531 (root && root != root->fs_info->tree_root) &&
469 spin_lock(&cache->lock); 532 btrfs_test_opt(root, SPACE_CACHE)) {
470 if (cache->cached != BTRFS_CACHE_NO) {
471 spin_unlock(&cache->lock);
472 return 0;
473 }
474 cache->cached = BTRFS_CACHE_STARTED;
475 spin_unlock(&cache->lock);
476
477 ret = load_free_space_cache(fs_info, cache); 533 ret = load_free_space_cache(fs_info, cache);
478 534
479 spin_lock(&cache->lock); 535 spin_lock(&cache->lock);
480 if (ret == 1) { 536 if (ret == 1) {
537 cache->caching_ctl = NULL;
481 cache->cached = BTRFS_CACHE_FINISHED; 538 cache->cached = BTRFS_CACHE_FINISHED;
482 cache->last_byte_to_unpin = (u64)-1; 539 cache->last_byte_to_unpin = (u64)-1;
483 } else { 540 } else {
484 cache->cached = BTRFS_CACHE_NO; 541 if (load_cache_only) {
542 cache->caching_ctl = NULL;
543 cache->cached = BTRFS_CACHE_NO;
544 } else {
545 cache->cached = BTRFS_CACHE_STARTED;
546 }
485 } 547 }
486 spin_unlock(&cache->lock); 548 spin_unlock(&cache->lock);
549 wake_up(&caching_ctl->wait);
487 if (ret == 1) { 550 if (ret == 1) {
551 put_caching_control(caching_ctl);
488 free_excluded_extents(fs_info->extent_root, cache); 552 free_excluded_extents(fs_info->extent_root, cache);
489 return 0; 553 return 0;
490 } 554 }
555 } else {
556 /*
557 * We are not going to do the fast caching, set cached to the
558 * appropriate value and wakeup any waiters.
559 */
560 spin_lock(&cache->lock);
561 if (load_cache_only) {
562 cache->caching_ctl = NULL;
563 cache->cached = BTRFS_CACHE_NO;
564 } else {
565 cache->cached = BTRFS_CACHE_STARTED;
566 }
567 spin_unlock(&cache->lock);
568 wake_up(&caching_ctl->wait);
491 } 569 }
492 570
493 if (load_cache_only) 571 if (load_cache_only) {
494 return 0; 572 put_caching_control(caching_ctl);
495
496 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
497 BUG_ON(!caching_ctl);
498
499 INIT_LIST_HEAD(&caching_ctl->list);
500 mutex_init(&caching_ctl->mutex);
501 init_waitqueue_head(&caching_ctl->wait);
502 caching_ctl->block_group = cache;
503 caching_ctl->progress = cache->key.objectid;
504 /* one for caching kthread, one for caching block group list */
505 atomic_set(&caching_ctl->count, 2);
506 caching_ctl->work.func = caching_thread;
507
508 spin_lock(&cache->lock);
509 if (cache->cached != BTRFS_CACHE_NO) {
510 spin_unlock(&cache->lock);
511 kfree(caching_ctl);
512 return 0; 573 return 0;
513 } 574 }
514 cache->caching_ctl = caching_ctl;
515 cache->cached = BTRFS_CACHE_STARTED;
516 spin_unlock(&cache->lock);
517 575
518 down_write(&fs_info->extent_commit_sem); 576 down_write(&fs_info->extent_commit_sem);
577 atomic_inc(&caching_ctl->count);
519 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); 578 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
520 up_write(&fs_info->extent_commit_sem); 579 up_write(&fs_info->extent_commit_sem);
521 580
@@ -1770,18 +1829,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1770{ 1829{
1771 int ret; 1830 int ret;
1772 u64 discarded_bytes = 0; 1831 u64 discarded_bytes = 0;
1773 struct btrfs_multi_bio *multi = NULL; 1832 struct btrfs_bio *bbio = NULL;
1774 1833
1775 1834
1776 /* Tell the block device(s) that the sectors can be discarded */ 1835 /* Tell the block device(s) that the sectors can be discarded */
1777 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1836 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1778 bytenr, &num_bytes, &multi, 0); 1837 bytenr, &num_bytes, &bbio, 0);
1779 if (!ret) { 1838 if (!ret) {
1780 struct btrfs_bio_stripe *stripe = multi->stripes; 1839 struct btrfs_bio_stripe *stripe = bbio->stripes;
1781 int i; 1840 int i;
1782 1841
1783 1842
1784 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1843 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1785 if (!stripe->dev->can_discard) 1844 if (!stripe->dev->can_discard)
1786 continue; 1845 continue;
1787 1846
@@ -1800,7 +1859,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1800 */ 1859 */
1801 ret = 0; 1860 ret = 0;
1802 } 1861 }
1803 kfree(multi); 1862 kfree(bbio);
1804 } 1863 }
1805 1864
1806 if (actual_bytes) 1865 if (actual_bytes)
@@ -2700,6 +2759,13 @@ again:
2700 goto again; 2759 goto again;
2701 } 2760 }
2702 2761
2762 /* We've already setup this transaction, go ahead and exit */
2763 if (block_group->cache_generation == trans->transid &&
2764 i_size_read(inode)) {
2765 dcs = BTRFS_DC_SETUP;
2766 goto out_put;
2767 }
2768
2703 /* 2769 /*
2704 * We want to set the generation to 0, that way if anything goes wrong 2770 * We want to set the generation to 0, that way if anything goes wrong
2705 * from here on out we know not to trust this cache when we load up next 2771 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2815,15 @@ again:
2749 if (!ret) 2815 if (!ret)
2750 dcs = BTRFS_DC_SETUP; 2816 dcs = BTRFS_DC_SETUP;
2751 btrfs_free_reserved_data_space(inode, num_pages); 2817 btrfs_free_reserved_data_space(inode, num_pages);
2818
2752out_put: 2819out_put:
2753 iput(inode); 2820 iput(inode);
2754out_free: 2821out_free:
2755 btrfs_release_path(path); 2822 btrfs_release_path(path);
2756out: 2823out:
2757 spin_lock(&block_group->lock); 2824 spin_lock(&block_group->lock);
2825 if (!ret)
2826 block_group->cache_generation = trans->transid;
2758 block_group->disk_cache_state = dcs; 2827 block_group->disk_cache_state = dcs;
2759 spin_unlock(&block_group->lock); 2828 spin_unlock(&block_group->lock);
2760 2829
@@ -3122,16 +3191,13 @@ commit_trans:
3122 return -ENOSPC; 3191 return -ENOSPC;
3123 } 3192 }
3124 data_sinfo->bytes_may_use += bytes; 3193 data_sinfo->bytes_may_use += bytes;
3125 BTRFS_I(inode)->reserved_bytes += bytes;
3126 spin_unlock(&data_sinfo->lock); 3194 spin_unlock(&data_sinfo->lock);
3127 3195
3128 return 0; 3196 return 0;
3129} 3197}
3130 3198
3131/* 3199/*
3132 * called when we are clearing an delalloc extent from the 3200 * Called if we need to clear a data reservation for this inode.
3133 * inode's io_tree or there was an error for whatever reason
3134 * after calling btrfs_check_data_free_space
3135 */ 3201 */
3136void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3202void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3137{ 3203{
@@ -3144,7 +3210,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3144 data_sinfo = BTRFS_I(inode)->space_info; 3210 data_sinfo = BTRFS_I(inode)->space_info;
3145 spin_lock(&data_sinfo->lock); 3211 spin_lock(&data_sinfo->lock);
3146 data_sinfo->bytes_may_use -= bytes; 3212 data_sinfo->bytes_may_use -= bytes;
3147 BTRFS_I(inode)->reserved_bytes -= bytes;
3148 spin_unlock(&data_sinfo->lock); 3213 spin_unlock(&data_sinfo->lock);
3149} 3214}
3150 3215
@@ -3165,6 +3230,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3165 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3230 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3166 int force) 3231 int force)
3167{ 3232{
3233 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3168 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3234 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3169 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3235 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3170 u64 thresh; 3236 u64 thresh;
@@ -3173,11 +3239,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
3173 return 1; 3239 return 1;
3174 3240
3175 /* 3241 /*
3242 * We need to take into account the global rsv because for all intents
3243 * and purposes it's used space. Don't worry about locking the
3244 * global_rsv, it doesn't change except when the transaction commits.
3245 */
3246 num_allocated += global_rsv->size;
3247
3248 /*
3176 * in limited mode, we want to have some free space up to 3249 * in limited mode, we want to have some free space up to
3177 * about 1% of the FS size. 3250 * about 1% of the FS size.
3178 */ 3251 */
3179 if (force == CHUNK_ALLOC_LIMITED) { 3252 if (force == CHUNK_ALLOC_LIMITED) {
3180 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3253 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3181 thresh = max_t(u64, 64 * 1024 * 1024, 3254 thresh = max_t(u64, 64 * 1024 * 1024,
3182 div_factor_fine(thresh, 1)); 3255 div_factor_fine(thresh, 1));
3183 3256
@@ -3199,7 +3272,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3199 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3272 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3200 return 0; 3273 return 0;
3201 3274
3202 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3275 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3203 3276
3204 /* 256MB or 5% of the FS */ 3277 /* 256MB or 5% of the FS */
3205 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3278 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
@@ -3302,24 +3375,26 @@ out:
3302/* 3375/*
3303 * shrink metadata reservation for delalloc 3376 * shrink metadata reservation for delalloc
3304 */ 3377 */
3305static int shrink_delalloc(struct btrfs_trans_handle *trans, 3378static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3306 struct btrfs_root *root, u64 to_reclaim, int sync) 3379 bool wait_ordered)
3307{ 3380{
3308 struct btrfs_block_rsv *block_rsv; 3381 struct btrfs_block_rsv *block_rsv;
3309 struct btrfs_space_info *space_info; 3382 struct btrfs_space_info *space_info;
3383 struct btrfs_trans_handle *trans;
3310 u64 reserved; 3384 u64 reserved;
3311 u64 max_reclaim; 3385 u64 max_reclaim;
3312 u64 reclaimed = 0; 3386 u64 reclaimed = 0;
3313 long time_left; 3387 long time_left;
3314 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3388 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3315 int loops = 0; 3389 int loops = 0;
3316 unsigned long progress; 3390 unsigned long progress;
3317 3391
3392 trans = (struct btrfs_trans_handle *)current->journal_info;
3318 block_rsv = &root->fs_info->delalloc_block_rsv; 3393 block_rsv = &root->fs_info->delalloc_block_rsv;
3319 space_info = block_rsv->space_info; 3394 space_info = block_rsv->space_info;
3320 3395
3321 smp_mb(); 3396 smp_mb();
3322 reserved = space_info->bytes_reserved; 3397 reserved = space_info->bytes_may_use;
3323 progress = space_info->reservation_progress; 3398 progress = space_info->reservation_progress;
3324 3399
3325 if (reserved == 0) 3400 if (reserved == 0)
@@ -3334,18 +3409,20 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334 } 3409 }
3335 3410
3336 max_reclaim = min(reserved, to_reclaim); 3411 max_reclaim = min(reserved, to_reclaim);
3337 3412 nr_pages = max_t(unsigned long, nr_pages,
3413 max_reclaim >> PAGE_CACHE_SHIFT);
3338 while (loops < 1024) { 3414 while (loops < 1024) {
3339 /* have the flusher threads jump in and do some IO */ 3415 /* have the flusher threads jump in and do some IO */
3340 smp_mb(); 3416 smp_mb();
3341 nr_pages = min_t(unsigned long, nr_pages, 3417 nr_pages = min_t(unsigned long, nr_pages,
3342 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT); 3418 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3343 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3419 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
3420 WB_REASON_FS_FREE_SPACE);
3344 3421
3345 spin_lock(&space_info->lock); 3422 spin_lock(&space_info->lock);
3346 if (reserved > space_info->bytes_reserved) 3423 if (reserved > space_info->bytes_may_use)
3347 reclaimed += reserved - space_info->bytes_reserved; 3424 reclaimed += reserved - space_info->bytes_may_use;
3348 reserved = space_info->bytes_reserved; 3425 reserved = space_info->bytes_may_use;
3349 spin_unlock(&space_info->lock); 3426 spin_unlock(&space_info->lock);
3350 3427
3351 loops++; 3428 loops++;
@@ -3356,11 +3433,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 if (trans && trans->transaction->blocked) 3433 if (trans && trans->transaction->blocked)
3357 return -EAGAIN; 3434 return -EAGAIN;
3358 3435
3359 time_left = schedule_timeout_interruptible(1); 3436 if (wait_ordered && !trans) {
3437 btrfs_wait_ordered_extents(root, 0, 0);
3438 } else {
3439 time_left = schedule_timeout_interruptible(1);
3360 3440
3361 /* We were interrupted, exit */ 3441 /* We were interrupted, exit */
3362 if (time_left) 3442 if (time_left)
3363 break; 3443 break;
3444 }
3364 3445
3365 /* we've kicked the IO a few times, if anything has been freed, 3446 /* we've kicked the IO a few times, if anything has been freed,
3366 * exit. There is no sense in looping here for a long time 3447 * exit. There is no sense in looping here for a long time
@@ -3375,34 +3456,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3375 } 3456 }
3376 3457
3377 } 3458 }
3378 if (reclaimed >= to_reclaim && !trans) 3459
3379 btrfs_wait_ordered_extents(root, 0, 0);
3380 return reclaimed >= to_reclaim; 3460 return reclaimed >= to_reclaim;
3381} 3461}
3382 3462
3383/* 3463/**
3384 * Retries tells us how many times we've called reserve_metadata_bytes. The 3464 * may_commit_transaction - possibly commit the transaction if it's ok to
3385 * idea is if this is the first call (retries == 0) then we will add to our 3465 * @root - the root we're allocating for
3386 * reserved count if we can't make the allocation in order to hold our place 3466 * @bytes - the number of bytes we want to reserve
3387 * while we go and try and free up space. That way for retries > 1 we don't try 3467 * @force - force the commit
3388 * and add space, we just check to see if the amount of unused space is >= the
3389 * total space, meaning that our reservation is valid.
3390 * 3468 *
3391 * However if we don't intend to retry this reservation, pass -1 as retries so 3469 * This will check to make sure that committing the transaction will actually
3392 * that it short circuits this logic. 3470 * get us somewhere and then commit the transaction if it does. Otherwise it
3471 * will return -ENOSPC.
3393 */ 3472 */
3394static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, 3473static int may_commit_transaction(struct btrfs_root *root,
3395 struct btrfs_root *root, 3474 struct btrfs_space_info *space_info,
3475 u64 bytes, int force)
3476{
3477 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3478 struct btrfs_trans_handle *trans;
3479
3480 trans = (struct btrfs_trans_handle *)current->journal_info;
3481 if (trans)
3482 return -EAGAIN;
3483
3484 if (force)
3485 goto commit;
3486
3487 /* See if there is enough pinned space to make this reservation */
3488 spin_lock(&space_info->lock);
3489 if (space_info->bytes_pinned >= bytes) {
3490 spin_unlock(&space_info->lock);
3491 goto commit;
3492 }
3493 spin_unlock(&space_info->lock);
3494
3495 /*
3496 * See if there is some space in the delayed insertion reservation for
3497 * this reservation.
3498 */
3499 if (space_info != delayed_rsv->space_info)
3500 return -ENOSPC;
3501
3502 spin_lock(&delayed_rsv->lock);
3503 if (delayed_rsv->size < bytes) {
3504 spin_unlock(&delayed_rsv->lock);
3505 return -ENOSPC;
3506 }
3507 spin_unlock(&delayed_rsv->lock);
3508
3509commit:
3510 trans = btrfs_join_transaction(root);
3511 if (IS_ERR(trans))
3512 return -ENOSPC;
3513
3514 return btrfs_commit_transaction(trans, root);
3515}
3516
3517/**
3518 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3519 * @root - the root we're allocating for
3520 * @block_rsv - the block_rsv we're allocating for
3521 * @orig_bytes - the number of bytes we want
3522 * @flush - whether or not we can flush to make our reservation
3523 *
3524 * This will reserve orig_bytes number of bytes from the space info associated
3525 * with the block_rsv. If there is not enough space it will make an attempt to
3526 * flush out space to make room. It will do this by flushing delalloc if
3527 * possible or committing the transaction. If flush is 0 then no attempts to
3528 * regain reservations will be made and this will fail if there is not enough
3529 * space already.
3530 */
3531static int reserve_metadata_bytes(struct btrfs_root *root,
3396 struct btrfs_block_rsv *block_rsv, 3532 struct btrfs_block_rsv *block_rsv,
3397 u64 orig_bytes, int flush) 3533 u64 orig_bytes, int flush)
3398{ 3534{
3399 struct btrfs_space_info *space_info = block_rsv->space_info; 3535 struct btrfs_space_info *space_info = block_rsv->space_info;
3400 u64 unused; 3536 u64 used;
3401 u64 num_bytes = orig_bytes; 3537 u64 num_bytes = orig_bytes;
3402 int retries = 0; 3538 int retries = 0;
3403 int ret = 0; 3539 int ret = 0;
3404 bool committed = false; 3540 bool committed = false;
3405 bool flushing = false; 3541 bool flushing = false;
3542 bool wait_ordered = false;
3406 3543
3407again: 3544again:
3408 ret = 0; 3545 ret = 0;
@@ -3419,7 +3556,7 @@ again:
3419 * deadlock since we are waiting for the flusher to finish, but 3556 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open. 3557 * hold the current transaction open.
3421 */ 3558 */
3422 if (trans) 3559 if (current->journal_info)
3423 return -EAGAIN; 3560 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait, 3561 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush); 3562 !space_info->flush);
@@ -3431,9 +3568,9 @@ again:
3431 } 3568 }
3432 3569
3433 ret = -ENOSPC; 3570 ret = -ENOSPC;
3434 unused = space_info->bytes_used + space_info->bytes_reserved + 3571 used = space_info->bytes_used + space_info->bytes_reserved +
3435 space_info->bytes_pinned + space_info->bytes_readonly + 3572 space_info->bytes_pinned + space_info->bytes_readonly +
3436 space_info->bytes_may_use; 3573 space_info->bytes_may_use;
3437 3574
3438 /* 3575 /*
3439 * The idea here is that we've not already over-reserved the block group 3576 * The idea here is that we've not already over-reserved the block group
@@ -3442,10 +3579,9 @@ again:
3442 * lets start flushing stuff first and then come back and try to make 3579 * lets start flushing stuff first and then come back and try to make
3443 * our reservation. 3580 * our reservation.
3444 */ 3581 */
3445 if (unused <= space_info->total_bytes) { 3582 if (used <= space_info->total_bytes) {
3446 unused = space_info->total_bytes - unused; 3583 if (used + orig_bytes <= space_info->total_bytes) {
3447 if (unused >= num_bytes) { 3584 space_info->bytes_may_use += orig_bytes;
3448 space_info->bytes_reserved += orig_bytes;
3449 ret = 0; 3585 ret = 0;
3450 } else { 3586 } else {
3451 /* 3587 /*
@@ -3461,10 +3597,64 @@ again:
3461 * amount plus the amount of bytes that we need for this 3597 * amount plus the amount of bytes that we need for this
3462 * reservation. 3598 * reservation.
3463 */ 3599 */
3464 num_bytes = unused - space_info->total_bytes + 3600 wait_ordered = true;
3601 num_bytes = used - space_info->total_bytes +
3465 (orig_bytes * (retries + 1)); 3602 (orig_bytes * (retries + 1));
3466 } 3603 }
3467 3604
3605 if (ret) {
3606 u64 profile = btrfs_get_alloc_profile(root, 0);
3607 u64 avail;
3608
3609 /*
3610 * If we have a lot of space that's pinned, don't bother doing
3611 * the overcommit dance yet and just commit the transaction.
3612 */
3613 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3614 do_div(avail, 10);
3615 if (space_info->bytes_pinned >= avail && flush && !committed) {
3616 space_info->flush = 1;
3617 flushing = true;
3618 spin_unlock(&space_info->lock);
3619 ret = may_commit_transaction(root, space_info,
3620 orig_bytes, 1);
3621 if (ret)
3622 goto out;
3623 committed = true;
3624 goto again;
3625 }
3626
3627 spin_lock(&root->fs_info->free_chunk_lock);
3628 avail = root->fs_info->free_chunk_space;
3629
3630 /*
3631 * If we have dup, raid1 or raid10 then only half of the free
3632 * space is actually useable.
3633 */
3634 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3635 BTRFS_BLOCK_GROUP_RAID1 |
3636 BTRFS_BLOCK_GROUP_RAID10))
3637 avail >>= 1;
3638
3639 /*
3640 * If we can flush, don't let us overcommit too much, say 1/8th
3641 * of the space. If we aren't flushing, let it overcommit up to
3642 * 1/2 of the space.
3643 */
3644 if (flush)
3645 avail >>= 3;
3646 else
3647 avail >>= 1;
3648 spin_unlock(&root->fs_info->free_chunk_lock);
3649
3650 if (used + num_bytes < space_info->total_bytes + avail) {
3651 space_info->bytes_may_use += orig_bytes;
3652 ret = 0;
3653 } else {
3654 wait_ordered = true;
3655 }
3656 }
3657
3468 /* 3658 /*
3469 * Couldn't make our reservation, save our place so while we're trying 3659 * Couldn't make our reservation, save our place so while we're trying
3470 * to reclaim space we can actually use it instead of somebody else 3660 * to reclaim space we can actually use it instead of somebody else
@@ -3484,7 +3674,7 @@ again:
3484 * We do synchronous shrinking since we don't actually unreserve 3674 * We do synchronous shrinking since we don't actually unreserve
3485 * metadata until after the IO is completed. 3675 * metadata until after the IO is completed.
3486 */ 3676 */
3487 ret = shrink_delalloc(trans, root, num_bytes, 1); 3677 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3488 if (ret < 0) 3678 if (ret < 0)
3489 goto out; 3679 goto out;
3490 3680
@@ -3496,35 +3686,17 @@ again:
3496 * so go back around and try again. 3686 * so go back around and try again.
3497 */ 3687 */
3498 if (retries < 2) { 3688 if (retries < 2) {
3689 wait_ordered = true;
3499 retries++; 3690 retries++;
3500 goto again; 3691 goto again;
3501 } 3692 }
3502 3693
3503 /*
3504 * Not enough space to be reclaimed, don't bother committing the
3505 * transaction.
3506 */
3507 spin_lock(&space_info->lock);
3508 if (space_info->bytes_pinned < orig_bytes)
3509 ret = -ENOSPC;
3510 spin_unlock(&space_info->lock);
3511 if (ret)
3512 goto out;
3513
3514 ret = -EAGAIN;
3515 if (trans)
3516 goto out;
3517
3518 ret = -ENOSPC; 3694 ret = -ENOSPC;
3519 if (committed) 3695 if (committed)
3520 goto out; 3696 goto out;
3521 3697
3522 trans = btrfs_join_transaction(root); 3698 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3523 if (IS_ERR(trans))
3524 goto out;
3525 ret = btrfs_commit_transaction(trans, root);
3526 if (!ret) { 3699 if (!ret) {
3527 trans = NULL;
3528 committed = true; 3700 committed = true;
3529 goto again; 3701 goto again;
3530 } 3702 }
@@ -3542,10 +3714,12 @@ out:
3542static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3714static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3543 struct btrfs_root *root) 3715 struct btrfs_root *root)
3544{ 3716{
3545 struct btrfs_block_rsv *block_rsv; 3717 struct btrfs_block_rsv *block_rsv = NULL;
3546 if (root->ref_cows) 3718
3719 if (root->ref_cows || root == root->fs_info->csum_root)
3547 block_rsv = trans->block_rsv; 3720 block_rsv = trans->block_rsv;
3548 else 3721
3722 if (!block_rsv)
3549 block_rsv = root->block_rsv; 3723 block_rsv = root->block_rsv;
3550 3724
3551 if (!block_rsv) 3725 if (!block_rsv)
@@ -3616,7 +3790,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3616 } 3790 }
3617 if (num_bytes) { 3791 if (num_bytes) {
3618 spin_lock(&space_info->lock); 3792 spin_lock(&space_info->lock);
3619 space_info->bytes_reserved -= num_bytes; 3793 space_info->bytes_may_use -= num_bytes;
3620 space_info->reservation_progress++; 3794 space_info->reservation_progress++;
3621 spin_unlock(&space_info->lock); 3795 spin_unlock(&space_info->lock);
3622 } 3796 }
@@ -3640,9 +3814,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3640{ 3814{
3641 memset(rsv, 0, sizeof(*rsv)); 3815 memset(rsv, 0, sizeof(*rsv));
3642 spin_lock_init(&rsv->lock); 3816 spin_lock_init(&rsv->lock);
3643 atomic_set(&rsv->usage, 1);
3644 rsv->priority = 6;
3645 INIT_LIST_HEAD(&rsv->list);
3646} 3817}
3647 3818
3648struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 3819struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3663,38 +3834,20 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3663void btrfs_free_block_rsv(struct btrfs_root *root, 3834void btrfs_free_block_rsv(struct btrfs_root *root,
3664 struct btrfs_block_rsv *rsv) 3835 struct btrfs_block_rsv *rsv)
3665{ 3836{
3666 if (rsv && atomic_dec_and_test(&rsv->usage)) { 3837 btrfs_block_rsv_release(root, rsv, (u64)-1);
3667 btrfs_block_rsv_release(root, rsv, (u64)-1); 3838 kfree(rsv);
3668 if (!rsv->durable)
3669 kfree(rsv);
3670 }
3671} 3839}
3672 3840
3673/* 3841static inline int __block_rsv_add(struct btrfs_root *root,
3674 * make the block_rsv struct be able to capture freed space. 3842 struct btrfs_block_rsv *block_rsv,
3675 * the captured space will re-add to the the block_rsv struct 3843 u64 num_bytes, int flush)
3676 * after transaction commit
3677 */
3678void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3679 struct btrfs_block_rsv *block_rsv)
3680{
3681 block_rsv->durable = 1;
3682 mutex_lock(&fs_info->durable_block_rsv_mutex);
3683 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3684 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3685}
3686
3687int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3688 struct btrfs_root *root,
3689 struct btrfs_block_rsv *block_rsv,
3690 u64 num_bytes)
3691{ 3844{
3692 int ret; 3845 int ret;
3693 3846
3694 if (num_bytes == 0) 3847 if (num_bytes == 0)
3695 return 0; 3848 return 0;
3696 3849
3697 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); 3850 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3698 if (!ret) { 3851 if (!ret) {
3699 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3852 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3700 return 0; 3853 return 0;
@@ -3703,55 +3856,80 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3703 return ret; 3856 return ret;
3704} 3857}
3705 3858
3706int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 3859int btrfs_block_rsv_add(struct btrfs_root *root,
3707 struct btrfs_root *root, 3860 struct btrfs_block_rsv *block_rsv,
3708 struct btrfs_block_rsv *block_rsv, 3861 u64 num_bytes)
3709 u64 min_reserved, int min_factor) 3862{
3863 return __block_rsv_add(root, block_rsv, num_bytes, 1);
3864}
3865
3866int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3867 struct btrfs_block_rsv *block_rsv,
3868 u64 num_bytes)
3869{
3870 return __block_rsv_add(root, block_rsv, num_bytes, 0);
3871}
3872
3873int btrfs_block_rsv_check(struct btrfs_root *root,
3874 struct btrfs_block_rsv *block_rsv, int min_factor)
3710{ 3875{
3711 u64 num_bytes = 0; 3876 u64 num_bytes = 0;
3712 int commit_trans = 0;
3713 int ret = -ENOSPC; 3877 int ret = -ENOSPC;
3714 3878
3715 if (!block_rsv) 3879 if (!block_rsv)
3716 return 0; 3880 return 0;
3717 3881
3718 spin_lock(&block_rsv->lock); 3882 spin_lock(&block_rsv->lock);
3719 if (min_factor > 0) 3883 num_bytes = div_factor(block_rsv->size, min_factor);
3720 num_bytes = div_factor(block_rsv->size, min_factor); 3884 if (block_rsv->reserved >= num_bytes)
3721 if (min_reserved > num_bytes) 3885 ret = 0;
3722 num_bytes = min_reserved; 3886 spin_unlock(&block_rsv->lock);
3723 3887
3724 if (block_rsv->reserved >= num_bytes) { 3888 return ret;
3889}
3890
3891static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
3892 struct btrfs_block_rsv *block_rsv,
3893 u64 min_reserved, int flush)
3894{
3895 u64 num_bytes = 0;
3896 int ret = -ENOSPC;
3897
3898 if (!block_rsv)
3899 return 0;
3900
3901 spin_lock(&block_rsv->lock);
3902 num_bytes = min_reserved;
3903 if (block_rsv->reserved >= num_bytes)
3725 ret = 0; 3904 ret = 0;
3726 } else { 3905 else
3727 num_bytes -= block_rsv->reserved; 3906 num_bytes -= block_rsv->reserved;
3728 if (block_rsv->durable &&
3729 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3730 commit_trans = 1;
3731 }
3732 spin_unlock(&block_rsv->lock); 3907 spin_unlock(&block_rsv->lock);
3908
3733 if (!ret) 3909 if (!ret)
3734 return 0; 3910 return 0;
3735 3911
3736 if (block_rsv->refill_used) { 3912 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
3737 ret = reserve_metadata_bytes(trans, root, block_rsv, 3913 if (!ret) {
3738 num_bytes, 0); 3914 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3739 if (!ret) {
3740 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3741 return 0;
3742 }
3743 }
3744
3745 if (commit_trans) {
3746 if (trans)
3747 return -EAGAIN;
3748 trans = btrfs_join_transaction(root);
3749 BUG_ON(IS_ERR(trans));
3750 ret = btrfs_commit_transaction(trans, root);
3751 return 0; 3915 return 0;
3752 } 3916 }
3753 3917
3754 return -ENOSPC; 3918 return ret;
3919}
3920
3921int btrfs_block_rsv_refill(struct btrfs_root *root,
3922 struct btrfs_block_rsv *block_rsv,
3923 u64 min_reserved)
3924{
3925 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
3926}
3927
3928int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
3929 struct btrfs_block_rsv *block_rsv,
3930 u64 min_reserved)
3931{
3932 return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
3755} 3933}
3756 3934
3757int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3935int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@ -3783,7 +3961,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3783 u64 num_bytes; 3961 u64 num_bytes;
3784 u64 meta_used; 3962 u64 meta_used;
3785 u64 data_used; 3963 u64 data_used;
3786 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3964 int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3787 3965
3788 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3966 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3789 spin_lock(&sinfo->lock); 3967 spin_lock(&sinfo->lock);
@@ -3827,12 +4005,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3827 if (sinfo->total_bytes > num_bytes) { 4005 if (sinfo->total_bytes > num_bytes) {
3828 num_bytes = sinfo->total_bytes - num_bytes; 4006 num_bytes = sinfo->total_bytes - num_bytes;
3829 block_rsv->reserved += num_bytes; 4007 block_rsv->reserved += num_bytes;
3830 sinfo->bytes_reserved += num_bytes; 4008 sinfo->bytes_may_use += num_bytes;
3831 } 4009 }
3832 4010
3833 if (block_rsv->reserved >= block_rsv->size) { 4011 if (block_rsv->reserved >= block_rsv->size) {
3834 num_bytes = block_rsv->reserved - block_rsv->size; 4012 num_bytes = block_rsv->reserved - block_rsv->size;
3835 sinfo->bytes_reserved -= num_bytes; 4013 sinfo->bytes_may_use -= num_bytes;
3836 sinfo->reservation_progress++; 4014 sinfo->reservation_progress++;
3837 block_rsv->reserved = block_rsv->size; 4015 block_rsv->reserved = block_rsv->size;
3838 block_rsv->full = 1; 4016 block_rsv->full = 1;
@@ -3848,16 +4026,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3848 4026
3849 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 4027 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3850 fs_info->chunk_block_rsv.space_info = space_info; 4028 fs_info->chunk_block_rsv.space_info = space_info;
3851 fs_info->chunk_block_rsv.priority = 10;
3852 4029
3853 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 4030 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3854 fs_info->global_block_rsv.space_info = space_info; 4031 fs_info->global_block_rsv.space_info = space_info;
3855 fs_info->global_block_rsv.priority = 10;
3856 fs_info->global_block_rsv.refill_used = 1;
3857 fs_info->delalloc_block_rsv.space_info = space_info; 4032 fs_info->delalloc_block_rsv.space_info = space_info;
3858 fs_info->trans_block_rsv.space_info = space_info; 4033 fs_info->trans_block_rsv.space_info = space_info;
3859 fs_info->empty_block_rsv.space_info = space_info; 4034 fs_info->empty_block_rsv.space_info = space_info;
3860 fs_info->empty_block_rsv.priority = 10; 4035 fs_info->delayed_block_rsv.space_info = space_info;
3861 4036
3862 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 4037 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3863 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 4038 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3865,10 +4040,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3865 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 4040 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3866 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 4041 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3867 4042
3868 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3869
3870 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3871
3872 update_global_block_rsv(fs_info); 4043 update_global_block_rsv(fs_info);
3873} 4044}
3874 4045
@@ -3881,37 +4052,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3881 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4052 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3882 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4053 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3883 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4054 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3884} 4055 WARN_ON(fs_info->delayed_block_rsv.size > 0);
3885 4056 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
3886int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3887 struct btrfs_root *root,
3888 struct btrfs_block_rsv *rsv)
3889{
3890 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3891 u64 num_bytes;
3892 int ret;
3893
3894 /*
3895 * Truncate should be freeing data, but give us 2 items just in case it
3896 * needs to use some space. We may want to be smarter about this in the
3897 * future.
3898 */
3899 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3900
3901 /* We already have enough bytes, just return */
3902 if (rsv->reserved >= num_bytes)
3903 return 0;
3904
3905 num_bytes -= rsv->reserved;
3906
3907 /*
3908 * You should have reserved enough space before hand to do this, so this
3909 * should not fail.
3910 */
3911 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3912 BUG_ON(ret);
3913
3914 return 0;
3915} 4057}
3916 4058
3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4059void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3920,9 +4062,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3920 if (!trans->bytes_reserved) 4062 if (!trans->bytes_reserved)
3921 return; 4063 return;
3922 4064
3923 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); 4065 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
3924 btrfs_block_rsv_release(root, trans->block_rsv,
3925 trans->bytes_reserved);
3926 trans->bytes_reserved = 0; 4066 trans->bytes_reserved = 0;
3927} 4067}
3928 4068
@@ -3964,33 +4104,99 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4104 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3965} 4105}
3966 4106
4107/**
4108 * drop_outstanding_extent - drop an outstanding extent
4109 * @inode: the inode we're dropping the extent for
4110 *
4111 * This is called when we are freeing up an outstanding extent, either called
4112 * after an error or after an extent is written. This will return the number of
4113 * reserved extents that need to be freed. This must be called with
4114 * BTRFS_I(inode)->lock held.
4115 */
3967static unsigned drop_outstanding_extent(struct inode *inode) 4116static unsigned drop_outstanding_extent(struct inode *inode)
3968{ 4117{
4118 unsigned drop_inode_space = 0;
3969 unsigned dropped_extents = 0; 4119 unsigned dropped_extents = 0;
3970 4120
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4121 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--; 4122 BTRFS_I(inode)->outstanding_extents--;
3974 4123
4124 if (BTRFS_I(inode)->outstanding_extents == 0 &&
4125 BTRFS_I(inode)->delalloc_meta_reserved) {
4126 drop_inode_space = 1;
4127 BTRFS_I(inode)->delalloc_meta_reserved = 0;
4128 }
4129
3975 /* 4130 /*
3976 * If we have more or the same amount of outstanding extents than we have 4131 * If we have more or the same amount of outstanding extents than we have
3977 * reserved then we need to leave the reserved extents count alone. 4132 * reserved then we need to leave the reserved extents count alone.
3978 */ 4133 */
3979 if (BTRFS_I(inode)->outstanding_extents >= 4134 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents) 4135 BTRFS_I(inode)->reserved_extents)
3981 goto out; 4136 return drop_inode_space;
3982 4137
3983 dropped_extents = BTRFS_I(inode)->reserved_extents - 4138 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents; 4139 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4140 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out: 4141 return dropped_extents + drop_inode_space;
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents;
3989} 4142}
3990 4143
3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 4144/**
4145 * calc_csum_metadata_size - return the amount of metadata space that must be
4146 * reserved/free'd for the given bytes.
4147 * @inode: the inode we're manipulating
4148 * @num_bytes: the number of bytes in question
4149 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4150 *
4151 * This adjusts the number of csum_bytes in the inode and then returns the
4152 * correct amount of metadata that must either be reserved or freed. We
4153 * calculate how many checksums we can fit into one leaf and then divide the
4154 * number of bytes that will need to be checksummed by this value to figure out
4155 * how many checksums will be required. If we are adding bytes then the number
4156 * may go up and we will return the number of additional bytes that must be
4157 * reserved. If it is going down we will return the number of bytes that must
4158 * be freed.
4159 *
4160 * This must be called with BTRFS_I(inode)->lock held.
4161 */
4162static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4163 int reserve)
3992{ 4164{
3993 return num_bytes >>= 3; 4165 struct btrfs_root *root = BTRFS_I(inode)->root;
4166 u64 csum_size;
4167 int num_csums_per_leaf;
4168 int num_csums;
4169 int old_csums;
4170
4171 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4172 BTRFS_I(inode)->csum_bytes == 0)
4173 return 0;
4174
4175 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4176 if (reserve)
4177 BTRFS_I(inode)->csum_bytes += num_bytes;
4178 else
4179 BTRFS_I(inode)->csum_bytes -= num_bytes;
4180 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4181 num_csums_per_leaf = (int)div64_u64(csum_size,
4182 sizeof(struct btrfs_csum_item) +
4183 sizeof(struct btrfs_disk_key));
4184 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4185 num_csums = num_csums + num_csums_per_leaf - 1;
4186 num_csums = num_csums / num_csums_per_leaf;
4187
4188 old_csums = old_csums + num_csums_per_leaf - 1;
4189 old_csums = old_csums / num_csums_per_leaf;
4190
4191 /* No change, no need to reserve more */
4192 if (old_csums == num_csums)
4193 return 0;
4194
4195 if (reserve)
4196 return btrfs_calc_trans_metadata_size(root,
4197 num_csums - old_csums);
4198
4199 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
3994} 4200}
3995 4201
3996int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4202int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
@@ -3999,9 +4205,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4205 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4000 u64 to_reserve = 0; 4206 u64 to_reserve = 0;
4001 unsigned nr_extents = 0; 4207 unsigned nr_extents = 0;
4208 int flush = 1;
4002 int ret; 4209 int ret;
4003 4210
4004 if (btrfs_transaction_in_commit(root->fs_info)) 4211 if (btrfs_is_free_space_inode(root, inode))
4212 flush = 0;
4213
4214 if (flush && btrfs_transaction_in_commit(root->fs_info))
4005 schedule_timeout(1); 4215 schedule_timeout(1);
4006 4216
4007 num_bytes = ALIGN(num_bytes, root->sectorsize); 4217 num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4014,21 +4224,41 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4014 nr_extents = BTRFS_I(inode)->outstanding_extents - 4224 nr_extents = BTRFS_I(inode)->outstanding_extents -
4015 BTRFS_I(inode)->reserved_extents; 4225 BTRFS_I(inode)->reserved_extents;
4016 BTRFS_I(inode)->reserved_extents += nr_extents; 4226 BTRFS_I(inode)->reserved_extents += nr_extents;
4227 }
4017 4228
4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4229 /*
4230 * Add an item to reserve for updating the inode when we complete the
4231 * delalloc io.
4232 */
4233 if (!BTRFS_I(inode)->delalloc_meta_reserved) {
4234 nr_extents++;
4235 BTRFS_I(inode)->delalloc_meta_reserved = 1;
4019 } 4236 }
4237
4238 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4239 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4020 spin_unlock(&BTRFS_I(inode)->lock); 4240 spin_unlock(&BTRFS_I(inode)->lock);
4021 4241
4022 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4242 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4024 if (ret) { 4243 if (ret) {
4244 u64 to_free = 0;
4025 unsigned dropped; 4245 unsigned dropped;
4246
4247 spin_lock(&BTRFS_I(inode)->lock);
4248 dropped = drop_outstanding_extent(inode);
4249 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4250 spin_unlock(&BTRFS_I(inode)->lock);
4251 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4252
4026 /* 4253 /*
4027 * We don't need the return value since our reservation failed, 4254 * Somebody could have come in and twiddled with the
4028 * we just need to clean up our counter. 4255 * reservation, so if we have to free more than we would have
4256 * reserved from this reservation go ahead and release those
4257 * bytes.
4029 */ 4258 */
4030 dropped = drop_outstanding_extent(inode); 4259 to_free -= to_reserve;
4031 WARN_ON(dropped > 1); 4260 if (to_free)
4261 btrfs_block_rsv_release(root, block_rsv, to_free);
4032 return ret; 4262 return ret;
4033 } 4263 }
4034 4264
@@ -4037,6 +4267,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4037 return 0; 4267 return 0;
4038} 4268}
4039 4269
4270/**
4271 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4272 * @inode: the inode to release the reservation for
4273 * @num_bytes: the number of bytes we're releasing
4274 *
4275 * This will release the metadata reservation for an inode. This can be called
4276 * once we complete IO for a given set of bytes to release their metadata
4277 * reservations.
4278 */
4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4279void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4041{ 4280{
4042 struct btrfs_root *root = BTRFS_I(inode)->root; 4281 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4044,9 +4283,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4044 unsigned dropped; 4283 unsigned dropped;
4045 4284
4046 num_bytes = ALIGN(num_bytes, root->sectorsize); 4285 num_bytes = ALIGN(num_bytes, root->sectorsize);
4286 spin_lock(&BTRFS_I(inode)->lock);
4047 dropped = drop_outstanding_extent(inode); 4287 dropped = drop_outstanding_extent(inode);
4048 4288
4049 to_free = calc_csum_metadata_size(inode, num_bytes); 4289 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4290 spin_unlock(&BTRFS_I(inode)->lock);
4050 if (dropped > 0) 4291 if (dropped > 0)
4051 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4292 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4052 4293
@@ -4054,6 +4295,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4054 to_free); 4295 to_free);
4055} 4296}
4056 4297
4298/**
4299 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4300 * @inode: inode we're writing to
4301 * @num_bytes: the number of bytes we want to allocate
4302 *
4303 * This will do the following things
4304 *
4305 * o reserve space in the data space info for num_bytes
4306 * o reserve space in the metadata space info based on number of outstanding
4307 * extents and how many csums will be needed
4308 * o add to the inodes ->delalloc_bytes
4309 * o add it to the fs_info's delalloc inodes list.
4310 *
4311 * This will return 0 for success and -ENOSPC if there is no space left.
4312 */
4057int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4313int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4058{ 4314{
4059 int ret; 4315 int ret;
@@ -4071,6 +4327,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4071 return 0; 4327 return 0;
4072} 4328}
4073 4329
4330/**
4331 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4332 * @inode: inode we're releasing space for
4333 * @num_bytes: the number of bytes we want to free up
4334 *
4335 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
4336 * called in the case that we don't need the metadata AND data reservations
4337 * anymore. So if there is an error or we insert an inline extent.
4338 *
4339 * This function will release the metadata space that was not used and will
4340 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4341 * list if there are no delalloc bytes left.
4342 */
4074void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4343void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4075{ 4344{
4076 btrfs_delalloc_release_metadata(inode, num_bytes); 4345 btrfs_delalloc_release_metadata(inode, num_bytes);
@@ -4090,12 +4359,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4090 4359
4091 /* block accounting for super block */ 4360 /* block accounting for super block */
4092 spin_lock(&info->delalloc_lock); 4361 spin_lock(&info->delalloc_lock);
4093 old_val = btrfs_super_bytes_used(&info->super_copy); 4362 old_val = btrfs_super_bytes_used(info->super_copy);
4094 if (alloc) 4363 if (alloc)
4095 old_val += num_bytes; 4364 old_val += num_bytes;
4096 else 4365 else
4097 old_val -= num_bytes; 4366 old_val -= num_bytes;
4098 btrfs_set_super_bytes_used(&info->super_copy, old_val); 4367 btrfs_set_super_bytes_used(info->super_copy, old_val);
4099 spin_unlock(&info->delalloc_lock); 4368 spin_unlock(&info->delalloc_lock);
4100 4369
4101 while (total) { 4370 while (total) {
@@ -4123,7 +4392,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4123 spin_lock(&cache->space_info->lock); 4392 spin_lock(&cache->space_info->lock);
4124 spin_lock(&cache->lock); 4393 spin_lock(&cache->lock);
4125 4394
4126 if (btrfs_super_cache_generation(&info->super_copy) != 0 && 4395 if (btrfs_test_opt(root, SPACE_CACHE) &&
4127 cache->disk_cache_state < BTRFS_DC_CLEAR) 4396 cache->disk_cache_state < BTRFS_DC_CLEAR)
4128 cache->disk_cache_state = BTRFS_DC_CLEAR; 4397 cache->disk_cache_state = BTRFS_DC_CLEAR;
4129 4398
@@ -4135,7 +4404,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4135 btrfs_set_block_group_used(&cache->item, old_val); 4404 btrfs_set_block_group_used(&cache->item, old_val);
4136 cache->reserved -= num_bytes; 4405 cache->reserved -= num_bytes;
4137 cache->space_info->bytes_reserved -= num_bytes; 4406 cache->space_info->bytes_reserved -= num_bytes;
4138 cache->space_info->reservation_progress++;
4139 cache->space_info->bytes_used += num_bytes; 4407 cache->space_info->bytes_used += num_bytes;
4140 cache->space_info->disk_used += num_bytes * factor; 4408 cache->space_info->disk_used += num_bytes * factor;
4141 spin_unlock(&cache->lock); 4409 spin_unlock(&cache->lock);
@@ -4187,7 +4455,6 @@ static int pin_down_extent(struct btrfs_root *root,
4187 if (reserved) { 4455 if (reserved) {
4188 cache->reserved -= num_bytes; 4456 cache->reserved -= num_bytes;
4189 cache->space_info->bytes_reserved -= num_bytes; 4457 cache->space_info->bytes_reserved -= num_bytes;
4190 cache->space_info->reservation_progress++;
4191 } 4458 }
4192 spin_unlock(&cache->lock); 4459 spin_unlock(&cache->lock);
4193 spin_unlock(&cache->space_info->lock); 4460 spin_unlock(&cache->space_info->lock);
@@ -4215,45 +4482,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
4215} 4482}
4216 4483
4217/* 4484/*
4218 * update size of reserved extents. this function may return -EAGAIN 4485 * this function must be called within transaction
4219 * if 'reserve' is true or 'sinfo' is false.
4220 */ 4486 */
4221int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4487int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4222 u64 num_bytes, int reserve, int sinfo) 4488 struct btrfs_root *root,
4489 u64 bytenr, u64 num_bytes)
4223{ 4490{
4491 struct btrfs_block_group_cache *cache;
4492
4493 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4494 BUG_ON(!cache);
4495
4496 /*
4497 * pull in the free space cache (if any) so that our pin
4498 * removes the free space from the cache. We have load_only set
4499 * to one because the slow code to read in the free extents does check
4500 * the pinned extents.
4501 */
4502 cache_block_group(cache, trans, root, 1);
4503
4504 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4505
4506 /* remove us from the free space cache (if we're there at all) */
4507 btrfs_remove_free_space(cache, bytenr, num_bytes);
4508 btrfs_put_block_group(cache);
4509 return 0;
4510}
4511
4512/**
4513 * btrfs_update_reserved_bytes - update the block_group and space info counters
4514 * @cache: The cache we are manipulating
4515 * @num_bytes: The number of bytes in question
4516 * @reserve: One of the reservation enums
4517 *
4518 * This is called by the allocator when it reserves space, or by somebody who is
4519 * freeing space that was never actually used on disk. For example if you
4520 * reserve some space for a new leaf in transaction A and before transaction A
4521 * commits you free that leaf, you call this with reserve set to RESERVE_FREE
4522 * in order to clear the reservation.
4523 *
4524 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4525 * ENOSPC accounting. For data we handle the reservation through clearing the
4526 * delalloc bits in the io_tree. We have to do this since we could end up
4527 * allocating less disk space for the amount of data we have reserved in the
4528 * case of compression.
4529 *
4530 * If this is a reservation and the block group has become read only we cannot
4531 * make the reservation and return -EAGAIN, otherwise this function always
4532 * succeeds.
4533 */
4534static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4535 u64 num_bytes, int reserve)
4536{
4537 struct btrfs_space_info *space_info = cache->space_info;
4224 int ret = 0; 4538 int ret = 0;
4225 if (sinfo) { 4539 spin_lock(&space_info->lock);
4226 struct btrfs_space_info *space_info = cache->space_info; 4540 spin_lock(&cache->lock);
4227 spin_lock(&space_info->lock); 4541 if (reserve != RESERVE_FREE) {
4228 spin_lock(&cache->lock);
4229 if (reserve) {
4230 if (cache->ro) {
4231 ret = -EAGAIN;
4232 } else {
4233 cache->reserved += num_bytes;
4234 space_info->bytes_reserved += num_bytes;
4235 }
4236 } else {
4237 if (cache->ro)
4238 space_info->bytes_readonly += num_bytes;
4239 cache->reserved -= num_bytes;
4240 space_info->bytes_reserved -= num_bytes;
4241 space_info->reservation_progress++;
4242 }
4243 spin_unlock(&cache->lock);
4244 spin_unlock(&space_info->lock);
4245 } else {
4246 spin_lock(&cache->lock);
4247 if (cache->ro) { 4542 if (cache->ro) {
4248 ret = -EAGAIN; 4543 ret = -EAGAIN;
4249 } else { 4544 } else {
4250 if (reserve) 4545 cache->reserved += num_bytes;
4251 cache->reserved += num_bytes; 4546 space_info->bytes_reserved += num_bytes;
4252 else 4547 if (reserve == RESERVE_ALLOC) {
4253 cache->reserved -= num_bytes; 4548 BUG_ON(space_info->bytes_may_use < num_bytes);
4549 space_info->bytes_may_use -= num_bytes;
4550 }
4254 } 4551 }
4255 spin_unlock(&cache->lock); 4552 } else {
4553 if (cache->ro)
4554 space_info->bytes_readonly += num_bytes;
4555 cache->reserved -= num_bytes;
4556 space_info->bytes_reserved -= num_bytes;
4557 space_info->reservation_progress++;
4256 } 4558 }
4559 spin_unlock(&cache->lock);
4560 spin_unlock(&space_info->lock);
4257 return ret; 4561 return ret;
4258} 4562}
4259 4563
@@ -4319,13 +4623,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4319 spin_lock(&cache->lock); 4623 spin_lock(&cache->lock);
4320 cache->pinned -= len; 4624 cache->pinned -= len;
4321 cache->space_info->bytes_pinned -= len; 4625 cache->space_info->bytes_pinned -= len;
4322 if (cache->ro) { 4626 if (cache->ro)
4323 cache->space_info->bytes_readonly += len; 4627 cache->space_info->bytes_readonly += len;
4324 } else if (cache->reserved_pinned > 0) {
4325 len = min(len, cache->reserved_pinned);
4326 cache->reserved_pinned -= len;
4327 cache->space_info->bytes_reserved += len;
4328 }
4329 spin_unlock(&cache->lock); 4628 spin_unlock(&cache->lock);
4330 spin_unlock(&cache->space_info->lock); 4629 spin_unlock(&cache->space_info->lock);
4331 } 4630 }
@@ -4340,11 +4639,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4340{ 4639{
4341 struct btrfs_fs_info *fs_info = root->fs_info; 4640 struct btrfs_fs_info *fs_info = root->fs_info;
4342 struct extent_io_tree *unpin; 4641 struct extent_io_tree *unpin;
4343 struct btrfs_block_rsv *block_rsv;
4344 struct btrfs_block_rsv *next_rsv;
4345 u64 start; 4642 u64 start;
4346 u64 end; 4643 u64 end;
4347 int idx;
4348 int ret; 4644 int ret;
4349 4645
4350 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4646 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4367,30 +4663,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4367 cond_resched(); 4663 cond_resched();
4368 } 4664 }
4369 4665
4370 mutex_lock(&fs_info->durable_block_rsv_mutex);
4371 list_for_each_entry_safe(block_rsv, next_rsv,
4372 &fs_info->durable_block_rsv_list, list) {
4373
4374 idx = trans->transid & 0x1;
4375 if (block_rsv->freed[idx] > 0) {
4376 block_rsv_add_bytes(block_rsv,
4377 block_rsv->freed[idx], 0);
4378 block_rsv->freed[idx] = 0;
4379 }
4380 if (atomic_read(&block_rsv->usage) == 0) {
4381 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4382
4383 if (block_rsv->freed[0] == 0 &&
4384 block_rsv->freed[1] == 0) {
4385 list_del_init(&block_rsv->list);
4386 kfree(block_rsv);
4387 }
4388 } else {
4389 btrfs_block_rsv_release(root, block_rsv, 0);
4390 }
4391 }
4392 mutex_unlock(&fs_info->durable_block_rsv_mutex);
4393
4394 return 0; 4666 return 0;
4395} 4667}
4396 4668
@@ -4668,7 +4940,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4668 struct extent_buffer *buf, 4940 struct extent_buffer *buf,
4669 u64 parent, int last_ref) 4941 u64 parent, int last_ref)
4670{ 4942{
4671 struct btrfs_block_rsv *block_rsv;
4672 struct btrfs_block_group_cache *cache = NULL; 4943 struct btrfs_block_group_cache *cache = NULL;
4673 int ret; 4944 int ret;
4674 4945
@@ -4683,64 +4954,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4683 if (!last_ref) 4954 if (!last_ref)
4684 return; 4955 return;
4685 4956
4686 block_rsv = get_block_rsv(trans, root);
4687 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4957 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4688 if (block_rsv->space_info != cache->space_info)
4689 goto out;
4690 4958
4691 if (btrfs_header_generation(buf) == trans->transid) { 4959 if (btrfs_header_generation(buf) == trans->transid) {
4692 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4960 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4693 ret = check_ref_cleanup(trans, root, buf->start); 4961 ret = check_ref_cleanup(trans, root, buf->start);
4694 if (!ret) 4962 if (!ret)
4695 goto pin; 4963 goto out;
4696 } 4964 }
4697 4965
4698 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4966 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4699 pin_down_extent(root, cache, buf->start, buf->len, 1); 4967 pin_down_extent(root, cache, buf->start, buf->len, 1);
4700 goto pin; 4968 goto out;
4701 } 4969 }
4702 4970
4703 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4971 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4704 4972
4705 btrfs_add_free_space(cache, buf->start, buf->len); 4973 btrfs_add_free_space(cache, buf->start, buf->len);
4706 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4974 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4707 if (ret == -EAGAIN) {
4708 /* block group became read-only */
4709 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4710 goto out;
4711 }
4712
4713 ret = 1;
4714 spin_lock(&block_rsv->lock);
4715 if (block_rsv->reserved < block_rsv->size) {
4716 block_rsv->reserved += buf->len;
4717 ret = 0;
4718 }
4719 spin_unlock(&block_rsv->lock);
4720
4721 if (ret) {
4722 spin_lock(&cache->space_info->lock);
4723 cache->space_info->bytes_reserved -= buf->len;
4724 cache->space_info->reservation_progress++;
4725 spin_unlock(&cache->space_info->lock);
4726 }
4727 goto out;
4728 }
4729pin:
4730 if (block_rsv->durable && !cache->ro) {
4731 ret = 0;
4732 spin_lock(&cache->lock);
4733 if (!cache->ro) {
4734 cache->reserved_pinned += buf->len;
4735 ret = 1;
4736 }
4737 spin_unlock(&cache->lock);
4738
4739 if (ret) {
4740 spin_lock(&block_rsv->lock);
4741 block_rsv->freed[trans->transid & 0x1] += buf->len;
4742 spin_unlock(&block_rsv->lock);
4743 }
4744 } 4975 }
4745out: 4976out:
4746 /* 4977 /*
@@ -4883,10 +5114,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4883 int last_ptr_loop = 0; 5114 int last_ptr_loop = 0;
4884 int loop = 0; 5115 int loop = 0;
4885 int index = 0; 5116 int index = 0;
5117 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5118 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
4886 bool found_uncached_bg = false; 5119 bool found_uncached_bg = false;
4887 bool failed_cluster_refill = false; 5120 bool failed_cluster_refill = false;
4888 bool failed_alloc = false; 5121 bool failed_alloc = false;
4889 bool use_cluster = true; 5122 bool use_cluster = true;
5123 bool have_caching_bg = false;
4890 u64 ideal_cache_percent = 0; 5124 u64 ideal_cache_percent = 0;
4891 u64 ideal_cache_offset = 0; 5125 u64 ideal_cache_offset = 0;
4892 5126
@@ -4969,6 +5203,7 @@ ideal_cache:
4969 } 5203 }
4970 } 5204 }
4971search: 5205search:
5206 have_caching_bg = false;
4972 down_read(&space_info->groups_sem); 5207 down_read(&space_info->groups_sem);
4973 list_for_each_entry(block_group, &space_info->block_groups[index], 5208 list_for_each_entry(block_group, &space_info->block_groups[index],
4974 list) { 5209 list) {
@@ -4998,13 +5233,15 @@ search:
4998 } 5233 }
4999 5234
5000have_block_group: 5235have_block_group:
5001 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 5236 cached = block_group_cache_done(block_group);
5237 if (unlikely(!cached)) {
5002 u64 free_percent; 5238 u64 free_percent;
5003 5239
5240 found_uncached_bg = true;
5004 ret = cache_block_group(block_group, trans, 5241 ret = cache_block_group(block_group, trans,
5005 orig_root, 1); 5242 orig_root, 1);
5006 if (block_group->cached == BTRFS_CACHE_FINISHED) 5243 if (block_group->cached == BTRFS_CACHE_FINISHED)
5007 goto have_block_group; 5244 goto alloc;
5008 5245
5009 free_percent = btrfs_block_group_used(&block_group->item); 5246 free_percent = btrfs_block_group_used(&block_group->item);
5010 free_percent *= 100; 5247 free_percent *= 100;
@@ -5026,7 +5263,6 @@ have_block_group:
5026 orig_root, 0); 5263 orig_root, 0);
5027 BUG_ON(ret); 5264 BUG_ON(ret);
5028 } 5265 }
5029 found_uncached_bg = true;
5030 5266
5031 /* 5267 /*
5032 * If loop is set for cached only, try the next block 5268 * If loop is set for cached only, try the next block
@@ -5036,17 +5272,14 @@ have_block_group:
5036 goto loop; 5272 goto loop;
5037 } 5273 }
5038 5274
5039 cached = block_group_cache_done(block_group); 5275alloc:
5040 if (unlikely(!cached))
5041 found_uncached_bg = true;
5042
5043 if (unlikely(block_group->ro)) 5276 if (unlikely(block_group->ro))
5044 goto loop; 5277 goto loop;
5045 5278
5046 spin_lock(&block_group->free_space_ctl->tree_lock); 5279 spin_lock(&block_group->free_space_ctl->tree_lock);
5047 if (cached && 5280 if (cached &&
5048 block_group->free_space_ctl->free_space < 5281 block_group->free_space_ctl->free_space <
5049 num_bytes + empty_size) { 5282 num_bytes + empty_cluster + empty_size) {
5050 spin_unlock(&block_group->free_space_ctl->tree_lock); 5283 spin_unlock(&block_group->free_space_ctl->tree_lock);
5051 goto loop; 5284 goto loop;
5052 } 5285 }
@@ -5067,12 +5300,10 @@ have_block_group:
5067 * people trying to start a new cluster 5300 * people trying to start a new cluster
5068 */ 5301 */
5069 spin_lock(&last_ptr->refill_lock); 5302 spin_lock(&last_ptr->refill_lock);
5070 if (last_ptr->block_group && 5303 if (!last_ptr->block_group ||
5071 (last_ptr->block_group->ro || 5304 last_ptr->block_group->ro ||
5072 !block_group_bits(last_ptr->block_group, data))) { 5305 !block_group_bits(last_ptr->block_group, data))
5073 offset = 0;
5074 goto refill_cluster; 5306 goto refill_cluster;
5075 }
5076 5307
5077 offset = btrfs_alloc_from_cluster(block_group, last_ptr, 5308 offset = btrfs_alloc_from_cluster(block_group, last_ptr,
5078 num_bytes, search_start); 5309 num_bytes, search_start);
@@ -5123,7 +5354,7 @@ refill_cluster:
5123 /* allocate a cluster in this block group */ 5354 /* allocate a cluster in this block group */
5124 ret = btrfs_find_space_cluster(trans, root, 5355 ret = btrfs_find_space_cluster(trans, root,
5125 block_group, last_ptr, 5356 block_group, last_ptr,
5126 offset, num_bytes, 5357 search_start, num_bytes,
5127 empty_cluster + empty_size); 5358 empty_cluster + empty_size);
5128 if (ret == 0) { 5359 if (ret == 0) {
5129 /* 5360 /*
@@ -5177,6 +5408,8 @@ refill_cluster:
5177 failed_alloc = true; 5408 failed_alloc = true;
5178 goto have_block_group; 5409 goto have_block_group;
5179 } else if (!offset) { 5410 } else if (!offset) {
5411 if (!cached)
5412 have_caching_bg = true;
5180 goto loop; 5413 goto loop;
5181 } 5414 }
5182checks: 5415checks:
@@ -5202,8 +5435,8 @@ checks:
5202 search_start - offset); 5435 search_start - offset);
5203 BUG_ON(offset > search_start); 5436 BUG_ON(offset > search_start);
5204 5437
5205 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5438 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
5206 (data & BTRFS_BLOCK_GROUP_DATA)); 5439 alloc_type);
5207 if (ret == -EAGAIN) { 5440 if (ret == -EAGAIN) {
5208 btrfs_add_free_space(block_group, offset, num_bytes); 5441 btrfs_add_free_space(block_group, offset, num_bytes);
5209 goto loop; 5442 goto loop;
@@ -5227,6 +5460,9 @@ loop:
5227 } 5460 }
5228 up_read(&space_info->groups_sem); 5461 up_read(&space_info->groups_sem);
5229 5462
5463 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5464 goto search;
5465
5230 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 5466 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5231 goto search; 5467 goto search;
5232 5468
@@ -5325,7 +5561,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5325 int index = 0; 5561 int index = 0;
5326 5562
5327 spin_lock(&info->lock); 5563 spin_lock(&info->lock);
5328 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5564 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5565 (unsigned long long)info->flags,
5329 (unsigned long long)(info->total_bytes - info->bytes_used - 5566 (unsigned long long)(info->total_bytes - info->bytes_used -
5330 info->bytes_pinned - info->bytes_reserved - 5567 info->bytes_pinned - info->bytes_reserved -
5331 info->bytes_readonly), 5568 info->bytes_readonly),
@@ -5411,7 +5648,8 @@ again:
5411 return ret; 5648 return ret;
5412} 5649}
5413 5650
5414int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 5651static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5652 u64 start, u64 len, int pin)
5415{ 5653{
5416 struct btrfs_block_group_cache *cache; 5654 struct btrfs_block_group_cache *cache;
5417 int ret = 0; 5655 int ret = 0;
@@ -5426,8 +5664,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5426 if (btrfs_test_opt(root, DISCARD)) 5664 if (btrfs_test_opt(root, DISCARD))
5427 ret = btrfs_discard_extent(root, start, len, NULL); 5665 ret = btrfs_discard_extent(root, start, len, NULL);
5428 5666
5429 btrfs_add_free_space(cache, start, len); 5667 if (pin)
5430 btrfs_update_reserved_bytes(cache, len, 0, 1); 5668 pin_down_extent(root, cache, start, len, 1);
5669 else {
5670 btrfs_add_free_space(cache, start, len);
5671 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5672 }
5431 btrfs_put_block_group(cache); 5673 btrfs_put_block_group(cache);
5432 5674
5433 trace_btrfs_reserved_extent_free(root, start, len); 5675 trace_btrfs_reserved_extent_free(root, start, len);
@@ -5435,6 +5677,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5435 return ret; 5677 return ret;
5436} 5678}
5437 5679
5680int btrfs_free_reserved_extent(struct btrfs_root *root,
5681 u64 start, u64 len)
5682{
5683 return __btrfs_free_reserved_extent(root, start, len, 0);
5684}
5685
5686int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5687 u64 start, u64 len)
5688{
5689 return __btrfs_free_reserved_extent(root, start, len, 1);
5690}
5691
5438static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5692static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5439 struct btrfs_root *root, 5693 struct btrfs_root *root,
5440 u64 parent, u64 root_objectid, 5694 u64 parent, u64 root_objectid,
@@ -5630,7 +5884,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5630 put_caching_control(caching_ctl); 5884 put_caching_control(caching_ctl);
5631 } 5885 }
5632 5886
5633 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5887 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5888 RESERVE_ALLOC_NO_ACCOUNT);
5634 BUG_ON(ret); 5889 BUG_ON(ret);
5635 btrfs_put_block_group(block_group); 5890 btrfs_put_block_group(block_group);
5636 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5891 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5687,8 +5942,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5687 block_rsv = get_block_rsv(trans, root); 5942 block_rsv = get_block_rsv(trans, root);
5688 5943
5689 if (block_rsv->size == 0) { 5944 if (block_rsv->size == 0) {
5690 ret = reserve_metadata_bytes(trans, root, block_rsv, 5945 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5691 blocksize, 0);
5692 /* 5946 /*
5693 * If we couldn't reserve metadata bytes try and use some from 5947 * If we couldn't reserve metadata bytes try and use some from
5694 * the global reserve. 5948 * the global reserve.
@@ -5708,13 +5962,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5708 if (!ret) 5962 if (!ret)
5709 return block_rsv; 5963 return block_rsv;
5710 if (ret) { 5964 if (ret) {
5711 WARN_ON(1); 5965 static DEFINE_RATELIMIT_STATE(_rs,
5712 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5966 DEFAULT_RATELIMIT_INTERVAL,
5713 0); 5967 /*DEFAULT_RATELIMIT_BURST*/ 2);
5968 if (__ratelimit(&_rs)) {
5969 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5970 WARN_ON(1);
5971 }
5972 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5714 if (!ret) { 5973 if (!ret) {
5715 spin_lock(&block_rsv->lock);
5716 block_rsv->size += blocksize;
5717 spin_unlock(&block_rsv->lock);
5718 return block_rsv; 5974 return block_rsv;
5719 } else if (ret && block_rsv != global_rsv) { 5975 } else if (ret && block_rsv != global_rsv) {
5720 ret = block_rsv_use_bytes(global_rsv, blocksize); 5976 ret = block_rsv_use_bytes(global_rsv, blocksize);
@@ -6592,12 +6848,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6592 cache->bytes_super - btrfs_block_group_used(&cache->item); 6848 cache->bytes_super - btrfs_block_group_used(&cache->item);
6593 6849
6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6850 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6595 sinfo->bytes_may_use + sinfo->bytes_readonly + 6851 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <= 6852 min_allocable_bytes <= sinfo->total_bytes) {
6597 sinfo->total_bytes) {
6598 sinfo->bytes_readonly += num_bytes; 6853 sinfo->bytes_readonly += num_bytes;
6599 sinfo->bytes_reserved += cache->reserved_pinned;
6600 cache->reserved_pinned = 0;
6601 cache->ro = 1; 6854 cache->ro = 1;
6602 ret = 0; 6855 ret = 0;
6603 } 6856 }
@@ -6964,7 +7217,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6964 struct btrfs_space_info, 7217 struct btrfs_space_info,
6965 list); 7218 list);
6966 if (space_info->bytes_pinned > 0 || 7219 if (space_info->bytes_pinned > 0 ||
6967 space_info->bytes_reserved > 0) { 7220 space_info->bytes_reserved > 0 ||
7221 space_info->bytes_may_use > 0) {
6968 WARN_ON(1); 7222 WARN_ON(1);
6969 dump_space_info(space_info, 0, 0); 7223 dump_space_info(space_info, 0, 0);
6970 } 7224 }
@@ -7006,14 +7260,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7006 return -ENOMEM; 7260 return -ENOMEM;
7007 path->reada = 1; 7261 path->reada = 1;
7008 7262
7009 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 7263 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7010 if (cache_gen != 0 && 7264 if (btrfs_test_opt(root, SPACE_CACHE) &&
7011 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 7265 btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7012 need_clear = 1; 7266 need_clear = 1;
7013 if (btrfs_test_opt(root, CLEAR_CACHE)) 7267 if (btrfs_test_opt(root, CLEAR_CACHE))
7014 need_clear = 1; 7268 need_clear = 1;
7015 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
7016 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7017 7269
7018 while (1) { 7270 while (1) {
7019 ret = find_first_block_group(root, path, &key); 7271 ret = find_first_block_group(root, path, &key);
@@ -7252,7 +7504,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7252 goto out; 7504 goto out;
7253 } 7505 }
7254 7506
7255 inode = lookup_free_space_inode(root, block_group, path); 7507 inode = lookup_free_space_inode(tree_root, block_group, path);
7256 if (!IS_ERR(inode)) { 7508 if (!IS_ERR(inode)) {
7257 ret = btrfs_orphan_add(trans, inode); 7509 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret); 7510 BUG_ON(ret);
@@ -7268,7 +7520,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7268 spin_unlock(&block_group->lock); 7520 spin_unlock(&block_group->lock);
7269 } 7521 }
7270 /* One for our lookup ref */ 7522 /* One for our lookup ref */
7271 iput(inode); 7523 btrfs_add_delayed_iput(inode);
7272 } 7524 }
7273 7525
7274 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 7526 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7339,7 +7591,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7339 int mixed = 0; 7591 int mixed = 0;
7340 int ret; 7592 int ret;
7341 7593
7342 disk_super = &fs_info->super_copy; 7594 disk_super = fs_info->super_copy;
7343 if (!btrfs_super_root(disk_super)) 7595 if (!btrfs_super_root(disk_super))
7344 return 1; 7596 return 1;
7345 7597
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f1..be1bf627a14b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -894,6 +895,194 @@ search_again:
894 goto again; 895 goto again;
895} 896}
896 897
898/**
899 * convert_extent_bit - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc)
939 return -ENOMEM;
940 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL;
942 BUG_ON(err == -EEXIST);
943 goto out;
944 }
945 state = rb_entry(node, struct extent_state, rb_node);
946hit_next:
947 last_start = state->start;
948 last_end = state->end;
949
950 /*
951 * | ---- desired range ---- |
952 * | state |
953 *
954 * Just set the new bits, clear the old ones, and keep going
955 */
956 if (state->start == start && state->end <= end) {
957 struct rb_node *next_node;
958
959 set_state_bits(tree, state, &bits);
960 clear_state_bit(tree, state, &clear_bits, 0);
961
962 merge_state(tree, state);
963 if (last_end == (u64)-1)
964 goto out;
965
966 start = last_end + 1;
967 next_node = rb_next(&state->rb_node);
968 if (next_node && start < end && prealloc && !need_resched()) {
969 state = rb_entry(next_node, struct extent_state,
970 rb_node);
971 if (state->start == start)
972 goto hit_next;
973 }
974 goto search_again;
975 }
976
977 /*
978 * | ---- desired range ---- |
979 * | state |
980 * or
981 * | ------------- state -------------- |
982 *
983 * We need to split the extent we found, and may flip bits on
984 * second half.
985 *
986 * If the extent we found extends past our
987 * range, we just split and search again. It'll get split
988 * again the next time though.
989 *
990 * If the extent we found is inside our range, we set the
991 * desired bit on it.
992 */
993 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc);
995 if (!prealloc)
996 return -ENOMEM;
997 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST);
999 prealloc = NULL;
1000 if (err)
1001 goto out;
1002 if (state->end <= end) {
1003 set_state_bits(tree, state, &bits);
1004 clear_state_bit(tree, state, &clear_bits, 0);
1005 merge_state(tree, state);
1006 if (last_end == (u64)-1)
1007 goto out;
1008 start = last_end + 1;
1009 }
1010 goto search_again;
1011 }
1012 /*
1013 * | ---- desired range ---- |
1014 * | state | or | state |
1015 *
1016 * There's a hole, we need to insert something in it and
1017 * ignore the extent we found.
1018 */
1019 if (state->start > start) {
1020 u64 this_end;
1021 if (end < last_start)
1022 this_end = end;
1023 else
1024 this_end = last_start - 1;
1025
1026 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc)
1028 return -ENOMEM;
1029
1030 /*
1031 * Avoid freeing 'prealloc' if it can be merged with
1032 * the later extent.
1033 */
1034 err = insert_state(tree, prealloc, start, this_end,
1035 &bits);
1036 BUG_ON(err == -EEXIST);
1037 if (err) {
1038 free_extent_state(prealloc);
1039 prealloc = NULL;
1040 goto out;
1041 }
1042 prealloc = NULL;
1043 start = this_end + 1;
1044 goto search_again;
1045 }
1046 /*
1047 * | ---- desired range ---- |
1048 * | state |
1049 * We need to split the extent, and set the bit
1050 * on the first half
1051 */
1052 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc)
1055 return -ENOMEM;
1056
1057 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST);
1059
1060 set_state_bits(tree, prealloc, &bits);
1061 clear_state_bit(tree, prealloc, &clear_bits, 0);
1062
1063 merge_state(tree, prealloc);
1064 prealloc = NULL;
1065 goto out;
1066 }
1067
1068 goto search_again;
1069
1070out:
1071 spin_unlock(&tree->lock);
1072 if (prealloc)
1073 free_extent_state(prealloc);
1074
1075 return err;
1076
1077search_again:
1078 if (start > end)
1079 goto out;
1080 spin_unlock(&tree->lock);
1081 if (mask & __GFP_WAIT)
1082 cond_resched();
1083 goto again;
1084}
1085
897/* wrappers around set/clear extent bit */ 1086/* wrappers around set/clear extent bit */
898int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1087int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask) 1088 gfp_t mask)
@@ -919,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
919 struct extent_state **cached_state, gfp_t mask) 1108 struct extent_state **cached_state, gfp_t mask)
920{ 1109{
921 return set_extent_bit(tree, start, end, 1110 return set_extent_bit(tree, start, end,
922 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1111 EXTENT_DELALLOC | EXTENT_UPTODATE,
923 0, NULL, cached_state, mask); 1112 0, NULL, cached_state, mask);
924} 1113}
925 1114
@@ -1599,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1599 return 0; 1788 return 0;
1600} 1789}
1601 1790
1791/*
1792 * When IO fails, either with EIO or csum verification fails, we
1793 * try other mirrors that might have a good copy of the data. This
1794 * io_failure_record is used to record state as we go through all the
1795 * mirrors. If another mirror has good data, the page is set up to date
1796 * and things continue. If a good mirror can't be found, the original
1797 * bio end_io callback is called to indicate things have failed.
1798 */
1799struct io_failure_record {
1800 struct page *page;
1801 u64 start;
1802 u64 len;
1803 u64 logical;
1804 unsigned long bio_flags;
1805 int this_mirror;
1806 int failed_mirror;
1807 int in_validation;
1808};
1809
1810static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1811 int did_repair)
1812{
1813 int ret;
1814 int err = 0;
1815 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1816
1817 set_state_private(failure_tree, rec->start, 0);
1818 ret = clear_extent_bits(failure_tree, rec->start,
1819 rec->start + rec->len - 1,
1820 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1821 if (ret)
1822 err = ret;
1823
1824 if (did_repair) {
1825 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1826 rec->start + rec->len - 1,
1827 EXTENT_DAMAGED, GFP_NOFS);
1828 if (ret && !err)
1829 err = ret;
1830 }
1831
1832 kfree(rec);
1833 return err;
1834}
1835
1836static void repair_io_failure_callback(struct bio *bio, int err)
1837{
1838 complete(bio->bi_private);
1839}
1840
1841/*
1842 * this bypasses the standard btrfs submit functions deliberately, as
1843 * the standard behavior is to write all copies in a raid setup. here we only
1844 * want to write the one bad copy. so we do the mapping for ourselves and issue
1845 * submit_bio directly.
1846 * to avoid any synchonization issues, wait for the data after writing, which
1847 * actually prevents the read that triggered the error from finishing.
1848 * currently, there can be no more than two copies of every data bit. thus,
1849 * exactly one rewrite is required.
1850 */
1851int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1852 u64 length, u64 logical, struct page *page,
1853 int mirror_num)
1854{
1855 struct bio *bio;
1856 struct btrfs_device *dev;
1857 DECLARE_COMPLETION_ONSTACK(compl);
1858 u64 map_length = 0;
1859 u64 sector;
1860 struct btrfs_bio *bbio = NULL;
1861 int ret;
1862
1863 BUG_ON(!mirror_num);
1864
1865 bio = bio_alloc(GFP_NOFS, 1);
1866 if (!bio)
1867 return -EIO;
1868 bio->bi_private = &compl;
1869 bio->bi_end_io = repair_io_failure_callback;
1870 bio->bi_size = 0;
1871 map_length = length;
1872
1873 ret = btrfs_map_block(map_tree, WRITE, logical,
1874 &map_length, &bbio, mirror_num);
1875 if (ret) {
1876 bio_put(bio);
1877 return -EIO;
1878 }
1879 BUG_ON(mirror_num != bbio->mirror_num);
1880 sector = bbio->stripes[mirror_num-1].physical >> 9;
1881 bio->bi_sector = sector;
1882 dev = bbio->stripes[mirror_num-1].dev;
1883 kfree(bbio);
1884 if (!dev || !dev->bdev || !dev->writeable) {
1885 bio_put(bio);
1886 return -EIO;
1887 }
1888 bio->bi_bdev = dev->bdev;
1889 bio_add_page(bio, page, length, start-page_offset(page));
1890 submit_bio(WRITE_SYNC, bio);
1891 wait_for_completion(&compl);
1892
1893 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1894 /* try to remap that extent elsewhere? */
1895 bio_put(bio);
1896 return -EIO;
1897 }
1898
1899 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1900 "sector %llu)\n", page->mapping->host->i_ino, start,
1901 dev->name, sector);
1902
1903 bio_put(bio);
1904 return 0;
1905}
1906
1907/*
1908 * each time an IO finishes, we do a fast check in the IO failure tree
1909 * to see if we need to process or clean up an io_failure_record
1910 */
1911static int clean_io_failure(u64 start, struct page *page)
1912{
1913 u64 private;
1914 u64 private_failure;
1915 struct io_failure_record *failrec;
1916 struct btrfs_mapping_tree *map_tree;
1917 struct extent_state *state;
1918 int num_copies;
1919 int did_repair = 0;
1920 int ret;
1921 struct inode *inode = page->mapping->host;
1922
1923 private = 0;
1924 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1925 (u64)-1, 1, EXTENT_DIRTY, 0);
1926 if (!ret)
1927 return 0;
1928
1929 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1930 &private_failure);
1931 if (ret)
1932 return 0;
1933
1934 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1935 BUG_ON(!failrec->this_mirror);
1936
1937 if (failrec->in_validation) {
1938 /* there was no real error, just free the record */
1939 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1940 failrec->start);
1941 did_repair = 1;
1942 goto out;
1943 }
1944
1945 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1946 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1947 failrec->start,
1948 EXTENT_LOCKED);
1949 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1950
1951 if (state && state->start == failrec->start) {
1952 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1953 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1954 failrec->len);
1955 if (num_copies > 1) {
1956 ret = repair_io_failure(map_tree, start, failrec->len,
1957 failrec->logical, page,
1958 failrec->failed_mirror);
1959 did_repair = !ret;
1960 }
1961 }
1962
1963out:
1964 if (!ret)
1965 ret = free_io_failure(inode, failrec, did_repair);
1966
1967 return ret;
1968}
1969
1970/*
1971 * this is a generic handler for readpage errors (default
1972 * readpage_io_failed_hook). if other copies exist, read those and write back
1973 * good data to the failed position. does not investigate in remapping the
1974 * failed extent elsewhere, hoping the device will be smart enough to do this as
1975 * needed
1976 */
1977
1978static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1979 u64 start, u64 end, int failed_mirror,
1980 struct extent_state *state)
1981{
1982 struct io_failure_record *failrec = NULL;
1983 u64 private;
1984 struct extent_map *em;
1985 struct inode *inode = page->mapping->host;
1986 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1987 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1988 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1989 struct bio *bio;
1990 int num_copies;
1991 int ret;
1992 int read_mode;
1993 u64 logical;
1994
1995 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
1996
1997 ret = get_state_private(failure_tree, start, &private);
1998 if (ret) {
1999 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2000 if (!failrec)
2001 return -ENOMEM;
2002 failrec->start = start;
2003 failrec->len = end - start + 1;
2004 failrec->this_mirror = 0;
2005 failrec->bio_flags = 0;
2006 failrec->in_validation = 0;
2007
2008 read_lock(&em_tree->lock);
2009 em = lookup_extent_mapping(em_tree, start, failrec->len);
2010 if (!em) {
2011 read_unlock(&em_tree->lock);
2012 kfree(failrec);
2013 return -EIO;
2014 }
2015
2016 if (em->start > start || em->start + em->len < start) {
2017 free_extent_map(em);
2018 em = NULL;
2019 }
2020 read_unlock(&em_tree->lock);
2021
2022 if (!em || IS_ERR(em)) {
2023 kfree(failrec);
2024 return -EIO;
2025 }
2026 logical = start - em->start;
2027 logical = em->block_start + logical;
2028 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2029 logical = em->block_start;
2030 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2031 extent_set_compress_type(&failrec->bio_flags,
2032 em->compress_type);
2033 }
2034 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2035 "len=%llu\n", logical, start, failrec->len);
2036 failrec->logical = logical;
2037 free_extent_map(em);
2038
2039 /* set the bits in the private failure tree */
2040 ret = set_extent_bits(failure_tree, start, end,
2041 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2042 if (ret >= 0)
2043 ret = set_state_private(failure_tree, start,
2044 (u64)(unsigned long)failrec);
2045 /* set the bits in the inode's tree */
2046 if (ret >= 0)
2047 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2048 GFP_NOFS);
2049 if (ret < 0) {
2050 kfree(failrec);
2051 return ret;
2052 }
2053 } else {
2054 failrec = (struct io_failure_record *)(unsigned long)private;
2055 pr_debug("bio_readpage_error: (found) logical=%llu, "
2056 "start=%llu, len=%llu, validation=%d\n",
2057 failrec->logical, failrec->start, failrec->len,
2058 failrec->in_validation);
2059 /*
2060 * when data can be on disk more than twice, add to failrec here
2061 * (e.g. with a list for failed_mirror) to make
2062 * clean_io_failure() clean all those errors at once.
2063 */
2064 }
2065 num_copies = btrfs_num_copies(
2066 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2067 failrec->logical, failrec->len);
2068 if (num_copies == 1) {
2069 /*
2070 * we only have a single copy of the data, so don't bother with
2071 * all the retry and error correction code that follows. no
2072 * matter what the error is, it is very likely to persist.
2073 */
2074 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2075 "state=%p, num_copies=%d, next_mirror %d, "
2076 "failed_mirror %d\n", state, num_copies,
2077 failrec->this_mirror, failed_mirror);
2078 free_io_failure(inode, failrec, 0);
2079 return -EIO;
2080 }
2081
2082 if (!state) {
2083 spin_lock(&tree->lock);
2084 state = find_first_extent_bit_state(tree, failrec->start,
2085 EXTENT_LOCKED);
2086 if (state && state->start != failrec->start)
2087 state = NULL;
2088 spin_unlock(&tree->lock);
2089 }
2090
2091 /*
2092 * there are two premises:
2093 * a) deliver good data to the caller
2094 * b) correct the bad sectors on disk
2095 */
2096 if (failed_bio->bi_vcnt > 1) {
2097 /*
2098 * to fulfill b), we need to know the exact failing sectors, as
2099 * we don't want to rewrite any more than the failed ones. thus,
2100 * we need separate read requests for the failed bio
2101 *
2102 * if the following BUG_ON triggers, our validation request got
2103 * merged. we need separate requests for our algorithm to work.
2104 */
2105 BUG_ON(failrec->in_validation);
2106 failrec->in_validation = 1;
2107 failrec->this_mirror = failed_mirror;
2108 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2109 } else {
2110 /*
2111 * we're ready to fulfill a) and b) alongside. get a good copy
2112 * of the failed sector and if we succeed, we have setup
2113 * everything for repair_io_failure to do the rest for us.
2114 */
2115 if (failrec->in_validation) {
2116 BUG_ON(failrec->this_mirror != failed_mirror);
2117 failrec->in_validation = 0;
2118 failrec->this_mirror = 0;
2119 }
2120 failrec->failed_mirror = failed_mirror;
2121 failrec->this_mirror++;
2122 if (failrec->this_mirror == failed_mirror)
2123 failrec->this_mirror++;
2124 read_mode = READ_SYNC;
2125 }
2126
2127 if (!state || failrec->this_mirror > num_copies) {
2128 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2129 "next_mirror %d, failed_mirror %d\n", state,
2130 num_copies, failrec->this_mirror, failed_mirror);
2131 free_io_failure(inode, failrec, 0);
2132 return -EIO;
2133 }
2134
2135 bio = bio_alloc(GFP_NOFS, 1);
2136 bio->bi_private = state;
2137 bio->bi_end_io = failed_bio->bi_end_io;
2138 bio->bi_sector = failrec->logical >> 9;
2139 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2140 bio->bi_size = 0;
2141
2142 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2143
2144 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2145 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2146 failrec->this_mirror, num_copies, failrec->in_validation);
2147
2148 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2149 failrec->bio_flags, 0);
2150 return 0;
2151}
2152
1602/* lots and lots of room for performance fixes in the end_bio funcs */ 2153/* lots and lots of room for performance fixes in the end_bio funcs */
1603 2154
1604/* 2155/*
@@ -1697,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1697 struct extent_state *cached = NULL; 2248 struct extent_state *cached = NULL;
1698 struct extent_state *state; 2249 struct extent_state *state;
1699 2250
2251 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2252 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2253 (long int)bio->bi_bdev);
1700 tree = &BTRFS_I(page->mapping->host)->io_tree; 2254 tree = &BTRFS_I(page->mapping->host)->io_tree;
1701 2255
1702 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2256 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,12 +2281,26 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1727 state); 2281 state);
1728 if (ret) 2282 if (ret)
1729 uptodate = 0; 2283 uptodate = 0;
2284 else
2285 clean_io_failure(start, page);
1730 } 2286 }
1731 if (!uptodate && tree->ops && 2287 if (!uptodate) {
1732 tree->ops->readpage_io_failed_hook) { 2288 int failed_mirror;
1733 ret = tree->ops->readpage_io_failed_hook(bio, page, 2289 failed_mirror = (int)(unsigned long)bio->bi_bdev;
1734 start, end, NULL); 2290 /*
2291 * The generic bio_readpage_error handles errors the
2292 * following way: If possible, new read requests are
2293 * created and submitted and will end up in
2294 * end_bio_extent_readpage as well (if we're lucky, not
2295 * in the !uptodate case). In that case it returns 0 and
2296 * we just go on with the next page in our bio. If it
2297 * can't handle the error it will return -EIO and we
2298 * remain responsible for that page.
2299 */
2300 ret = bio_readpage_error(bio, page, start, end,
2301 failed_mirror, NULL);
1735 if (ret == 0) { 2302 if (ret == 0) {
2303error_handled:
1736 uptodate = 2304 uptodate =
1737 test_bit(BIO_UPTODATE, &bio->bi_flags); 2305 test_bit(BIO_UPTODATE, &bio->bi_flags);
1738 if (err) 2306 if (err)
@@ -1740,6 +2308,13 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1740 uncache_state(&cached); 2308 uncache_state(&cached);
1741 continue; 2309 continue;
1742 } 2310 }
2311 if (tree->ops && tree->ops->readpage_io_failed_hook) {
2312 ret = tree->ops->readpage_io_failed_hook(
2313 bio, page, start, end,
2314 failed_mirror, state);
2315 if (ret == 0)
2316 goto error_handled;
2317 }
1743 } 2318 }
1744 2319
1745 if (uptodate) { 2320 if (uptodate) {
@@ -1811,6 +2386,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1811 mirror_num, bio_flags, start); 2386 mirror_num, bio_flags, start);
1812 else 2387 else
1813 submit_bio(rw, bio); 2388 submit_bio(rw, bio);
2389
1814 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2390 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1815 ret = -EOPNOTSUPP; 2391 ret = -EOPNOTSUPP;
1816 bio_put(bio); 2392 bio_put(bio);
@@ -2076,16 +2652,16 @@ out:
2076} 2652}
2077 2653
2078int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2654int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2079 get_extent_t *get_extent) 2655 get_extent_t *get_extent, int mirror_num)
2080{ 2656{
2081 struct bio *bio = NULL; 2657 struct bio *bio = NULL;
2082 unsigned long bio_flags = 0; 2658 unsigned long bio_flags = 0;
2083 int ret; 2659 int ret;
2084 2660
2085 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2661 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2086 &bio_flags); 2662 &bio_flags);
2087 if (bio) 2663 if (bio)
2088 ret = submit_one_bio(READ, bio, 0, bio_flags); 2664 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2089 return ret; 2665 return ret;
2090} 2666}
2091 2667
@@ -2136,6 +2712,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 int compressed; 2712 int compressed;
2137 int write_flags; 2713 int write_flags;
2138 unsigned long nr_written = 0; 2714 unsigned long nr_written = 0;
2715 bool fill_delalloc = true;
2139 2716
2140 if (wbc->sync_mode == WB_SYNC_ALL) 2717 if (wbc->sync_mode == WB_SYNC_ALL)
2141 write_flags = WRITE_SYNC; 2718 write_flags = WRITE_SYNC;
@@ -2145,6 +2722,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2145 trace___extent_writepage(page, inode, wbc); 2722 trace___extent_writepage(page, inode, wbc);
2146 2723
2147 WARN_ON(!PageLocked(page)); 2724 WARN_ON(!PageLocked(page));
2725
2726 ClearPageError(page);
2727
2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2728 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149 if (page->index > end_index || 2729 if (page->index > end_index ||
2150 (page->index == end_index && !pg_offset)) { 2730 (page->index == end_index && !pg_offset)) {
@@ -2166,10 +2746,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2166 2746
2167 set_page_extent_mapped(page); 2747 set_page_extent_mapped(page);
2168 2748
2749 if (!tree->ops || !tree->ops->fill_delalloc)
2750 fill_delalloc = false;
2751
2169 delalloc_start = start; 2752 delalloc_start = start;
2170 delalloc_end = 0; 2753 delalloc_end = 0;
2171 page_started = 0; 2754 page_started = 0;
2172 if (!epd->extent_locked) { 2755 if (!epd->extent_locked && fill_delalloc) {
2173 u64 delalloc_to_write = 0; 2756 u64 delalloc_to_write = 0;
2174 /* 2757 /*
2175 * make sure the wbc mapping index is at least updated 2758 * make sure the wbc mapping index is at least updated
@@ -2421,10 +3004,16 @@ retry:
2421 * swizzled back from swapper_space to tmpfs file 3004 * swizzled back from swapper_space to tmpfs file
2422 * mapping 3005 * mapping
2423 */ 3006 */
2424 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 3007 if (tree->ops &&
2425 tree->ops->write_cache_pages_lock_hook(page); 3008 tree->ops->write_cache_pages_lock_hook) {
2426 else 3009 tree->ops->write_cache_pages_lock_hook(page,
2427 lock_page(page); 3010 data, flush_fn);
3011 } else {
3012 if (!trylock_page(page)) {
3013 flush_fn(data);
3014 lock_page(page);
3015 }
3016 }
2428 3017
2429 if (unlikely(page->mapping != mapping)) { 3018 if (unlikely(page->mapping != mapping)) {
2430 unlock_page(page); 3019 unlock_page(page);
@@ -2790,6 +3379,9 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2790 return -ENOMEM; 3379 return -ENOMEM;
2791 path->leave_spinning = 1; 3380 path->leave_spinning = 1;
2792 3381
3382 start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3383 len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3384
2793 /* 3385 /*
2794 * lookup the last file extent. We're not using i_size here 3386 * lookup the last file extent. We're not using i_size here
2795 * because there might be preallocation past i_size 3387 * because there might be preallocation past i_size
@@ -2837,7 +3429,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2837 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 3429 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2838 &cached_state, GFP_NOFS); 3430 &cached_state, GFP_NOFS);
2839 3431
2840 em = get_extent_skip_holes(inode, off, last_for_get_extent, 3432 em = get_extent_skip_holes(inode, start, last_for_get_extent,
2841 get_extent); 3433 get_extent);
2842 if (!em) 3434 if (!em)
2843 goto out; 3435 goto out;
@@ -2926,7 +3518,7 @@ out:
2926 return ret; 3518 return ret;
2927} 3519}
2928 3520
2929static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3521inline struct page *extent_buffer_page(struct extent_buffer *eb,
2930 unsigned long i) 3522 unsigned long i)
2931{ 3523{
2932 struct page *p; 3524 struct page *p;
@@ -2951,7 +3543,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2951 return p; 3543 return p;
2952} 3544}
2953 3545
2954static inline unsigned long num_extent_pages(u64 start, u64 len) 3546inline unsigned long num_extent_pages(u64 start, u64 len)
2955{ 3547{
2956 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3548 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2957 (start >> PAGE_CACHE_SHIFT); 3549 (start >> PAGE_CACHE_SHIFT);
@@ -3204,6 +3796,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3204 PAGECACHE_TAG_DIRTY); 3796 PAGECACHE_TAG_DIRTY);
3205 } 3797 }
3206 spin_unlock_irq(&page->mapping->tree_lock); 3798 spin_unlock_irq(&page->mapping->tree_lock);
3799 ClearPageError(page);
3207 unlock_page(page); 3800 unlock_page(page);
3208 } 3801 }
3209 return 0; 3802 return 0;
@@ -3349,8 +3942,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3349} 3942}
3350 3943
3351int read_extent_buffer_pages(struct extent_io_tree *tree, 3944int read_extent_buffer_pages(struct extent_io_tree *tree,
3352 struct extent_buffer *eb, 3945 struct extent_buffer *eb, u64 start, int wait,
3353 u64 start, int wait,
3354 get_extent_t *get_extent, int mirror_num) 3946 get_extent_t *get_extent, int mirror_num)
3355{ 3947{
3356 unsigned long i; 3948 unsigned long i;
@@ -3386,7 +3978,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3386 num_pages = num_extent_pages(eb->start, eb->len); 3978 num_pages = num_extent_pages(eb->start, eb->len);
3387 for (i = start_i; i < num_pages; i++) { 3979 for (i = start_i; i < num_pages; i++) {
3388 page = extent_buffer_page(eb, i); 3980 page = extent_buffer_page(eb, i);
3389 if (!wait) { 3981 if (wait == WAIT_NONE) {
3390 if (!trylock_page(page)) 3982 if (!trylock_page(page))
3391 goto unlock_exit; 3983 goto unlock_exit;
3392 } else { 3984 } else {
@@ -3430,7 +4022,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3430 if (bio) 4022 if (bio)
3431 submit_one_bio(READ, bio, mirror_num, bio_flags); 4023 submit_one_bio(READ, bio, mirror_num, bio_flags);
3432 4024
3433 if (ret || !wait) 4025 if (ret || wait != WAIT_COMPLETE)
3434 return ret; 4026 return ret;
3435 4027
3436 for (i = start_i; i < num_pages; i++) { 4028 for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e7929..7604c3001322 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,8 @@
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14)
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 24
@@ -32,6 +34,7 @@
32#define EXTENT_BUFFER_BLOCKING 1 34#define EXTENT_BUFFER_BLOCKING 1
33#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
35 38
36/* these are flags for extent_clear_unlock_delalloc */ 39/* these are flags for extent_clear_unlock_delalloc */
37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 40#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -67,7 +70,7 @@ struct extent_io_ops {
67 unsigned long bio_flags); 70 unsigned long bio_flags);
68 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
69 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
70 u64 start, u64 end, 73 u64 start, u64 end, int failed_mirror,
71 struct extent_state *state); 74 struct extent_state *state);
72 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
73 u64 start, u64 end, 76 u64 start, u64 end,
@@ -85,7 +88,8 @@ struct extent_io_ops {
85 struct extent_state *other); 88 struct extent_state *other);
86 void (*split_extent_hook)(struct inode *inode, 89 void (*split_extent_hook)(struct inode *inode,
87 struct extent_state *orig, u64 split); 90 struct extent_state *orig, u64 split);
88 int (*write_cache_pages_lock_hook)(struct page *page); 91 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
92 void (*flush_fn)(void *));
89}; 93};
90 94
91struct extent_io_tree { 95struct extent_io_tree {
@@ -185,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
185int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 189int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
186 gfp_t mask); 190 gfp_t mask);
187int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 191int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
188 get_extent_t *get_extent); 192 get_extent_t *get_extent, int mirror_num);
189int __init extent_io_init(void); 193int __init extent_io_init(void);
190void extent_io_exit(void); 194void extent_io_exit(void);
191 195
@@ -214,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
214 gfp_t mask); 218 gfp_t mask);
215int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 219int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
216 gfp_t mask); 220 gfp_t mask);
221int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
222 int bits, int clear_bits, gfp_t mask);
217int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 223int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
218 struct extent_state **cached_state, gfp_t mask); 224 struct extent_state **cached_state, gfp_t mask);
219int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 225int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -248,9 +254,14 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
248struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 254struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
249 u64 start, unsigned long len); 255 u64 start, unsigned long len);
250void free_extent_buffer(struct extent_buffer *eb); 256void free_extent_buffer(struct extent_buffer *eb);
257#define WAIT_NONE 0
258#define WAIT_COMPLETE 1
259#define WAIT_PAGE_LOCK 2
251int read_extent_buffer_pages(struct extent_io_tree *tree, 260int read_extent_buffer_pages(struct extent_io_tree *tree,
252 struct extent_buffer *eb, u64 start, int wait, 261 struct extent_buffer *eb, u64 start, int wait,
253 get_extent_t *get_extent, int mirror_num); 262 get_extent_t *get_extent, int mirror_num);
263unsigned long num_extent_pages(u64 start, u64 len);
264struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
254 265
255static inline void extent_buffer_get(struct extent_buffer *eb) 266static inline void extent_buffer_get(struct extent_buffer *eb)
256{ 267{
@@ -300,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
300struct bio * 311struct bio *
301btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 312btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
302 gfp_t gfp_flags); 313 gfp_t gfp_flags);
314
315struct btrfs_mapping_tree;
316
317int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
318 u64 length, u64 logical, struct page *page,
319 int mirror_num);
303#endif 320#endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821becd..c7fb3a4247d3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
91 struct btrfs_csum_item *item; 91 struct btrfs_csum_item *item;
92 struct extent_buffer *leaf; 92 struct extent_buffer *leaf;
93 u64 csum_offset = 0; 93 u64 csum_offset = 0;
94 u16 csum_size = 94 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
95 btrfs_super_csum_size(&root->fs_info->super_copy);
96 int csums_in_item; 95 int csums_in_item;
97 96
98 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 97 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
162 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
163 u64 disk_bytenr; 162 u64 disk_bytenr;
164 u32 diff; 163 u32 diff;
165 u16 csum_size = 164 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
166 btrfs_super_csum_size(&root->fs_info->super_copy);
167 int ret; 165 int ret;
168 struct btrfs_path *path; 166 struct btrfs_path *path;
169 struct btrfs_csum_item *item = NULL; 167 struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
290 int ret; 288 int ret;
291 size_t size; 289 size_t size;
292 u64 csum_end; 290 u64 csum_end;
293 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 291 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
294 292
295 path = btrfs_alloc_path(); 293 path = btrfs_alloc_path();
296 if (!path) 294 if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u64 bytenr, u64 len) 490 u64 bytenr, u64 len)
493{ 491{
494 struct extent_buffer *leaf; 492 struct extent_buffer *leaf;
495 u16 csum_size = 493 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
496 btrfs_super_csum_size(&root->fs_info->super_copy);
497 u64 csum_end; 494 u64 csum_end;
498 u64 end_byte = bytenr + len; 495 u64 end_byte = bytenr + len;
499 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; 496 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
549 u64 csum_end; 546 u64 csum_end;
550 struct extent_buffer *leaf; 547 struct extent_buffer *leaf;
551 int ret; 548 int ret;
552 u16 csum_size = 549 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
553 btrfs_super_csum_size(&root->fs_info->super_copy);
554 int blocksize_bits = root->fs_info->sb->s_blocksize_bits; 550 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
555 551
556 root = root->fs_info->csum_root; 552 root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
676 struct btrfs_sector_sum *sector_sum; 672 struct btrfs_sector_sum *sector_sum;
677 u32 nritems; 673 u32 nritems;
678 u32 ins_size; 674 u32 ins_size;
679 u16 csum_size = 675 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
680 btrfs_super_csum_size(&root->fs_info->super_copy);
681 676
682 path = btrfs_alloc_path(); 677 path = btrfs_alloc_path();
683 if (!path) 678 if (!path)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4e57d59edb7..dafdfa059bf6 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1069 int i; 1069 int i;
1070 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1070 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1071 struct inode *inode = fdentry(file)->d_inode; 1071 struct inode *inode = fdentry(file)->d_inode;
1072 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1072 int err = 0; 1073 int err = 0;
1073 int faili = 0; 1074 int faili = 0;
1074 u64 start_pos; 1075 u64 start_pos;
@@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1080again: 1081again:
1081 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1082 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1083 GFP_NOFS); 1084 mask);
1084 if (!pages[i]) { 1085 if (!pages[i]) {
1085 faili = i - 1; 1086 faili = i - 1;
1086 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1615,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1615 goto out; 1616 goto out;
1616 } 1617 }
1617 1618
1618 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1619 if (ret)
1620 goto out;
1621
1622 locked_end = alloc_end - 1; 1619 locked_end = alloc_end - 1;
1623 while (1) { 1620 while (1) {
1624 struct btrfs_ordered_extent *ordered; 1621 struct btrfs_ordered_extent *ordered;
@@ -1664,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode,
1664 if (em->block_start == EXTENT_MAP_HOLE || 1661 if (em->block_start == EXTENT_MAP_HOLE ||
1665 (cur_offset >= inode->i_size && 1662 (cur_offset >= inode->i_size &&
1666 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1663 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1664
1665 /*
1666 * Make sure we have enough space before we do the
1667 * allocation.
1668 */
1669 ret = btrfs_check_data_free_space(inode, last_byte -
1670 cur_offset);
1671 if (ret) {
1672 free_extent_map(em);
1673 break;
1674 }
1675
1667 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 1676 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1668 last_byte - cur_offset, 1677 last_byte - cur_offset,
1669 1 << inode->i_blkbits, 1678 1 << inode->i_blkbits,
1670 offset + len, 1679 offset + len,
1671 &alloc_hint); 1680 &alloc_hint);
1681
1682 /* Let go of our reservation. */
1683 btrfs_free_reserved_data_space(inode, last_byte -
1684 cur_offset);
1672 if (ret < 0) { 1685 if (ret < 0) {
1673 free_extent_map(em); 1686 free_extent_map(em);
1674 break; 1687 break;
@@ -1694,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1694 } 1707 }
1695 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 1708 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1696 &cached_state, GFP_NOFS); 1709 &cached_state, GFP_NOFS);
1697
1698 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1699out: 1710out:
1700 mutex_unlock(&inode->i_mutex); 1711 mutex_unlock(&inode->i_mutex);
1701 return ret; 1712 return ret;
@@ -1821,7 +1832,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int origin)
1821 switch (origin) { 1832 switch (origin) {
1822 case SEEK_END: 1833 case SEEK_END:
1823 case SEEK_CUR: 1834 case SEEK_CUR:
1824 offset = generic_file_llseek_unlocked(file, offset, origin); 1835 offset = generic_file_llseek(file, offset, origin);
1825 goto out; 1836 goto out;
1826 case SEEK_DATA: 1837 case SEEK_DATA:
1827 case SEEK_HOLE: 1838 case SEEK_HOLE:
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d0..ec23d43d0c35 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/math64.h> 22#include <linux/math64.h>
23#include <linux/ratelimit.h>
23#include "ctree.h" 24#include "ctree.h"
24#include "free-space-cache.h" 25#include "free-space-cache.h"
25#include "transaction.h" 26#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
84 *block_group, struct btrfs_path *path) 85 *block_group, struct btrfs_path *path)
85{ 86{
86 struct inode *inode = NULL; 87 struct inode *inode = NULL;
88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
87 89
88 spin_lock(&block_group->lock); 90 spin_lock(&block_group->lock);
89 if (block_group->inode) 91 if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 100 return inode;
99 101
100 spin_lock(&block_group->lock); 102 spin_lock(&block_group->lock);
101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { 103 if (!((BTRFS_I(inode)->flags & flags) == flags)) {
102 printk(KERN_INFO "Old style space inode found, converting.\n"); 104 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
106 BTRFS_INODE_NODATACOW;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 } 108 }
106 109
107 if (!btrfs_fs_closing(root->fs_info)) { 110 if (!block_group->iref) {
108 block_group->inode = igrab(inode); 111 block_group->inode = igrab(inode);
109 block_group->iref = 1; 112 block_group->iref = 1;
110 } 113 }
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
122 struct btrfs_free_space_header *header; 125 struct btrfs_free_space_header *header;
123 struct btrfs_inode_item *inode_item; 126 struct btrfs_inode_item *inode_item;
124 struct extent_buffer *leaf; 127 struct extent_buffer *leaf;
128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
125 int ret; 129 int ret;
126 130
127 ret = btrfs_insert_empty_inode(trans, root, path, ino); 131 ret = btrfs_insert_empty_inode(trans, root, path, ino);
128 if (ret) 132 if (ret)
129 return ret; 133 return ret;
130 134
135 /* We inline crc's for the free disk space cache */
136 if (ino != BTRFS_FREE_INO_OBJECTID)
137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
138
131 leaf = path->nodes[0]; 139 leaf = path->nodes[0];
132 inode_item = btrfs_item_ptr(leaf, path->slots[0], 140 inode_item = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_inode_item); 141 struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
140 btrfs_set_inode_uid(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0);
141 btrfs_set_inode_gid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0);
142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 151 btrfs_set_inode_flags(leaf, inode_item, flags);
144 BTRFS_INODE_PREALLOC);
145 btrfs_set_inode_nlink(leaf, inode_item, 1); 152 btrfs_set_inode_nlink(leaf, inode_item, 1);
146 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
147 btrfs_set_inode_block_group(leaf, inode_item, offset); 154 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
191 struct inode *inode) 198 struct inode *inode)
192{ 199{
193 struct btrfs_block_rsv *rsv; 200 struct btrfs_block_rsv *rsv;
201 u64 needed_bytes;
194 loff_t oldsize; 202 loff_t oldsize;
195 int ret = 0; 203 int ret = 0;
196 204
197 rsv = trans->block_rsv; 205 rsv = trans->block_rsv;
198 trans->block_rsv = root->orphan_block_rsv; 206 trans->block_rsv = &root->fs_info->global_block_rsv;
199 ret = btrfs_block_rsv_check(trans, root, 207
200 root->orphan_block_rsv, 208 /* 1 for slack space, 1 for updating the inode */
201 0, 5); 209 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
202 if (ret) 210 btrfs_calc_trans_metadata_size(root, 1);
203 return ret; 211
212 spin_lock(&trans->block_rsv->lock);
213 if (trans->block_rsv->reserved < needed_bytes) {
214 spin_unlock(&trans->block_rsv->lock);
215 trans->block_rsv = rsv;
216 return -ENOSPC;
217 }
218 spin_unlock(&trans->block_rsv->lock);
204 219
205 oldsize = i_size_read(inode); 220 oldsize = i_size_read(inode);
206 btrfs_i_size_write(inode, 0); 221 btrfs_i_size_write(inode, 0);
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
213 ret = btrfs_truncate_inode_items(trans, root, inode, 228 ret = btrfs_truncate_inode_items(trans, root, inode,
214 0, BTRFS_EXTENT_DATA_KEY); 229 0, BTRFS_EXTENT_DATA_KEY);
215 230
216 trans->block_rsv = rsv;
217 if (ret) { 231 if (ret) {
232 trans->block_rsv = rsv;
218 WARN_ON(1); 233 WARN_ON(1);
219 return ret; 234 return ret;
220 } 235 }
221 236
222 ret = btrfs_update_inode(trans, root, inode); 237 ret = btrfs_update_inode(trans, root, inode);
238 trans->block_rsv = rsv;
239
223 return ret; 240 return ret;
224} 241}
225 242
@@ -242,26 +259,348 @@ static int readahead_cache(struct inode *inode)
242 return 0; 259 return 0;
243} 260}
244 261
262struct io_ctl {
263 void *cur, *orig;
264 struct page *page;
265 struct page **pages;
266 struct btrfs_root *root;
267 unsigned long size;
268 int index;
269 int num_pages;
270 unsigned check_crcs:1;
271};
272
273static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
274 struct btrfs_root *root)
275{
276 memset(io_ctl, 0, sizeof(struct io_ctl));
277 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
278 PAGE_CACHE_SHIFT;
279 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
280 GFP_NOFS);
281 if (!io_ctl->pages)
282 return -ENOMEM;
283 io_ctl->root = root;
284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
285 io_ctl->check_crcs = 1;
286 return 0;
287}
288
289static void io_ctl_free(struct io_ctl *io_ctl)
290{
291 kfree(io_ctl->pages);
292}
293
294static void io_ctl_unmap_page(struct io_ctl *io_ctl)
295{
296 if (io_ctl->cur) {
297 kunmap(io_ctl->page);
298 io_ctl->cur = NULL;
299 io_ctl->orig = NULL;
300 }
301}
302
303static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
304{
305 WARN_ON(io_ctl->cur);
306 BUG_ON(io_ctl->index >= io_ctl->num_pages);
307 io_ctl->page = io_ctl->pages[io_ctl->index++];
308 io_ctl->cur = kmap(io_ctl->page);
309 io_ctl->orig = io_ctl->cur;
310 io_ctl->size = PAGE_CACHE_SIZE;
311 if (clear)
312 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
313}
314
315static void io_ctl_drop_pages(struct io_ctl *io_ctl)
316{
317 int i;
318
319 io_ctl_unmap_page(io_ctl);
320
321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]);
323 unlock_page(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]);
325 }
326}
327
328static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
329 int uptodate)
330{
331 struct page *page;
332 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
333 int i;
334
335 for (i = 0; i < io_ctl->num_pages; i++) {
336 page = find_or_create_page(inode->i_mapping, i, mask);
337 if (!page) {
338 io_ctl_drop_pages(io_ctl);
339 return -ENOMEM;
340 }
341 io_ctl->pages[i] = page;
342 if (uptodate && !PageUptodate(page)) {
343 btrfs_readpage(NULL, page);
344 lock_page(page);
345 if (!PageUptodate(page)) {
346 printk(KERN_ERR "btrfs: error reading free "
347 "space cache\n");
348 io_ctl_drop_pages(io_ctl);
349 return -EIO;
350 }
351 }
352 }
353
354 for (i = 0; i < io_ctl->num_pages; i++) {
355 clear_page_dirty_for_io(io_ctl->pages[i]);
356 set_page_extent_mapped(io_ctl->pages[i]);
357 }
358
359 return 0;
360}
361
362static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
363{
364 u64 *val;
365
366 io_ctl_map_page(io_ctl, 1);
367
368 /*
369 * Skip the csum areas. If we don't check crcs then we just have a
370 * 64bit chunk at the front of the first page.
371 */
372 if (io_ctl->check_crcs) {
373 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
374 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
375 } else {
376 io_ctl->cur += sizeof(u64);
377 io_ctl->size -= sizeof(u64) * 2;
378 }
379
380 val = io_ctl->cur;
381 *val = cpu_to_le64(generation);
382 io_ctl->cur += sizeof(u64);
383}
384
385static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
386{
387 u64 *gen;
388
389 /*
390 * Skip the crc area. If we don't check crcs then we just have a 64bit
391 * chunk at the front of the first page.
392 */
393 if (io_ctl->check_crcs) {
394 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
395 io_ctl->size -= sizeof(u64) +
396 (sizeof(u32) * io_ctl->num_pages);
397 } else {
398 io_ctl->cur += sizeof(u64);
399 io_ctl->size -= sizeof(u64) * 2;
400 }
401
402 gen = io_ctl->cur;
403 if (le64_to_cpu(*gen) != generation) {
404 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
405 "(%Lu) does not match inode (%Lu)\n", *gen,
406 generation);
407 io_ctl_unmap_page(io_ctl);
408 return -EIO;
409 }
410 io_ctl->cur += sizeof(u64);
411 return 0;
412}
413
414static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
415{
416 u32 *tmp;
417 u32 crc = ~(u32)0;
418 unsigned offset = 0;
419
420 if (!io_ctl->check_crcs) {
421 io_ctl_unmap_page(io_ctl);
422 return;
423 }
424
425 if (index == 0)
426 offset = sizeof(u32) * io_ctl->num_pages;;
427
428 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
429 PAGE_CACHE_SIZE - offset);
430 btrfs_csum_final(crc, (char *)&crc);
431 io_ctl_unmap_page(io_ctl);
432 tmp = kmap(io_ctl->pages[0]);
433 tmp += index;
434 *tmp = crc;
435 kunmap(io_ctl->pages[0]);
436}
437
438static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
439{
440 u32 *tmp, val;
441 u32 crc = ~(u32)0;
442 unsigned offset = 0;
443
444 if (!io_ctl->check_crcs) {
445 io_ctl_map_page(io_ctl, 0);
446 return 0;
447 }
448
449 if (index == 0)
450 offset = sizeof(u32) * io_ctl->num_pages;
451
452 tmp = kmap(io_ctl->pages[0]);
453 tmp += index;
454 val = *tmp;
455 kunmap(io_ctl->pages[0]);
456
457 io_ctl_map_page(io_ctl, 0);
458 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
459 PAGE_CACHE_SIZE - offset);
460 btrfs_csum_final(crc, (char *)&crc);
461 if (val != crc) {
462 printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
463 "space cache\n");
464 io_ctl_unmap_page(io_ctl);
465 return -EIO;
466 }
467
468 return 0;
469}
470
471static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
472 void *bitmap)
473{
474 struct btrfs_free_space_entry *entry;
475
476 if (!io_ctl->cur)
477 return -ENOSPC;
478
479 entry = io_ctl->cur;
480 entry->offset = cpu_to_le64(offset);
481 entry->bytes = cpu_to_le64(bytes);
482 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
483 BTRFS_FREE_SPACE_EXTENT;
484 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
485 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
486
487 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
488 return 0;
489
490 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
491
492 /* No more pages to map */
493 if (io_ctl->index >= io_ctl->num_pages)
494 return 0;
495
496 /* map the next page */
497 io_ctl_map_page(io_ctl, 1);
498 return 0;
499}
500
501static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
502{
503 if (!io_ctl->cur)
504 return -ENOSPC;
505
506 /*
507 * If we aren't at the start of the current page, unmap this one and
508 * map the next one if there is any left.
509 */
510 if (io_ctl->cur != io_ctl->orig) {
511 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
512 if (io_ctl->index >= io_ctl->num_pages)
513 return -ENOSPC;
514 io_ctl_map_page(io_ctl, 0);
515 }
516
517 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
518 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
519 if (io_ctl->index < io_ctl->num_pages)
520 io_ctl_map_page(io_ctl, 0);
521 return 0;
522}
523
524static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
525{
526 /*
527 * If we're not on the boundary we know we've modified the page and we
528 * need to crc the page.
529 */
530 if (io_ctl->cur != io_ctl->orig)
531 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
532 else
533 io_ctl_unmap_page(io_ctl);
534
535 while (io_ctl->index < io_ctl->num_pages) {
536 io_ctl_map_page(io_ctl, 1);
537 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
538 }
539}
540
541static int io_ctl_read_entry(struct io_ctl *io_ctl,
542 struct btrfs_free_space *entry, u8 *type)
543{
544 struct btrfs_free_space_entry *e;
545 int ret;
546
547 if (!io_ctl->cur) {
548 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
549 if (ret)
550 return ret;
551 }
552
553 e = io_ctl->cur;
554 entry->offset = le64_to_cpu(e->offset);
555 entry->bytes = le64_to_cpu(e->bytes);
556 *type = e->type;
557 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
558 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
559
560 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
561 return 0;
562
563 io_ctl_unmap_page(io_ctl);
564
565 return 0;
566}
567
568static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
569 struct btrfs_free_space *entry)
570{
571 int ret;
572
573 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
574 if (ret)
575 return ret;
576
577 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
578 io_ctl_unmap_page(io_ctl);
579
580 return 0;
581}
582
245int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 583int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
246 struct btrfs_free_space_ctl *ctl, 584 struct btrfs_free_space_ctl *ctl,
247 struct btrfs_path *path, u64 offset) 585 struct btrfs_path *path, u64 offset)
248{ 586{
249 struct btrfs_free_space_header *header; 587 struct btrfs_free_space_header *header;
250 struct extent_buffer *leaf; 588 struct extent_buffer *leaf;
251 struct page *page; 589 struct io_ctl io_ctl;
252 struct btrfs_key key; 590 struct btrfs_key key;
591 struct btrfs_free_space *e, *n;
253 struct list_head bitmaps; 592 struct list_head bitmaps;
254 u64 num_entries; 593 u64 num_entries;
255 u64 num_bitmaps; 594 u64 num_bitmaps;
256 u64 generation; 595 u64 generation;
257 pgoff_t index = 0; 596 u8 type;
258 int ret = 0; 597 int ret = 0;
259 598
260 INIT_LIST_HEAD(&bitmaps); 599 INIT_LIST_HEAD(&bitmaps);
261 600
262 /* Nothing in the space cache, goodbye */ 601 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) 602 if (!i_size_read(inode))
264 goto out; 603 return 0;
265 604
266 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 605 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
267 key.offset = offset; 606 key.offset = offset;
@@ -269,11 +608,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
269 608
270 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 609 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
271 if (ret < 0) 610 if (ret < 0)
272 goto out; 611 return 0;
273 else if (ret > 0) { 612 else if (ret > 0) {
274 btrfs_release_path(path); 613 btrfs_release_path(path);
275 ret = 0; 614 return 0;
276 goto out;
277 } 615 }
278 616
279 ret = -1; 617 ret = -1;
@@ -291,169 +629,102 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
291 " not match free space cache generation (%llu)\n", 629 " not match free space cache generation (%llu)\n",
292 (unsigned long long)BTRFS_I(inode)->generation, 630 (unsigned long long)BTRFS_I(inode)->generation,
293 (unsigned long long)generation); 631 (unsigned long long)generation);
294 goto out; 632 return 0;
295 } 633 }
296 634
297 if (!num_entries) 635 if (!num_entries)
298 goto out; 636 return 0;
299 637
638 io_ctl_init(&io_ctl, inode, root);
300 ret = readahead_cache(inode); 639 ret = readahead_cache(inode);
301 if (ret) 640 if (ret)
302 goto out; 641 goto out;
303 642
304 while (1) { 643 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
305 struct btrfs_free_space_entry *entry; 644 if (ret)
306 struct btrfs_free_space *e; 645 goto out;
307 void *addr;
308 unsigned long offset = 0;
309 int need_loop = 0;
310 646
311 if (!num_entries && !num_bitmaps) 647 ret = io_ctl_check_crc(&io_ctl, 0);
312 break; 648 if (ret)
649 goto free_cache;
650
651 ret = io_ctl_check_generation(&io_ctl, generation);
652 if (ret)
653 goto free_cache;
313 654
314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 655 while (num_entries) {
315 if (!page) 656 e = kmem_cache_zalloc(btrfs_free_space_cachep,
657 GFP_NOFS);
658 if (!e)
316 goto free_cache; 659 goto free_cache;
317 660
318 if (!PageUptodate(page)) { 661 ret = io_ctl_read_entry(&io_ctl, e, &type);
319 btrfs_readpage(NULL, page); 662 if (ret) {
320 lock_page(page); 663 kmem_cache_free(btrfs_free_space_cachep, e);
321 if (!PageUptodate(page)) { 664 goto free_cache;
322 unlock_page(page);
323 page_cache_release(page);
324 printk(KERN_ERR "btrfs: error reading free "
325 "space cache\n");
326 goto free_cache;
327 }
328 } 665 }
329 addr = kmap(page);
330 666
331 if (index == 0) { 667 if (!e->bytes) {
332 u64 *gen; 668 kmem_cache_free(btrfs_free_space_cachep, e);
669 goto free_cache;
670 }
333 671
334 /* 672 if (type == BTRFS_FREE_SPACE_EXTENT) {
335 * We put a bogus crc in the front of the first page in 673 spin_lock(&ctl->tree_lock);
336 * case old kernels try to mount a fs with the new 674 ret = link_free_space(ctl, e);
337 * format to make sure they discard the cache. 675 spin_unlock(&ctl->tree_lock);
338 */ 676 if (ret) {
339 addr += sizeof(u64); 677 printk(KERN_ERR "Duplicate entries in "
340 offset += sizeof(u64); 678 "free space cache, dumping\n");
341 679 kmem_cache_free(btrfs_free_space_cachep, e);
342 gen = addr;
343 if (*gen != BTRFS_I(inode)->generation) {
344 printk(KERN_ERR "btrfs: space cache generation"
345 " (%llu) does not match inode (%llu)\n",
346 (unsigned long long)*gen,
347 (unsigned long long)
348 BTRFS_I(inode)->generation);
349 kunmap(page);
350 unlock_page(page);
351 page_cache_release(page);
352 goto free_cache; 680 goto free_cache;
353 } 681 }
354 addr += sizeof(u64); 682 } else {
355 offset += sizeof(u64); 683 BUG_ON(!num_bitmaps);
356 } 684 num_bitmaps--;
357 entry = addr; 685 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
358 686 if (!e->bitmap) {
359 while (1) { 687 kmem_cache_free(
360 if (!num_entries) 688 btrfs_free_space_cachep, e);
361 break;
362
363 need_loop = 1;
364 e = kmem_cache_zalloc(btrfs_free_space_cachep,
365 GFP_NOFS);
366 if (!e) {
367 kunmap(page);
368 unlock_page(page);
369 page_cache_release(page);
370 goto free_cache; 689 goto free_cache;
371 } 690 }
372 691 spin_lock(&ctl->tree_lock);
373 e->offset = le64_to_cpu(entry->offset); 692 ret = link_free_space(ctl, e);
374 e->bytes = le64_to_cpu(entry->bytes); 693 ctl->total_bitmaps++;
375 if (!e->bytes) { 694 ctl->op->recalc_thresholds(ctl);
376 kunmap(page); 695 spin_unlock(&ctl->tree_lock);
696 if (ret) {
697 printk(KERN_ERR "Duplicate entries in "
698 "free space cache, dumping\n");
377 kmem_cache_free(btrfs_free_space_cachep, e); 699 kmem_cache_free(btrfs_free_space_cachep, e);
378 unlock_page(page);
379 page_cache_release(page);
380 goto free_cache; 700 goto free_cache;
381 } 701 }
382 702 list_add_tail(&e->list, &bitmaps);
383 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
384 spin_lock(&ctl->tree_lock);
385 ret = link_free_space(ctl, e);
386 spin_unlock(&ctl->tree_lock);
387 if (ret) {
388 printk(KERN_ERR "Duplicate entries in "
389 "free space cache, dumping\n");
390 kunmap(page);
391 unlock_page(page);
392 page_cache_release(page);
393 goto free_cache;
394 }
395 } else {
396 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
397 if (!e->bitmap) {
398 kunmap(page);
399 kmem_cache_free(
400 btrfs_free_space_cachep, e);
401 unlock_page(page);
402 page_cache_release(page);
403 goto free_cache;
404 }
405 spin_lock(&ctl->tree_lock);
406 ret = link_free_space(ctl, e);
407 ctl->total_bitmaps++;
408 ctl->op->recalc_thresholds(ctl);
409 spin_unlock(&ctl->tree_lock);
410 if (ret) {
411 printk(KERN_ERR "Duplicate entries in "
412 "free space cache, dumping\n");
413 kunmap(page);
414 unlock_page(page);
415 page_cache_release(page);
416 goto free_cache;
417 }
418 list_add_tail(&e->list, &bitmaps);
419 }
420
421 num_entries--;
422 offset += sizeof(struct btrfs_free_space_entry);
423 if (offset + sizeof(struct btrfs_free_space_entry) >=
424 PAGE_CACHE_SIZE)
425 break;
426 entry++;
427 } 703 }
428 704
429 /* 705 num_entries--;
430 * We read an entry out of this page, we need to move on to the 706 }
431 * next page.
432 */
433 if (need_loop) {
434 kunmap(page);
435 goto next;
436 }
437 707
438 /* 708 io_ctl_unmap_page(&io_ctl);
439 * We add the bitmaps at the end of the entries in order that 709
440 * the bitmap entries are added to the cache. 710 /*
441 */ 711 * We add the bitmaps at the end of the entries in order that
442 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 712 * the bitmap entries are added to the cache.
713 */
714 list_for_each_entry_safe(e, n, &bitmaps, list) {
443 list_del_init(&e->list); 715 list_del_init(&e->list);
444 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 716 ret = io_ctl_read_bitmap(&io_ctl, e);
445 kunmap(page); 717 if (ret)
446 num_bitmaps--; 718 goto free_cache;
447next:
448 unlock_page(page);
449 page_cache_release(page);
450 index++;
451 } 719 }
452 720
721 io_ctl_drop_pages(&io_ctl);
453 ret = 1; 722 ret = 1;
454out: 723out:
724 io_ctl_free(&io_ctl);
455 return ret; 725 return ret;
456free_cache: 726free_cache:
727 io_ctl_drop_pages(&io_ctl);
457 __btrfs_remove_free_space_cache(ctl); 728 __btrfs_remove_free_space_cache(ctl);
458 goto out; 729 goto out;
459} 730}
@@ -465,7 +736,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
465 struct btrfs_root *root = fs_info->tree_root; 736 struct btrfs_root *root = fs_info->tree_root;
466 struct inode *inode; 737 struct inode *inode;
467 struct btrfs_path *path; 738 struct btrfs_path *path;
468 int ret; 739 int ret = 0;
469 bool matched; 740 bool matched;
470 u64 used = btrfs_block_group_used(&block_group->item); 741 u64 used = btrfs_block_group_used(&block_group->item);
471 742
@@ -497,6 +768,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
497 return 0; 768 return 0;
498 } 769 }
499 770
771 /* We may have converted the inode and made the cache invalid. */
772 spin_lock(&block_group->lock);
773 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
774 spin_unlock(&block_group->lock);
775 goto out;
776 }
777 spin_unlock(&block_group->lock);
778
500 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 779 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
501 path, block_group->key.objectid); 780 path, block_group->key.objectid);
502 btrfs_free_path(path); 781 btrfs_free_path(path);
@@ -530,6 +809,19 @@ out:
530 return ret; 809 return ret;
531} 810}
532 811
812/**
813 * __btrfs_write_out_cache - write out cached info to an inode
814 * @root - the root the inode belongs to
815 * @ctl - the free space cache we are going to write out
816 * @block_group - the block_group for this cache if it belongs to a block_group
817 * @trans - the trans handle
818 * @path - the path to use
819 * @offset - the offset for the key we'll insert
820 *
821 * This function writes out a free space cache struct to disk for quick recovery
822 * on mount. This will return 0 if it was successfull in writing the cache out,
823 * and -1 if it was not.
824 */
533int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 825int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
534 struct btrfs_free_space_ctl *ctl, 826 struct btrfs_free_space_ctl *ctl,
535 struct btrfs_block_group_cache *block_group, 827 struct btrfs_block_group_cache *block_group,
@@ -540,42 +832,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
540 struct extent_buffer *leaf; 832 struct extent_buffer *leaf;
541 struct rb_node *node; 833 struct rb_node *node;
542 struct list_head *pos, *n; 834 struct list_head *pos, *n;
543 struct page **pages;
544 struct page *page;
545 struct extent_state *cached_state = NULL; 835 struct extent_state *cached_state = NULL;
546 struct btrfs_free_cluster *cluster = NULL; 836 struct btrfs_free_cluster *cluster = NULL;
547 struct extent_io_tree *unpin = NULL; 837 struct extent_io_tree *unpin = NULL;
838 struct io_ctl io_ctl;
548 struct list_head bitmap_list; 839 struct list_head bitmap_list;
549 struct btrfs_key key; 840 struct btrfs_key key;
550 u64 start, end, len; 841 u64 start, end, len;
551 u64 bytes = 0;
552 u32 crc = ~(u32)0;
553 int index = 0, num_pages = 0;
554 int entries = 0; 842 int entries = 0;
555 int bitmaps = 0; 843 int bitmaps = 0;
556 int ret = -1; 844 int ret;
557 bool next_page = false; 845 int err = -1;
558 bool out_of_space = false;
559 846
560 INIT_LIST_HEAD(&bitmap_list); 847 INIT_LIST_HEAD(&bitmap_list);
561 848
562 node = rb_first(&ctl->free_space_offset);
563 if (!node)
564 return 0;
565
566 if (!i_size_read(inode)) 849 if (!i_size_read(inode))
567 return -1; 850 return -1;
568 851
569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 852 io_ctl_init(&io_ctl, inode, root);
570 PAGE_CACHE_SHIFT;
571
572 filemap_write_and_wait(inode->i_mapping);
573 btrfs_wait_ordered_range(inode, inode->i_size &
574 ~(root->sectorsize - 1), (u64)-1);
575
576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
577 if (!pages)
578 return -1;
579 853
580 /* Get the cluster for this block_group if it exists */ 854 /* Get the cluster for this block_group if it exists */
581 if (block_group && !list_empty(&block_group->cluster_list)) 855 if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +863,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
589 */ 863 */
590 unpin = root->fs_info->pinned_extents; 864 unpin = root->fs_info->pinned_extents;
591 865
592 /* 866 /* Lock all pages first so we can lock the extent safely. */
593 * Lock all pages first so we can lock the extent safely. 867 io_ctl_prepare_pages(&io_ctl, inode, 0);
594 *
595 * NOTE: Because we hold the ref the entire time we're going to write to
596 * the page find_get_page should never fail, so we don't do a check
597 * after find_get_page at this point. Just putting this here so people
598 * know and don't freak out.
599 */
600 while (index < num_pages) {
601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
602 if (!page) {
603 int i;
604 868
605 for (i = 0; i < num_pages; i++) {
606 unlock_page(pages[i]);
607 page_cache_release(pages[i]);
608 }
609 goto out;
610 }
611 pages[index] = page;
612 index++;
613 }
614
615 index = 0;
616 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 869 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
617 0, &cached_state, GFP_NOFS); 870 0, &cached_state, GFP_NOFS);
618 871
@@ -623,189 +876,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
623 if (block_group) 876 if (block_group)
624 start = block_group->key.objectid; 877 start = block_group->key.objectid;
625 878
626 /* Write out the extent entries */ 879 node = rb_first(&ctl->free_space_offset);
627 do { 880 if (!node && cluster) {
628 struct btrfs_free_space_entry *entry; 881 node = rb_first(&cluster->root);
629 void *addr, *orig; 882 cluster = NULL;
630 unsigned long offset = 0; 883 }
631 884
632 next_page = false; 885 /* Make sure we can fit our crcs into the first page */
886 if (io_ctl.check_crcs &&
887 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
888 WARN_ON(1);
889 goto out_nospc;
890 }
633 891
634 if (index >= num_pages) { 892 io_ctl_set_generation(&io_ctl, trans->transid);
635 out_of_space = true;
636 break;
637 }
638 893
639 page = pages[index]; 894 /* Write out the extent entries */
895 while (node) {
896 struct btrfs_free_space *e;
640 897
641 orig = addr = kmap(page); 898 e = rb_entry(node, struct btrfs_free_space, offset_index);
642 if (index == 0) { 899 entries++;
643 u64 *gen;
644 900
645 /* 901 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
646 * We're going to put in a bogus crc for this page to 902 e->bitmap);
647 * make sure that old kernels who aren't aware of this 903 if (ret)
648 * format will be sure to discard the cache. 904 goto out_nospc;
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652 905
653 gen = addr; 906 if (e->bitmap) {
654 *gen = trans->transid; 907 list_add_tail(&e->list, &bitmap_list);
655 addr += sizeof(u64); 908 bitmaps++;
656 offset += sizeof(u64);
657 } 909 }
658 entry = addr; 910 node = rb_next(node);
659 911 if (!node && cluster) {
660 memset(addr, 0, PAGE_CACHE_SIZE - offset); 912 node = rb_first(&cluster->root);
661 while (node && !next_page) { 913 cluster = NULL;
662 struct btrfs_free_space *e;
663
664 e = rb_entry(node, struct btrfs_free_space, offset_index);
665 entries++;
666
667 entry->offset = cpu_to_le64(e->offset);
668 entry->bytes = cpu_to_le64(e->bytes);
669 if (e->bitmap) {
670 entry->type = BTRFS_FREE_SPACE_BITMAP;
671 list_add_tail(&e->list, &bitmap_list);
672 bitmaps++;
673 } else {
674 entry->type = BTRFS_FREE_SPACE_EXTENT;
675 }
676 node = rb_next(node);
677 if (!node && cluster) {
678 node = rb_first(&cluster->root);
679 cluster = NULL;
680 }
681 offset += sizeof(struct btrfs_free_space_entry);
682 if (offset + sizeof(struct btrfs_free_space_entry) >=
683 PAGE_CACHE_SIZE)
684 next_page = true;
685 entry++;
686 } 914 }
915 }
687 916
688 /* 917 /*
689 * We want to add any pinned extents to our free space cache 918 * We want to add any pinned extents to our free space cache
690 * so we don't leak the space 919 * so we don't leak the space
691 */ 920 */
692 while (block_group && !next_page && 921 while (block_group && (start < block_group->key.objectid +
693 (start < block_group->key.objectid + 922 block_group->key.offset)) {
694 block_group->key.offset)) { 923 ret = find_first_extent_bit(unpin, start, &start, &end,
695 ret = find_first_extent_bit(unpin, start, &start, &end, 924 EXTENT_DIRTY);
696 EXTENT_DIRTY); 925 if (ret) {
697 if (ret) { 926 ret = 0;
698 ret = 0; 927 break;
699 break;
700 }
701
702 /* This pinned extent is out of our range */
703 if (start >= block_group->key.objectid +
704 block_group->key.offset)
705 break;
706
707 len = block_group->key.objectid +
708 block_group->key.offset - start;
709 len = min(len, end + 1 - start);
710
711 entries++;
712 entry->offset = cpu_to_le64(start);
713 entry->bytes = cpu_to_le64(len);
714 entry->type = BTRFS_FREE_SPACE_EXTENT;
715
716 start = end + 1;
717 offset += sizeof(struct btrfs_free_space_entry);
718 if (offset + sizeof(struct btrfs_free_space_entry) >=
719 PAGE_CACHE_SIZE)
720 next_page = true;
721 entry++;
722 } 928 }
723 929
724 /* Generate bogus crc value */ 930 /* This pinned extent is out of our range */
725 if (index == 0) { 931 if (start >= block_group->key.objectid +
726 u32 *tmp; 932 block_group->key.offset)
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc, 933 break;
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734 934
735 kunmap(page); 935 len = block_group->key.objectid +
936 block_group->key.offset - start;
937 len = min(len, end + 1 - start);
736 938
737 bytes += PAGE_CACHE_SIZE; 939 entries++;
940 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
941 if (ret)
942 goto out_nospc;
738 943
739 index++; 944 start = end + 1;
740 } while (node || next_page); 945 }
741 946
742 /* Write out the bitmaps */ 947 /* Write out the bitmaps */
743 list_for_each_safe(pos, n, &bitmap_list) { 948 list_for_each_safe(pos, n, &bitmap_list) {
744 void *addr;
745 struct btrfs_free_space *entry = 949 struct btrfs_free_space *entry =
746 list_entry(pos, struct btrfs_free_space, list); 950 list_entry(pos, struct btrfs_free_space, list);
747 951
748 if (index >= num_pages) { 952 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
749 out_of_space = true; 953 if (ret)
750 break; 954 goto out_nospc;
751 }
752 page = pages[index];
753
754 addr = kmap(page);
755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
756 kunmap(page);
757 bytes += PAGE_CACHE_SIZE;
758
759 list_del_init(&entry->list); 955 list_del_init(&entry->list);
760 index++;
761 }
762
763 if (out_of_space) {
764 btrfs_drop_pages(pages, num_pages);
765 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
766 i_size_read(inode) - 1, &cached_state,
767 GFP_NOFS);
768 ret = 0;
769 goto out;
770 } 956 }
771 957
772 /* Zero out the rest of the pages just to make sure */ 958 /* Zero out the rest of the pages just to make sure */
773 while (index < num_pages) { 959 io_ctl_zero_remaining_pages(&io_ctl);
774 void *addr;
775 960
776 page = pages[index]; 961 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
777 addr = kmap(page); 962 0, i_size_read(inode), &cached_state);
778 memset(addr, 0, PAGE_CACHE_SIZE); 963 io_ctl_drop_pages(&io_ctl);
779 kunmap(page);
780 bytes += PAGE_CACHE_SIZE;
781 index++;
782 }
783
784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
785 bytes, &cached_state);
786 btrfs_drop_pages(pages, num_pages);
787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 964 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
788 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 965 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
789 966
790 if (ret) { 967 if (ret)
791 ret = 0;
792 goto out; 968 goto out;
793 }
794 969
795 BTRFS_I(inode)->generation = trans->transid;
796 970
797 filemap_write_and_wait(inode->i_mapping); 971 ret = filemap_write_and_wait(inode->i_mapping);
972 if (ret)
973 goto out;
798 974
799 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 975 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
800 key.offset = offset; 976 key.offset = offset;
801 key.type = 0; 977 key.type = 0;
802 978
803 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 979 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
804 if (ret < 0) { 980 if (ret < 0) {
805 ret = -1; 981 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 982 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
807 EXTENT_DIRTY | EXTENT_DELALLOC | 983 GFP_NOFS);
808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
809 goto out; 984 goto out;
810 } 985 }
811 leaf = path->nodes[0]; 986 leaf = path->nodes[0];
@@ -816,15 +991,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
816 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 991 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
817 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 992 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
818 found_key.offset != offset) { 993 found_key.offset != offset) {
819 ret = -1; 994 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
820 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 995 inode->i_size - 1,
821 EXTENT_DIRTY | EXTENT_DELALLOC | 996 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
822 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 997 NULL, GFP_NOFS);
823 GFP_NOFS);
824 btrfs_release_path(path); 998 btrfs_release_path(path);
825 goto out; 999 goto out;
826 } 1000 }
827 } 1001 }
1002
1003 BTRFS_I(inode)->generation = trans->transid;
828 header = btrfs_item_ptr(leaf, path->slots[0], 1004 header = btrfs_item_ptr(leaf, path->slots[0],
829 struct btrfs_free_space_header); 1005 struct btrfs_free_space_header);
830 btrfs_set_free_space_entries(leaf, header, entries); 1006 btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +1009,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 btrfs_mark_buffer_dirty(leaf); 1009 btrfs_mark_buffer_dirty(leaf);
834 btrfs_release_path(path); 1010 btrfs_release_path(path);
835 1011
836 ret = 1; 1012 err = 0;
837
838out: 1013out:
839 kfree(pages); 1014 io_ctl_free(&io_ctl);
840 if (ret != 1) { 1015 if (err) {
841 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 1016 invalidate_inode_pages2(inode->i_mapping);
842 BTRFS_I(inode)->generation = 0; 1017 BTRFS_I(inode)->generation = 0;
843 } 1018 }
844 btrfs_update_inode(trans, root, inode); 1019 btrfs_update_inode(trans, root, inode);
845 return ret; 1020 return err;
1021
1022out_nospc:
1023 list_for_each_safe(pos, n, &bitmap_list) {
1024 struct btrfs_free_space *entry =
1025 list_entry(pos, struct btrfs_free_space, list);
1026 list_del_init(&entry->list);
1027 }
1028 io_ctl_drop_pages(&io_ctl);
1029 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1030 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1031 goto out;
846} 1032}
847 1033
848int btrfs_write_out_cache(struct btrfs_root *root, 1034int btrfs_write_out_cache(struct btrfs_root *root,
@@ -869,14 +1055,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
869 1055
870 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1056 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
871 path, block_group->key.objectid); 1057 path, block_group->key.objectid);
872 if (ret < 0) { 1058 if (ret) {
873 spin_lock(&block_group->lock); 1059 spin_lock(&block_group->lock);
874 block_group->disk_cache_state = BTRFS_DC_ERROR; 1060 block_group->disk_cache_state = BTRFS_DC_ERROR;
875 spin_unlock(&block_group->lock); 1061 spin_unlock(&block_group->lock);
876 ret = 0; 1062 ret = 0;
877 1063#ifdef DEBUG
878 printk(KERN_ERR "btrfs: failed to write free space cace " 1064 printk(KERN_ERR "btrfs: failed to write free space cace "
879 "for block group %llu\n", block_group->key.objectid); 1065 "for block group %llu\n", block_group->key.objectid);
1066#endif
880 } 1067 }
881 1068
882 iput(inode); 1069 iput(inode);
@@ -1283,6 +1470,7 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1283{ 1470{
1284 info->offset = offset_to_bitmap(ctl, offset); 1471 info->offset = offset_to_bitmap(ctl, offset);
1285 info->bytes = 0; 1472 info->bytes = 0;
1473 INIT_LIST_HEAD(&info->list);
1286 link_free_space(ctl, info); 1474 link_free_space(ctl, info);
1287 ctl->total_bitmaps++; 1475 ctl->total_bitmaps++;
1288 1476
@@ -1662,7 +1850,13 @@ again:
1662 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1850 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1663 1, 0); 1851 1, 0);
1664 if (!info) { 1852 if (!info) {
1665 WARN_ON(1); 1853 /* the tree logging code might be calling us before we
1854 * have fully loaded the free space rbtree for this
1855 * block group. So it is possible the entry won't
1856 * be in the rbtree yet at all. The caching code
1857 * will make sure not to put it in the rbtree if
1858 * the logging code has pinned it.
1859 */
1666 goto out_lock; 1860 goto out_lock;
1667 } 1861 }
1668 } 1862 }
@@ -1701,6 +1895,7 @@ again:
1701 ctl->total_bitmaps--; 1895 ctl->total_bitmaps--;
1702 } 1896 }
1703 kmem_cache_free(btrfs_free_space_cachep, info); 1897 kmem_cache_free(btrfs_free_space_cachep, info);
1898 ret = 0;
1704 goto out_lock; 1899 goto out_lock;
1705 } 1900 }
1706 1901
@@ -1708,7 +1903,8 @@ again:
1708 unlink_free_space(ctl, info); 1903 unlink_free_space(ctl, info);
1709 info->offset += bytes; 1904 info->offset += bytes;
1710 info->bytes -= bytes; 1905 info->bytes -= bytes;
1711 link_free_space(ctl, info); 1906 ret = link_free_space(ctl, info);
1907 WARN_ON(ret);
1712 goto out_lock; 1908 goto out_lock;
1713 } 1909 }
1714 1910
@@ -2124,6 +2320,7 @@ again:
2124 2320
2125 if (!found) { 2321 if (!found) {
2126 start = i; 2322 start = i;
2323 cluster->max_size = 0;
2127 found = true; 2324 found = true;
2128 } 2325 }
2129 2326
@@ -2267,16 +2464,23 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2267{ 2464{
2268 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2465 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2269 struct btrfs_free_space *entry; 2466 struct btrfs_free_space *entry;
2270 struct rb_node *node;
2271 int ret = -ENOSPC; 2467 int ret = -ENOSPC;
2468 u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2272 2469
2273 if (ctl->total_bitmaps == 0) 2470 if (ctl->total_bitmaps == 0)
2274 return -ENOSPC; 2471 return -ENOSPC;
2275 2472
2276 /* 2473 /*
2277 * First check our cached list of bitmaps and see if there is an entry 2474 * The bitmap that covers offset won't be in the list unless offset
2278 * here that will work. 2475 * is just its start offset.
2279 */ 2476 */
2477 entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2478 if (entry->offset != bitmap_offset) {
2479 entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2480 if (entry && list_empty(&entry->list))
2481 list_add(&entry->list, bitmaps);
2482 }
2483
2280 list_for_each_entry(entry, bitmaps, list) { 2484 list_for_each_entry(entry, bitmaps, list) {
2281 if (entry->bytes < min_bytes) 2485 if (entry->bytes < min_bytes)
2282 continue; 2486 continue;
@@ -2287,38 +2491,10 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2287 } 2491 }
2288 2492
2289 /* 2493 /*
2290 * If we do have entries on our list and we are here then we didn't find 2494 * The bitmaps list has all the bitmaps that record free space
2291 * anything, so go ahead and get the next entry after the last entry in 2495 * starting after offset, so no more search is required.
2292 * this list and start the search from there.
2293 */ 2496 */
2294 if (!list_empty(bitmaps)) { 2497 return -ENOSPC;
2295 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2296 list);
2297 node = rb_next(&entry->offset_index);
2298 if (!node)
2299 return -ENOSPC;
2300 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2301 goto search;
2302 }
2303
2304 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
2305 if (!entry)
2306 return -ENOSPC;
2307
2308search:
2309 node = &entry->offset_index;
2310 do {
2311 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2312 node = rb_next(&entry->offset_index);
2313 if (!entry->bitmap)
2314 continue;
2315 if (entry->bytes < min_bytes)
2316 continue;
2317 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2318 bytes, min_bytes);
2319 } while (ret && node);
2320
2321 return ret;
2322} 2498}
2323 2499
2324/* 2500/*
@@ -2336,8 +2512,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2336 u64 offset, u64 bytes, u64 empty_size) 2512 u64 offset, u64 bytes, u64 empty_size)
2337{ 2513{
2338 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 2514 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2339 struct list_head bitmaps;
2340 struct btrfs_free_space *entry, *tmp; 2515 struct btrfs_free_space *entry, *tmp;
2516 LIST_HEAD(bitmaps);
2341 u64 min_bytes; 2517 u64 min_bytes;
2342 int ret; 2518 int ret;
2343 2519
@@ -2376,7 +2552,6 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2376 goto out; 2552 goto out;
2377 } 2553 }
2378 2554
2379 INIT_LIST_HEAD(&bitmaps);
2380 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2555 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2381 bytes, min_bytes); 2556 bytes, min_bytes);
2382 if (ret) 2557 if (ret)
@@ -2472,9 +2647,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2472 spin_unlock(&ctl->tree_lock); 2647 spin_unlock(&ctl->tree_lock);
2473 2648
2474 if (bytes >= minlen) { 2649 if (bytes >= minlen) {
2475 int update_ret; 2650 struct btrfs_space_info *space_info;
2476 update_ret = btrfs_update_reserved_bytes(block_group, 2651 int update = 0;
2477 bytes, 1, 1); 2652
2653 space_info = block_group->space_info;
2654 spin_lock(&space_info->lock);
2655 spin_lock(&block_group->lock);
2656 if (!block_group->ro) {
2657 block_group->reserved += bytes;
2658 space_info->bytes_reserved += bytes;
2659 update = 1;
2660 }
2661 spin_unlock(&block_group->lock);
2662 spin_unlock(&space_info->lock);
2478 2663
2479 ret = btrfs_error_discard_extent(fs_info->extent_root, 2664 ret = btrfs_error_discard_extent(fs_info->extent_root,
2480 start, 2665 start,
@@ -2482,9 +2667,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2482 &actually_trimmed); 2667 &actually_trimmed);
2483 2668
2484 btrfs_add_free_space(block_group, start, bytes); 2669 btrfs_add_free_space(block_group, start, bytes);
2485 if (!update_ret) 2670 if (update) {
2486 btrfs_update_reserved_bytes(block_group, 2671 spin_lock(&space_info->lock);
2487 bytes, 0, 1); 2672 spin_lock(&block_group->lock);
2673 if (block_group->ro)
2674 space_info->bytes_readonly += bytes;
2675 block_group->reserved -= bytes;
2676 space_info->bytes_reserved -= bytes;
2677 spin_unlock(&space_info->lock);
2678 spin_unlock(&block_group->lock);
2679 }
2488 2680
2489 if (ret) 2681 if (ret)
2490 break; 2682 break;
@@ -2643,9 +2835,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2643 return 0; 2835 return 0;
2644 2836
2645 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 2837 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2646 if (ret < 0) 2838 if (ret) {
2839 btrfs_delalloc_release_metadata(inode, inode->i_size);
2840#ifdef DEBUG
2647 printk(KERN_ERR "btrfs: failed to write free ino cache " 2841 printk(KERN_ERR "btrfs: failed to write free ino cache "
2648 "for root %llu\n", root->root_key.objectid); 2842 "for root %llu\n", root->root_key.objectid);
2843#endif
2844 }
2649 2845
2650 iput(inode); 2846 iput(inode);
2651 return ret; 2847 return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa871..f8962a957d65 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -398,6 +398,8 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; 398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
399 struct btrfs_path *path; 399 struct btrfs_path *path;
400 struct inode *inode; 400 struct inode *inode;
401 struct btrfs_block_rsv *rsv;
402 u64 num_bytes;
401 u64 alloc_hint = 0; 403 u64 alloc_hint = 0;
402 int ret; 404 int ret;
403 int prealloc; 405 int prealloc;
@@ -421,11 +423,26 @@ int btrfs_save_ino_cache(struct btrfs_root *root,
421 if (!path) 423 if (!path)
422 return -ENOMEM; 424 return -ENOMEM;
423 425
426 rsv = trans->block_rsv;
427 trans->block_rsv = &root->fs_info->trans_block_rsv;
428
429 num_bytes = trans->bytes_reserved;
430 /*
431 * 1 item for inode item insertion if need
432 * 3 items for inode item update (in the worst case)
433 * 1 item for free space object
434 * 3 items for pre-allocation
435 */
436 trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
437 ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
438 trans->bytes_reserved);
439 if (ret)
440 goto out;
424again: 441again:
425 inode = lookup_free_ino_inode(root, path); 442 inode = lookup_free_ino_inode(root, path);
426 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { 443 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
427 ret = PTR_ERR(inode); 444 ret = PTR_ERR(inode);
428 goto out; 445 goto out_release;
429 } 446 }
430 447
431 if (IS_ERR(inode)) { 448 if (IS_ERR(inode)) {
@@ -434,7 +451,7 @@ again:
434 451
435 ret = create_free_ino_inode(root, trans, path); 452 ret = create_free_ino_inode(root, trans, path);
436 if (ret) 453 if (ret)
437 goto out; 454 goto out_release;
438 goto again; 455 goto again;
439 } 456 }
440 457
@@ -465,21 +482,26 @@ again:
465 /* Just to make sure we have enough space */ 482 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE; 483 prealloc += 8 * PAGE_CACHE_SIZE;
467 484
468 ret = btrfs_check_data_free_space(inode, prealloc); 485 ret = btrfs_delalloc_reserve_space(inode, prealloc);
469 if (ret) 486 if (ret)
470 goto out_put; 487 goto out_put;
471 488
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 489 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint); 490 prealloc, prealloc, &alloc_hint);
474 if (ret) 491 if (ret) {
492 btrfs_delalloc_release_space(inode, prealloc);
475 goto out_put; 493 goto out_put;
494 }
476 btrfs_free_reserved_data_space(inode, prealloc); 495 btrfs_free_reserved_data_space(inode, prealloc);
477 496
497 ret = btrfs_write_out_ino_cache(root, trans, path);
478out_put: 498out_put:
479 iput(inode); 499 iput(inode);
500out_release:
501 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
480out: 502out:
481 if (ret == 0) 503 trans->block_rsv = rsv;
482 ret = btrfs_write_out_ino_cache(root, trans, path); 504 trans->bytes_reserved = num_bytes;
483 505
484 btrfs_free_path(path); 506 btrfs_free_path(path);
485 return ret; 507 return ret;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b2d004ad66a0..2c984f7d4c2a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,10 +45,10 @@
45#include "btrfs_inode.h" 45#include "btrfs_inode.h"
46#include "ioctl.h" 46#include "ioctl.h"
47#include "print-tree.h" 47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h" 48#include "ordered-data.h"
50#include "xattr.h" 49#include "xattr.h"
51#include "tree-log.h" 50#include "tree-log.h"
51#include "volumes.h"
52#include "compression.h" 52#include "compression.h"
53#include "locking.h" 53#include "locking.h"
54#include "free-space-cache.h" 54#include "free-space-cache.h"
@@ -93,6 +93,8 @@ static noinline int cow_file_range(struct inode *inode,
93 struct page *locked_page, 93 struct page *locked_page,
94 u64 start, u64 end, int *page_started, 94 u64 start, u64 end, int *page_started,
95 unsigned long *nr_written, int unlock); 95 unsigned long *nr_written, int unlock);
96static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root, struct inode *inode);
96 98
97static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 99static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
98 struct inode *inode, struct inode *dir, 100 struct inode *inode, struct inode *dir,
@@ -393,7 +395,10 @@ again:
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 395 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
394 WARN_ON(pages); 396 WARN_ON(pages);
395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 397 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages); 398 if (!pages) {
399 /* just bail out to the uncompressed code */
400 goto cont;
401 }
397 402
398 if (BTRFS_I(inode)->force_compress) 403 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress; 404 compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +429,7 @@ again:
424 will_compress = 1; 429 will_compress = 1;
425 } 430 }
426 } 431 }
432cont:
427 if (start == 0) { 433 if (start == 0) {
428 trans = btrfs_join_transaction(root); 434 trans = btrfs_join_transaction(root);
429 BUG_ON(IS_ERR(trans)); 435 BUG_ON(IS_ERR(trans));
@@ -820,7 +826,7 @@ static noinline int cow_file_range(struct inode *inode,
820 } 826 }
821 827
822 BUG_ON(disk_num_bytes > 828 BUG_ON(disk_num_bytes >
823 btrfs_super_total_bytes(&root->fs_info->super_copy)); 829 btrfs_super_total_bytes(root->fs_info->super_copy));
824 830
825 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 831 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
826 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 832 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1737,7 +1743,7 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1737 trans = btrfs_join_transaction(root); 1743 trans = btrfs_join_transaction(root);
1738 BUG_ON(IS_ERR(trans)); 1744 BUG_ON(IS_ERR(trans));
1739 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1745 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1740 ret = btrfs_update_inode(trans, root, inode); 1746 ret = btrfs_update_inode_fallback(trans, root, inode);
1741 BUG_ON(ret); 1747 BUG_ON(ret);
1742 } 1748 }
1743 goto out; 1749 goto out;
@@ -1787,17 +1793,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1787 1793
1788 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1794 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1789 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1795 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1790 ret = btrfs_update_inode(trans, root, inode); 1796 ret = btrfs_update_inode_fallback(trans, root, inode);
1791 BUG_ON(ret); 1797 BUG_ON(ret);
1792 } 1798 }
1793 ret = 0; 1799 ret = 0;
1794out: 1800out:
1795 if (nolock) { 1801 if (root != root->fs_info->tree_root)
1796 if (trans)
1797 btrfs_end_transaction_nolock(trans, root);
1798 } else {
1799 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1802 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1800 if (trans) 1803 if (trans) {
1804 if (nolock)
1805 btrfs_end_transaction_nolock(trans, root);
1806 else
1801 btrfs_end_transaction(trans, root); 1807 btrfs_end_transaction(trans, root);
1802 } 1808 }
1803 1809
@@ -1819,153 +1825,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1819} 1825}
1820 1826
1821/* 1827/*
1822 * When IO fails, either with EIO or csum verification fails, we
1823 * try other mirrors that might have a good copy of the data. This
1824 * io_failure_record is used to record state as we go through all the
1825 * mirrors. If another mirror has good data, the page is set up to date
1826 * and things continue. If a good mirror can't be found, the original
1827 * bio end_io callback is called to indicate things have failed.
1828 */
1829struct io_failure_record {
1830 struct page *page;
1831 u64 start;
1832 u64 len;
1833 u64 logical;
1834 unsigned long bio_flags;
1835 int last_mirror;
1836};
1837
1838static int btrfs_io_failed_hook(struct bio *failed_bio,
1839 struct page *page, u64 start, u64 end,
1840 struct extent_state *state)
1841{
1842 struct io_failure_record *failrec = NULL;
1843 u64 private;
1844 struct extent_map *em;
1845 struct inode *inode = page->mapping->host;
1846 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1847 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1848 struct bio *bio;
1849 int num_copies;
1850 int ret;
1851 int rw;
1852 u64 logical;
1853
1854 ret = get_state_private(failure_tree, start, &private);
1855 if (ret) {
1856 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1857 if (!failrec)
1858 return -ENOMEM;
1859 failrec->start = start;
1860 failrec->len = end - start + 1;
1861 failrec->last_mirror = 0;
1862 failrec->bio_flags = 0;
1863
1864 read_lock(&em_tree->lock);
1865 em = lookup_extent_mapping(em_tree, start, failrec->len);
1866 if (em->start > start || em->start + em->len < start) {
1867 free_extent_map(em);
1868 em = NULL;
1869 }
1870 read_unlock(&em_tree->lock);
1871
1872 if (IS_ERR_OR_NULL(em)) {
1873 kfree(failrec);
1874 return -EIO;
1875 }
1876 logical = start - em->start;
1877 logical = em->block_start + logical;
1878 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1879 logical = em->block_start;
1880 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1881 extent_set_compress_type(&failrec->bio_flags,
1882 em->compress_type);
1883 }
1884 failrec->logical = logical;
1885 free_extent_map(em);
1886 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1887 EXTENT_DIRTY, GFP_NOFS);
1888 set_state_private(failure_tree, start,
1889 (u64)(unsigned long)failrec);
1890 } else {
1891 failrec = (struct io_failure_record *)(unsigned long)private;
1892 }
1893 num_copies = btrfs_num_copies(
1894 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1895 failrec->logical, failrec->len);
1896 failrec->last_mirror++;
1897 if (!state) {
1898 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1899 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1900 failrec->start,
1901 EXTENT_LOCKED);
1902 if (state && state->start != failrec->start)
1903 state = NULL;
1904 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1905 }
1906 if (!state || failrec->last_mirror > num_copies) {
1907 set_state_private(failure_tree, failrec->start, 0);
1908 clear_extent_bits(failure_tree, failrec->start,
1909 failrec->start + failrec->len - 1,
1910 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1911 kfree(failrec);
1912 return -EIO;
1913 }
1914 bio = bio_alloc(GFP_NOFS, 1);
1915 bio->bi_private = state;
1916 bio->bi_end_io = failed_bio->bi_end_io;
1917 bio->bi_sector = failrec->logical >> 9;
1918 bio->bi_bdev = failed_bio->bi_bdev;
1919 bio->bi_size = 0;
1920
1921 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1922 if (failed_bio->bi_rw & REQ_WRITE)
1923 rw = WRITE;
1924 else
1925 rw = READ;
1926
1927 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1928 failrec->last_mirror,
1929 failrec->bio_flags, 0);
1930 return ret;
1931}
1932
1933/*
1934 * each time an IO finishes, we do a fast check in the IO failure tree
1935 * to see if we need to process or clean up an io_failure_record
1936 */
1937static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1938{
1939 u64 private;
1940 u64 private_failure;
1941 struct io_failure_record *failure;
1942 int ret;
1943
1944 private = 0;
1945 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1946 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1947 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1948 start, &private_failure);
1949 if (ret == 0) {
1950 failure = (struct io_failure_record *)(unsigned long)
1951 private_failure;
1952 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1953 failure->start, 0);
1954 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1955 failure->start,
1956 failure->start + failure->len - 1,
1957 EXTENT_DIRTY | EXTENT_LOCKED,
1958 GFP_NOFS);
1959 kfree(failure);
1960 }
1961 }
1962 return 0;
1963}
1964
1965/*
1966 * when reads are done, we need to check csums to verify the data is correct 1828 * when reads are done, we need to check csums to verify the data is correct
1967 * if there's a match, we allow the bio to finish. If not, we go through 1829 * if there's a match, we allow the bio to finish. If not, the code in
1968 * the io_failure_record routines to find good copies 1830 * extent_io.c will try to find good copies for us.
1969 */ 1831 */
1970static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1832static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1971 struct extent_state *state) 1833 struct extent_state *state)
@@ -2011,10 +1873,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2011 1873
2012 kunmap_atomic(kaddr, KM_USER0); 1874 kunmap_atomic(kaddr, KM_USER0);
2013good: 1875good:
2014 /* if the io failure tree for this inode is non-empty,
2015 * check to see if we've recovered from a failed IO
2016 */
2017 btrfs_clean_io_failures(inode, start);
2018 return 0; 1876 return 0;
2019 1877
2020zeroit: 1878zeroit:
@@ -2079,89 +1937,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
2079 up_read(&root->fs_info->cleanup_work_sem); 1937 up_read(&root->fs_info->cleanup_work_sem);
2080} 1938}
2081 1939
2082/*
2083 * calculate extra metadata reservation when snapshotting a subvolume
2084 * contains orphan files.
2085 */
2086void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2087 struct btrfs_pending_snapshot *pending,
2088 u64 *bytes_to_reserve)
2089{
2090 struct btrfs_root *root;
2091 struct btrfs_block_rsv *block_rsv;
2092 u64 num_bytes;
2093 int index;
2094
2095 root = pending->root;
2096 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2097 return;
2098
2099 block_rsv = root->orphan_block_rsv;
2100
2101 /* orphan block reservation for the snapshot */
2102 num_bytes = block_rsv->size;
2103
2104 /*
2105 * after the snapshot is created, COWing tree blocks may use more
2106 * space than it frees. So we should make sure there is enough
2107 * reserved space.
2108 */
2109 index = trans->transid & 0x1;
2110 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2111 num_bytes += block_rsv->size -
2112 (block_rsv->reserved + block_rsv->freed[index]);
2113 }
2114
2115 *bytes_to_reserve += num_bytes;
2116}
2117
2118void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2119 struct btrfs_pending_snapshot *pending)
2120{
2121 struct btrfs_root *root = pending->root;
2122 struct btrfs_root *snap = pending->snap;
2123 struct btrfs_block_rsv *block_rsv;
2124 u64 num_bytes;
2125 int index;
2126 int ret;
2127
2128 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
2129 return;
2130
2131 /* refill source subvolume's orphan block reservation */
2132 block_rsv = root->orphan_block_rsv;
2133 index = trans->transid & 0x1;
2134 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
2135 num_bytes = block_rsv->size -
2136 (block_rsv->reserved + block_rsv->freed[index]);
2137 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2138 root->orphan_block_rsv,
2139 num_bytes);
2140 BUG_ON(ret);
2141 }
2142
2143 /* setup orphan block reservation for the snapshot */
2144 block_rsv = btrfs_alloc_block_rsv(snap);
2145 BUG_ON(!block_rsv);
2146
2147 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2148 snap->orphan_block_rsv = block_rsv;
2149
2150 num_bytes = root->orphan_block_rsv->size;
2151 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2152 block_rsv, num_bytes);
2153 BUG_ON(ret);
2154
2155#if 0
2156 /* insert orphan item for the snapshot */
2157 WARN_ON(!root->orphan_item_inserted);
2158 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2159 snap->root_key.objectid);
2160 BUG_ON(ret);
2161 snap->orphan_item_inserted = 1;
2162#endif
2163}
2164
2165enum btrfs_orphan_cleanup_state { 1940enum btrfs_orphan_cleanup_state {
2166 ORPHAN_CLEANUP_STARTED = 1, 1941 ORPHAN_CLEANUP_STARTED = 1,
2167 ORPHAN_CLEANUP_DONE = 2, 1942 ORPHAN_CLEANUP_DONE = 2,
@@ -2247,9 +2022,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2247 } 2022 }
2248 spin_unlock(&root->orphan_lock); 2023 spin_unlock(&root->orphan_lock);
2249 2024
2250 if (block_rsv)
2251 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2252
2253 /* grab metadata reservation from transaction handle */ 2025 /* grab metadata reservation from transaction handle */
2254 if (reserve) { 2026 if (reserve) {
2255 ret = btrfs_orphan_reserve_metadata(trans, inode); 2027 ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2316,6 +2088,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2316 struct btrfs_key key, found_key; 2088 struct btrfs_key key, found_key;
2317 struct btrfs_trans_handle *trans; 2089 struct btrfs_trans_handle *trans;
2318 struct inode *inode; 2090 struct inode *inode;
2091 u64 last_objectid = 0;
2319 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2092 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2320 2093
2321 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2094 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2367,41 +2140,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2367 * crossing root thing. we store the inode number in the 2140 * crossing root thing. we store the inode number in the
2368 * offset of the orphan item. 2141 * offset of the orphan item.
2369 */ 2142 */
2143
2144 if (found_key.offset == last_objectid) {
2145 printk(KERN_ERR "btrfs: Error removing orphan entry, "
2146 "stopping orphan cleanup\n");
2147 ret = -EINVAL;
2148 goto out;
2149 }
2150
2151 last_objectid = found_key.offset;
2152
2370 found_key.objectid = found_key.offset; 2153 found_key.objectid = found_key.offset;
2371 found_key.type = BTRFS_INODE_ITEM_KEY; 2154 found_key.type = BTRFS_INODE_ITEM_KEY;
2372 found_key.offset = 0; 2155 found_key.offset = 0;
2373 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2156 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2374 if (IS_ERR(inode)) { 2157 ret = PTR_RET(inode);
2375 ret = PTR_ERR(inode); 2158 if (ret && ret != -ESTALE)
2376 goto out; 2159 goto out;
2377 }
2378 2160
2379 /* 2161 /*
2380 * add this inode to the orphan list so btrfs_orphan_del does 2162 * Inode is already gone but the orphan item is still there,
2381 * the proper thing when we hit it 2163 * kill the orphan item.
2382 */ 2164 */
2383 spin_lock(&root->orphan_lock); 2165 if (ret == -ESTALE) {
2384 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); 2166 trans = btrfs_start_transaction(root, 1);
2385 spin_unlock(&root->orphan_lock);
2386
2387 /*
2388 * if this is a bad inode, means we actually succeeded in
2389 * removing the inode, but not the orphan record, which means
2390 * we need to manually delete the orphan since iput will just
2391 * do a destroy_inode
2392 */
2393 if (is_bad_inode(inode)) {
2394 trans = btrfs_start_transaction(root, 0);
2395 if (IS_ERR(trans)) { 2167 if (IS_ERR(trans)) {
2396 ret = PTR_ERR(trans); 2168 ret = PTR_ERR(trans);
2397 goto out; 2169 goto out;
2398 } 2170 }
2399 btrfs_orphan_del(trans, inode); 2171 ret = btrfs_del_orphan_item(trans, root,
2172 found_key.objectid);
2173 BUG_ON(ret);
2400 btrfs_end_transaction(trans, root); 2174 btrfs_end_transaction(trans, root);
2401 iput(inode);
2402 continue; 2175 continue;
2403 } 2176 }
2404 2177
2178 /*
2179 * add this inode to the orphan list so btrfs_orphan_del does
2180 * the proper thing when we hit it
2181 */
2182 spin_lock(&root->orphan_lock);
2183 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2184 spin_unlock(&root->orphan_lock);
2185
2405 /* if we have links, this was a truncate, lets do that */ 2186 /* if we have links, this was a truncate, lets do that */
2406 if (inode->i_nlink) { 2187 if (inode->i_nlink) {
2407 if (!S_ISREG(inode->i_mode)) { 2188 if (!S_ISREG(inode->i_mode)) {
@@ -2420,6 +2201,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2420 if (ret) 2201 if (ret)
2421 goto out; 2202 goto out;
2422 } 2203 }
2204 /* release the path since we're done with it */
2205 btrfs_release_path(path);
2206
2423 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2207 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2424 2208
2425 if (root->orphan_block_rsv) 2209 if (root->orphan_block_rsv)
@@ -2534,7 +2318,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
2534 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2318 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2535 struct btrfs_inode_item); 2319 struct btrfs_inode_item);
2536 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2320 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2537 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2321 set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
2538 inode->i_uid = btrfs_inode_uid(leaf, inode_item); 2322 inode->i_uid = btrfs_inode_uid(leaf, inode_item);
2539 inode->i_gid = btrfs_inode_gid(leaf, inode_item); 2323 inode->i_gid = btrfs_inode_gid(leaf, inode_item);
2540 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 2324 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
@@ -2647,7 +2431,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2647/* 2431/*
2648 * copy everything in the in-memory inode into the btree. 2432 * copy everything in the in-memory inode into the btree.
2649 */ 2433 */
2650noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, 2434static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
2651 struct btrfs_root *root, struct inode *inode) 2435 struct btrfs_root *root, struct inode *inode)
2652{ 2436{
2653 struct btrfs_inode_item *inode_item; 2437 struct btrfs_inode_item *inode_item;
@@ -2655,21 +2439,6 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2655 struct extent_buffer *leaf; 2439 struct extent_buffer *leaf;
2656 int ret; 2440 int ret;
2657 2441
2658 /*
2659 * If the inode is a free space inode, we can deadlock during commit
2660 * if we put it into the delayed code.
2661 *
2662 * The data relocation inode should also be directly updated
2663 * without delay
2664 */
2665 if (!btrfs_is_free_space_inode(root, inode)
2666 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2667 ret = btrfs_delayed_update_inode(trans, root, inode);
2668 if (!ret)
2669 btrfs_set_inode_last_trans(trans, inode);
2670 return ret;
2671 }
2672
2673 path = btrfs_alloc_path(); 2442 path = btrfs_alloc_path();
2674 if (!path) 2443 if (!path)
2675 return -ENOMEM; 2444 return -ENOMEM;
@@ -2698,6 +2467,43 @@ failed:
2698} 2467}
2699 2468
2700/* 2469/*
2470 * copy everything in the in-memory inode into the btree.
2471 */
2472noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2473 struct btrfs_root *root, struct inode *inode)
2474{
2475 int ret;
2476
2477 /*
2478 * If the inode is a free space inode, we can deadlock during commit
2479 * if we put it into the delayed code.
2480 *
2481 * The data relocation inode should also be directly updated
2482 * without delay
2483 */
2484 if (!btrfs_is_free_space_inode(root, inode)
2485 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2486 ret = btrfs_delayed_update_inode(trans, root, inode);
2487 if (!ret)
2488 btrfs_set_inode_last_trans(trans, inode);
2489 return ret;
2490 }
2491
2492 return btrfs_update_inode_item(trans, root, inode);
2493}
2494
2495static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
2496 struct btrfs_root *root, struct inode *inode)
2497{
2498 int ret;
2499
2500 ret = btrfs_update_inode(trans, root, inode);
2501 if (ret == -ENOSPC)
2502 return btrfs_update_inode_item(trans, root, inode);
2503 return ret;
2504}
2505
2506/*
2701 * unlink helper that gets used here in inode.c and in the tree logging 2507 * unlink helper that gets used here in inode.c and in the tree logging
2702 * recovery code. It remove a link in a directory with a given name, and 2508 * recovery code. It remove a link in a directory with a given name, and
2703 * also drops the back refs in the inode to the directory 2509 * also drops the back refs in the inode to the directory
@@ -2835,7 +2641,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2835 u64 ino = btrfs_ino(inode); 2641 u64 ino = btrfs_ino(inode);
2836 u64 dir_ino = btrfs_ino(dir); 2642 u64 dir_ino = btrfs_ino(dir);
2837 2643
2838 trans = btrfs_start_transaction(root, 10); 2644 /*
2645 * 1 for the possible orphan item
2646 * 1 for the dir item
2647 * 1 for the dir index
2648 * 1 for the inode ref
2649 * 1 for the inode ref in the tree log
2650 * 2 for the dir entries in the log
2651 * 1 for the inode
2652 */
2653 trans = btrfs_start_transaction(root, 8);
2839 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2654 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2840 return trans; 2655 return trans;
2841 2656
@@ -2858,7 +2673,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2858 return ERR_PTR(-ENOMEM); 2673 return ERR_PTR(-ENOMEM);
2859 } 2674 }
2860 2675
2861 trans = btrfs_start_transaction(root, 0); 2676 /* 1 for the orphan item */
2677 trans = btrfs_start_transaction(root, 1);
2862 if (IS_ERR(trans)) { 2678 if (IS_ERR(trans)) {
2863 btrfs_free_path(path); 2679 btrfs_free_path(path);
2864 root->fs_info->enospc_unlink = 0; 2680 root->fs_info->enospc_unlink = 0;
@@ -2963,6 +2779,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2963 err = 0; 2779 err = 0;
2964out: 2780out:
2965 btrfs_free_path(path); 2781 btrfs_free_path(path);
2782 /* Migrate the orphan reservation over */
2783 if (!err)
2784 err = btrfs_block_rsv_migrate(trans->block_rsv,
2785 &root->fs_info->global_block_rsv,
2786 trans->bytes_reserved);
2787
2966 if (err) { 2788 if (err) {
2967 btrfs_end_transaction(trans, root); 2789 btrfs_end_transaction(trans, root);
2968 root->fs_info->enospc_unlink = 0; 2790 root->fs_info->enospc_unlink = 0;
@@ -2977,6 +2799,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2977 struct btrfs_root *root) 2799 struct btrfs_root *root)
2978{ 2800{
2979 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2801 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2802 btrfs_block_rsv_release(root, trans->block_rsv,
2803 trans->bytes_reserved);
2804 trans->block_rsv = &root->fs_info->trans_block_rsv;
2980 BUG_ON(!root->fs_info->enospc_unlink); 2805 BUG_ON(!root->fs_info->enospc_unlink);
2981 root->fs_info->enospc_unlink = 0; 2806 root->fs_info->enospc_unlink = 0;
2982 } 2807 }
@@ -3368,6 +3193,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3368 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3193 pgoff_t index = from >> PAGE_CACHE_SHIFT;
3369 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3194 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3370 struct page *page; 3195 struct page *page;
3196 gfp_t mask = btrfs_alloc_write_mask(mapping);
3371 int ret = 0; 3197 int ret = 0;
3372 u64 page_start; 3198 u64 page_start;
3373 u64 page_end; 3199 u64 page_end;
@@ -3380,7 +3206,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3380 3206
3381 ret = -ENOMEM; 3207 ret = -ENOMEM;
3382again: 3208again:
3383 page = find_or_create_page(mapping, index, GFP_NOFS); 3209 page = find_or_create_page(mapping, index, mask);
3384 if (!page) { 3210 if (!page) {
3385 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3211 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3386 goto out; 3212 goto out;
@@ -3613,6 +3439,8 @@ void btrfs_evict_inode(struct inode *inode)
3613{ 3439{
3614 struct btrfs_trans_handle *trans; 3440 struct btrfs_trans_handle *trans;
3615 struct btrfs_root *root = BTRFS_I(inode)->root; 3441 struct btrfs_root *root = BTRFS_I(inode)->root;
3442 struct btrfs_block_rsv *rsv, *global_rsv;
3443 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3616 unsigned long nr; 3444 unsigned long nr;
3617 int ret; 3445 int ret;
3618 3446
@@ -3640,22 +3468,55 @@ void btrfs_evict_inode(struct inode *inode)
3640 goto no_delete; 3468 goto no_delete;
3641 } 3469 }
3642 3470
3471 rsv = btrfs_alloc_block_rsv(root);
3472 if (!rsv) {
3473 btrfs_orphan_del(NULL, inode);
3474 goto no_delete;
3475 }
3476 rsv->size = min_size;
3477 global_rsv = &root->fs_info->global_block_rsv;
3478
3643 btrfs_i_size_write(inode, 0); 3479 btrfs_i_size_write(inode, 0);
3644 3480
3481 /*
3482 * This is a bit simpler than btrfs_truncate since
3483 *
3484 * 1) We've already reserved our space for our orphan item in the
3485 * unlink.
3486 * 2) We're going to delete the inode item, so we don't need to update
3487 * it at all.
3488 *
3489 * So we just need to reserve some slack space in case we add bytes when
3490 * doing the truncate.
3491 */
3645 while (1) { 3492 while (1) {
3646 trans = btrfs_join_transaction(root); 3493 ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
3647 BUG_ON(IS_ERR(trans)); 3494
3648 trans->block_rsv = root->orphan_block_rsv; 3495 /*
3496 * Try and steal from the global reserve since we will
3497 * likely not use this space anyway, we want to try as
3498 * hard as possible to get this to work.
3499 */
3500 if (ret)
3501 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3649 3502
3650 ret = btrfs_block_rsv_check(trans, root,
3651 root->orphan_block_rsv, 0, 5);
3652 if (ret) { 3503 if (ret) {
3653 BUG_ON(ret != -EAGAIN); 3504 printk(KERN_WARNING "Could not get space for a "
3654 ret = btrfs_commit_transaction(trans, root); 3505 "delete, will truncate on mount %d\n", ret);
3655 BUG_ON(ret); 3506 btrfs_orphan_del(NULL, inode);
3656 continue; 3507 btrfs_free_block_rsv(root, rsv);
3508 goto no_delete;
3509 }
3510
3511 trans = btrfs_start_transaction(root, 0);
3512 if (IS_ERR(trans)) {
3513 btrfs_orphan_del(NULL, inode);
3514 btrfs_free_block_rsv(root, rsv);
3515 goto no_delete;
3657 } 3516 }
3658 3517
3518 trans->block_rsv = rsv;
3519
3659 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3520 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3660 if (ret != -EAGAIN) 3521 if (ret != -EAGAIN)
3661 break; 3522 break;
@@ -3664,14 +3525,17 @@ void btrfs_evict_inode(struct inode *inode)
3664 btrfs_end_transaction(trans, root); 3525 btrfs_end_transaction(trans, root);
3665 trans = NULL; 3526 trans = NULL;
3666 btrfs_btree_balance_dirty(root, nr); 3527 btrfs_btree_balance_dirty(root, nr);
3667
3668 } 3528 }
3669 3529
3530 btrfs_free_block_rsv(root, rsv);
3531
3670 if (ret == 0) { 3532 if (ret == 0) {
3533 trans->block_rsv = root->orphan_block_rsv;
3671 ret = btrfs_orphan_del(trans, inode); 3534 ret = btrfs_orphan_del(trans, inode);
3672 BUG_ON(ret); 3535 BUG_ON(ret);
3673 } 3536 }
3674 3537
3538 trans->block_rsv = &root->fs_info->trans_block_rsv;
3675 if (!(root == root->fs_info->tree_root || 3539 if (!(root == root->fs_info->tree_root ||
3676 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3540 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3677 btrfs_return_ino(root, btrfs_ino(inode)); 3541 btrfs_return_ino(root, btrfs_ino(inode));
@@ -5795,8 +5659,7 @@ again:
5795 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5659 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5796 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5660 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5797 if (!ret) 5661 if (!ret)
5798 ret = btrfs_update_inode(trans, root, inode); 5662 err = btrfs_update_inode_fallback(trans, root, inode);
5799 err = ret;
5800 goto out; 5663 goto out;
5801 } 5664 }
5802 5665
@@ -5834,7 +5697,7 @@ again:
5834 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5697 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5835 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5698 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5836 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) 5699 if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
5837 btrfs_update_inode(trans, root, inode); 5700 btrfs_update_inode_fallback(trans, root, inode);
5838 ret = 0; 5701 ret = 0;
5839out_unlock: 5702out_unlock:
5840 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5703 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
@@ -6289,7 +6152,7 @@ int btrfs_readpage(struct file *file, struct page *page)
6289{ 6152{
6290 struct extent_io_tree *tree; 6153 struct extent_io_tree *tree;
6291 tree = &BTRFS_I(page->mapping->host)->io_tree; 6154 tree = &BTRFS_I(page->mapping->host)->io_tree;
6292 return extent_read_full_page(tree, page, btrfs_get_extent); 6155 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
6293} 6156}
6294 6157
6295static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6158static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -6541,6 +6404,7 @@ static int btrfs_truncate(struct inode *inode)
6541 struct btrfs_trans_handle *trans; 6404 struct btrfs_trans_handle *trans;
6542 unsigned long nr; 6405 unsigned long nr;
6543 u64 mask = root->sectorsize - 1; 6406 u64 mask = root->sectorsize - 1;
6407 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6544 6408
6545 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6409 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6546 if (ret) 6410 if (ret)
@@ -6588,19 +6452,23 @@ static int btrfs_truncate(struct inode *inode)
6588 rsv = btrfs_alloc_block_rsv(root); 6452 rsv = btrfs_alloc_block_rsv(root);
6589 if (!rsv) 6453 if (!rsv)
6590 return -ENOMEM; 6454 return -ENOMEM;
6591 btrfs_add_durable_block_rsv(root->fs_info, rsv); 6455 rsv->size = min_size;
6592 6456
6457 /*
6458 * 1 for the truncate slack space
6459 * 1 for the orphan item we're going to add
6460 * 1 for the orphan item deletion
6461 * 1 for updating the inode.
6462 */
6593 trans = btrfs_start_transaction(root, 4); 6463 trans = btrfs_start_transaction(root, 4);
6594 if (IS_ERR(trans)) { 6464 if (IS_ERR(trans)) {
6595 err = PTR_ERR(trans); 6465 err = PTR_ERR(trans);
6596 goto out; 6466 goto out;
6597 } 6467 }
6598 6468
6599 /* 6469 /* Migrate the slack space for the truncate to our reserve */
6600 * Reserve space for the truncate process. Truncate should be adding 6470 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6601 * space, but if there are snapshots it may end up using space. 6471 min_size);
6602 */
6603 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6604 BUG_ON(ret); 6472 BUG_ON(ret);
6605 6473
6606 ret = btrfs_orphan_add(trans, inode); 6474 ret = btrfs_orphan_add(trans, inode);
@@ -6609,21 +6477,6 @@ static int btrfs_truncate(struct inode *inode)
6609 goto out; 6477 goto out;
6610 } 6478 }
6611 6479
6612 nr = trans->blocks_used;
6613 btrfs_end_transaction(trans, root);
6614 btrfs_btree_balance_dirty(root, nr);
6615
6616 /*
6617 * Ok so we've already migrated our bytes over for the truncate, so here
6618 * just reserve the one slot we need for updating the inode.
6619 */
6620 trans = btrfs_start_transaction(root, 1);
6621 if (IS_ERR(trans)) {
6622 err = PTR_ERR(trans);
6623 goto out;
6624 }
6625 trans->block_rsv = rsv;
6626
6627 /* 6480 /*
6628 * setattr is responsible for setting the ordered_data_close flag, 6481 * setattr is responsible for setting the ordered_data_close flag,
6629 * but that is only tested during the last file release. That 6482 * but that is only tested during the last file release. That
@@ -6645,20 +6498,30 @@ static int btrfs_truncate(struct inode *inode)
6645 btrfs_add_ordered_operation(trans, root, inode); 6498 btrfs_add_ordered_operation(trans, root, inode);
6646 6499
6647 while (1) { 6500 while (1) {
6501 ret = btrfs_block_rsv_refill(root, rsv, min_size);
6502 if (ret) {
6503 /*
6504 * This can only happen with the original transaction we
6505 * started above, every other time we shouldn't have a
6506 * transaction started yet.
6507 */
6508 if (ret == -EAGAIN)
6509 goto end_trans;
6510 err = ret;
6511 break;
6512 }
6513
6648 if (!trans) { 6514 if (!trans) {
6649 trans = btrfs_start_transaction(root, 3); 6515 /* Just need the 1 for updating the inode */
6516 trans = btrfs_start_transaction(root, 1);
6650 if (IS_ERR(trans)) { 6517 if (IS_ERR(trans)) {
6651 err = PTR_ERR(trans); 6518 err = PTR_ERR(trans);
6652 goto out; 6519 goto out;
6653 } 6520 }
6654
6655 ret = btrfs_truncate_reserve_metadata(trans, root,
6656 rsv);
6657 BUG_ON(ret);
6658
6659 trans->block_rsv = rsv;
6660 } 6521 }
6661 6522
6523 trans->block_rsv = rsv;
6524
6662 ret = btrfs_truncate_inode_items(trans, root, inode, 6525 ret = btrfs_truncate_inode_items(trans, root, inode,
6663 inode->i_size, 6526 inode->i_size,
6664 BTRFS_EXTENT_DATA_KEY); 6527 BTRFS_EXTENT_DATA_KEY);
@@ -6673,7 +6536,7 @@ static int btrfs_truncate(struct inode *inode)
6673 err = ret; 6536 err = ret;
6674 break; 6537 break;
6675 } 6538 }
6676 6539end_trans:
6677 nr = trans->blocks_used; 6540 nr = trans->blocks_used;
6678 btrfs_end_transaction(trans, root); 6541 btrfs_end_transaction(trans, root);
6679 trans = NULL; 6542 trans = NULL;
@@ -6693,14 +6556,16 @@ static int btrfs_truncate(struct inode *inode)
6693 ret = btrfs_orphan_del(NULL, inode); 6556 ret = btrfs_orphan_del(NULL, inode);
6694 } 6557 }
6695 6558
6696 trans->block_rsv = &root->fs_info->trans_block_rsv; 6559 if (trans) {
6697 ret = btrfs_update_inode(trans, root, inode); 6560 trans->block_rsv = &root->fs_info->trans_block_rsv;
6698 if (ret && !err) 6561 ret = btrfs_update_inode(trans, root, inode);
6699 err = ret; 6562 if (ret && !err)
6563 err = ret;
6700 6564
6701 nr = trans->blocks_used; 6565 nr = trans->blocks_used;
6702 ret = btrfs_end_transaction_throttle(trans, root); 6566 ret = btrfs_end_transaction_throttle(trans, root);
6703 btrfs_btree_balance_dirty(root, nr); 6567 btrfs_btree_balance_dirty(root, nr);
6568 }
6704 6569
6705out: 6570out:
6706 btrfs_free_block_rsv(root, rsv); 6571 btrfs_free_block_rsv(root, rsv);
@@ -6728,7 +6593,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
6728 inode->i_op = &btrfs_dir_inode_operations; 6593 inode->i_op = &btrfs_dir_inode_operations;
6729 inode->i_fop = &btrfs_dir_file_operations; 6594 inode->i_fop = &btrfs_dir_file_operations;
6730 6595
6731 inode->i_nlink = 1; 6596 set_nlink(inode, 1);
6732 btrfs_i_size_write(inode, 0); 6597 btrfs_i_size_write(inode, 0);
6733 6598
6734 err = btrfs_update_inode(trans, new_root, inode); 6599 err = btrfs_update_inode(trans, new_root, inode);
@@ -6755,9 +6620,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6755 ei->last_sub_trans = 0; 6620 ei->last_sub_trans = 0;
6756 ei->logged_trans = 0; 6621 ei->logged_trans = 0;
6757 ei->delalloc_bytes = 0; 6622 ei->delalloc_bytes = 0;
6758 ei->reserved_bytes = 0;
6759 ei->disk_i_size = 0; 6623 ei->disk_i_size = 0;
6760 ei->flags = 0; 6624 ei->flags = 0;
6625 ei->csum_bytes = 0;
6761 ei->index_cnt = (u64)-1; 6626 ei->index_cnt = (u64)-1;
6762 ei->last_unlink_trans = 0; 6627 ei->last_unlink_trans = 0;
6763 6628
@@ -6769,6 +6634,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6769 ei->orphan_meta_reserved = 0; 6634 ei->orphan_meta_reserved = 0;
6770 ei->dummy_inode = 0; 6635 ei->dummy_inode = 0;
6771 ei->in_defrag = 0; 6636 ei->in_defrag = 0;
6637 ei->delalloc_meta_reserved = 0;
6772 ei->force_compress = BTRFS_COMPRESS_NONE; 6638 ei->force_compress = BTRFS_COMPRESS_NONE;
6773 6639
6774 ei->delayed_node = NULL; 6640 ei->delayed_node = NULL;
@@ -6803,6 +6669,8 @@ void btrfs_destroy_inode(struct inode *inode)
6803 WARN_ON(inode->i_data.nrpages); 6669 WARN_ON(inode->i_data.nrpages);
6804 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6670 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6805 WARN_ON(BTRFS_I(inode)->reserved_extents); 6671 WARN_ON(BTRFS_I(inode)->reserved_extents);
6672 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6673 WARN_ON(BTRFS_I(inode)->csum_bytes);
6806 6674
6807 /* 6675 /*
6808 * This can happen where we create an inode, but somebody else also 6676 * This can happen where we create an inode, but somebody else also
@@ -6926,11 +6794,13 @@ static int btrfs_getattr(struct vfsmount *mnt,
6926 struct dentry *dentry, struct kstat *stat) 6794 struct dentry *dentry, struct kstat *stat)
6927{ 6795{
6928 struct inode *inode = dentry->d_inode; 6796 struct inode *inode = dentry->d_inode;
6797 u32 blocksize = inode->i_sb->s_blocksize;
6798
6929 generic_fillattr(inode, stat); 6799 generic_fillattr(inode, stat);
6930 stat->dev = BTRFS_I(inode)->root->anon_dev; 6800 stat->dev = BTRFS_I(inode)->root->anon_dev;
6931 stat->blksize = PAGE_CACHE_SIZE; 6801 stat->blksize = PAGE_CACHE_SIZE;
6932 stat->blocks = (inode_get_bytes(inode) + 6802 stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
6933 BTRFS_I(inode)->delalloc_bytes) >> 9; 6803 ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
6934 return 0; 6804 return 0;
6935} 6805}
6936 6806
@@ -7420,7 +7290,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
7420 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7290 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
7421 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7291 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
7422 .writepage_start_hook = btrfs_writepage_start_hook, 7292 .writepage_start_hook = btrfs_writepage_start_hook,
7423 .readpage_io_failed_hook = btrfs_io_failed_hook,
7424 .set_bit_hook = btrfs_set_bit_hook, 7293 .set_bit_hook = btrfs_set_bit_hook,
7425 .clear_bit_hook = btrfs_clear_bit_hook, 7294 .clear_bit_hook = btrfs_clear_bit_hook,
7426 .merge_extent_hook = btrfs_merge_extent_hook, 7295 .merge_extent_hook = btrfs_merge_extent_hook,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index dae5dfe41ba5..72d461656f60 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -51,6 +51,7 @@
51#include "volumes.h" 51#include "volumes.h"
52#include "locking.h" 52#include "locking.h"
53#include "inode-map.h" 53#include "inode-map.h"
54#include "backref.h"
54 55
55/* Mask out flags that are inappropriate for the given type of inode. */ 56/* Mask out flags that are inappropriate for the given type of inode. */
56static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 57static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -117,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
117/* 118/*
118 * Inherit flags from the parent inode. 119 * Inherit flags from the parent inode.
119 * 120 *
120 * Unlike extN we don't have any flags we don't want to inherit currently. 121 * Currently only the compression flags and the cow flags are inherited.
121 */ 122 */
122void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
123{ 124{
@@ -128,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
128 129
129 flags = BTRFS_I(dir)->flags; 130 flags = BTRFS_I(dir)->flags;
130 131
131 if (S_ISREG(inode->i_mode)) 132 if (flags & BTRFS_INODE_NOCOMPRESS) {
132 flags &= ~BTRFS_INODE_DIRSYNC; 133 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
133 else if (!S_ISDIR(inode->i_mode)) 134 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
134 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); 135 } else if (flags & BTRFS_INODE_COMPRESS) {
136 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
137 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
138 }
139
140 if (flags & BTRFS_INODE_NODATACOW)
141 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
135 142
136 BTRFS_I(inode)->flags = flags;
137 btrfs_update_iflags(inode); 143 btrfs_update_iflags(inode);
138} 144}
139 145
@@ -277,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
277 struct fstrim_range range; 283 struct fstrim_range range;
278 u64 minlen = ULLONG_MAX; 284 u64 minlen = ULLONG_MAX;
279 u64 num_devices = 0; 285 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
280 int ret; 287 int ret;
281 288
282 if (!capable(CAP_SYS_ADMIN)) 289 if (!capable(CAP_SYS_ADMIN))
@@ -295,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
295 } 302 }
296 } 303 }
297 rcu_read_unlock(); 304 rcu_read_unlock();
305
298 if (!num_devices) 306 if (!num_devices)
299 return -EOPNOTSUPP; 307 return -EOPNOTSUPP;
300
301 if (copy_from_user(&range, arg, sizeof(range))) 308 if (copy_from_user(&range, arg, sizeof(range)))
302 return -EFAULT; 309 return -EFAULT;
310 if (range.start > total_bytes)
311 return -EINVAL;
303 312
313 range.len = min(range.len, total_bytes - range.start);
304 range.minlen = max(range.minlen, minlen); 314 range.minlen = max(range.minlen, minlen);
305 ret = btrfs_trim_fs(root, &range); 315 ret = btrfs_trim_fs(root, &range);
306 if (ret < 0) 316 if (ret < 0)
@@ -760,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
760 int ret = 1; 770 int ret = 1;
761 771
762 /* 772 /*
763 * make sure that once we start defragging and extent, we keep on 773 * make sure that once we start defragging an extent, we keep on
764 * defragging it 774 * defragging it
765 */ 775 */
766 if (start < *defrag_end) 776 if (start < *defrag_end)
@@ -805,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
805 * extent will force at least part of that big extent to be defragged. 815 * extent will force at least part of that big extent to be defragged.
806 */ 816 */
807 if (ret) { 817 if (ret) {
808 *last_len += len;
809 *defrag_end = extent_map_end(em); 818 *defrag_end = extent_map_end(em);
810 } else { 819 } else {
811 *last_len = 0; 820 *last_len = 0;
@@ -843,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
843 int i_done; 852 int i_done;
844 struct btrfs_ordered_extent *ordered; 853 struct btrfs_ordered_extent *ordered;
845 struct extent_state *cached_state = NULL; 854 struct extent_state *cached_state = NULL;
855 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
846 856
847 if (isize == 0) 857 if (isize == 0)
848 return 0; 858 return 0;
@@ -860,7 +870,7 @@ again:
860 for (i = 0; i < num_pages; i++) { 870 for (i = 0; i < num_pages; i++) {
861 struct page *page; 871 struct page *page;
862 page = find_or_create_page(inode->i_mapping, 872 page = find_or_create_page(inode->i_mapping,
863 start_index + i, GFP_NOFS); 873 start_index + i, mask);
864 if (!page) 874 if (!page)
865 break; 875 break;
866 876
@@ -972,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
972 struct btrfs_super_block *disk_super; 982 struct btrfs_super_block *disk_super;
973 struct file_ra_state *ra = NULL; 983 struct file_ra_state *ra = NULL;
974 unsigned long last_index; 984 unsigned long last_index;
985 u64 isize = i_size_read(inode);
975 u64 features; 986 u64 features;
976 u64 last_len = 0; 987 u64 last_len = 0;
977 u64 skip = 0; 988 u64 skip = 0;
978 u64 defrag_end = 0; 989 u64 defrag_end = 0;
979 u64 newer_off = range->start; 990 u64 newer_off = range->start;
980 int newer_left = 0;
981 unsigned long i; 991 unsigned long i;
992 unsigned long ra_index = 0;
982 int ret; 993 int ret;
983 int defrag_count = 0; 994 int defrag_count = 0;
984 int compress_type = BTRFS_COMPRESS_ZLIB; 995 int compress_type = BTRFS_COMPRESS_ZLIB;
985 int extent_thresh = range->extent_thresh; 996 int extent_thresh = range->extent_thresh;
986 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 997 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
998 int cluster = max_cluster;
987 u64 new_align = ~((u64)128 * 1024 - 1); 999 u64 new_align = ~((u64)128 * 1024 - 1);
988 struct page **pages = NULL; 1000 struct page **pages = NULL;
989 1001
@@ -997,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
997 compress_type = range->compress_type; 1009 compress_type = range->compress_type;
998 } 1010 }
999 1011
1000 if (inode->i_size == 0) 1012 if (isize == 0)
1001 return 0; 1013 return 0;
1002 1014
1003 /* 1015 /*
@@ -1013,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1013 ra = &file->f_ra; 1025 ra = &file->f_ra;
1014 } 1026 }
1015 1027
1016 pages = kmalloc(sizeof(struct page *) * newer_cluster, 1028 pages = kmalloc(sizeof(struct page *) * max_cluster,
1017 GFP_NOFS); 1029 GFP_NOFS);
1018 if (!pages) { 1030 if (!pages) {
1019 ret = -ENOMEM; 1031 ret = -ENOMEM;
@@ -1022,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1022 1034
1023 /* find the last page to defrag */ 1035 /* find the last page to defrag */
1024 if (range->start + range->len > range->start) { 1036 if (range->start + range->len > range->start) {
1025 last_index = min_t(u64, inode->i_size - 1, 1037 last_index = min_t(u64, isize - 1,
1026 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1038 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
1027 } else { 1039 } else {
1028 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1040 last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1029 } 1041 }
1030 1042
1031 if (newer_than) { 1043 if (newer_than) {
@@ -1038,14 +1050,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1038 * the extents in the file evenly spaced 1050 * the extents in the file evenly spaced
1039 */ 1051 */
1040 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1052 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1041 newer_left = newer_cluster;
1042 } else 1053 } else
1043 goto out_ra; 1054 goto out_ra;
1044 } else { 1055 } else {
1045 i = range->start >> PAGE_CACHE_SHIFT; 1056 i = range->start >> PAGE_CACHE_SHIFT;
1046 } 1057 }
1047 if (!max_to_defrag) 1058 if (!max_to_defrag)
1048 max_to_defrag = last_index - 1; 1059 max_to_defrag = last_index;
1049 1060
1050 /* 1061 /*
1051 * make writeback starts from i, so the defrag range can be 1062 * make writeback starts from i, so the defrag range can be
@@ -1079,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1079 i = max(i + 1, next); 1090 i = max(i + 1, next);
1080 continue; 1091 continue;
1081 } 1092 }
1093
1094 if (!newer_than) {
1095 cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
1096 PAGE_CACHE_SHIFT) - i;
1097 cluster = min(cluster, max_cluster);
1098 } else {
1099 cluster = max_cluster;
1100 }
1101
1082 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1102 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
1083 BTRFS_I(inode)->force_compress = compress_type; 1103 BTRFS_I(inode)->force_compress = compress_type;
1084 1104
1085 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); 1105 if (i + cluster > ra_index) {
1106 ra_index = max(i, ra_index);
1107 btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
1108 cluster);
1109 ra_index += max_cluster;
1110 }
1086 1111
1087 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); 1112 ret = cluster_pages_for_defrag(inode, pages, i, cluster);
1088 if (ret < 0) 1113 if (ret < 0)
1089 goto out_ra; 1114 goto out_ra;
1090 1115
1091 defrag_count += ret; 1116 defrag_count += ret;
1092 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1117 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
1093 i += ret;
1094 1118
1095 if (newer_than) { 1119 if (newer_than) {
1096 if (newer_off == (u64)-1) 1120 if (newer_off == (u64)-1)
@@ -1105,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1105 if (!ret) { 1129 if (!ret) {
1106 range->start = newer_off; 1130 range->start = newer_off;
1107 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1131 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1108 newer_left = newer_cluster;
1109 } else { 1132 } else {
1110 break; 1133 break;
1111 } 1134 }
1112 } else { 1135 } else {
1113 i++; 1136 if (ret > 0) {
1137 i += ret;
1138 last_len += ret << PAGE_CACHE_SHIFT;
1139 } else {
1140 i++;
1141 last_len = 0;
1142 }
1114 } 1143 }
1115 } 1144 }
1116 1145
@@ -1136,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1136 mutex_unlock(&inode->i_mutex); 1165 mutex_unlock(&inode->i_mutex);
1137 } 1166 }
1138 1167
1139 disk_super = &root->fs_info->super_copy; 1168 disk_super = root->fs_info->super_copy;
1140 features = btrfs_super_incompat_flags(disk_super); 1169 features = btrfs_super_incompat_flags(disk_super);
1141 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1170 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1142 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1171 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1143 btrfs_set_super_incompat_flags(disk_super, features); 1172 btrfs_set_super_incompat_flags(disk_super, features);
1144 } 1173 }
1145 1174
1146 if (!file) 1175 ret = defrag_count;
1147 kfree(ra);
1148 return defrag_count;
1149 1176
1150out_ra: 1177out_ra:
1151 if (!file) 1178 if (!file)
@@ -1189,12 +1216,12 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1189 *devstr = '\0'; 1216 *devstr = '\0';
1190 devstr = vol_args->name; 1217 devstr = vol_args->name;
1191 devid = simple_strtoull(devstr, &end, 10); 1218 devid = simple_strtoull(devstr, &end, 10);
1192 printk(KERN_INFO "resizing devid %llu\n", 1219 printk(KERN_INFO "btrfs: resizing devid %llu\n",
1193 (unsigned long long)devid); 1220 (unsigned long long)devid);
1194 } 1221 }
1195 device = btrfs_find_device(root, devid, NULL, NULL); 1222 device = btrfs_find_device(root, devid, NULL, NULL);
1196 if (!device) { 1223 if (!device) {
1197 printk(KERN_INFO "resizer unable to find device %llu\n", 1224 printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
1198 (unsigned long long)devid); 1225 (unsigned long long)devid);
1199 ret = -EINVAL; 1226 ret = -EINVAL;
1200 goto out_unlock; 1227 goto out_unlock;
@@ -1240,7 +1267,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1240 do_div(new_size, root->sectorsize); 1267 do_div(new_size, root->sectorsize);
1241 new_size *= root->sectorsize; 1268 new_size *= root->sectorsize;
1242 1269
1243 printk(KERN_INFO "new size for %s is %llu\n", 1270 printk(KERN_INFO "btrfs: new size for %s is %llu\n",
1244 device->name, (unsigned long long)new_size); 1271 device->name, (unsigned long long)new_size);
1245 1272
1246 if (new_size > old_size) { 1273 if (new_size > old_size) {
@@ -1251,7 +1278,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
1251 } 1278 }
1252 ret = btrfs_grow_device(trans, device, new_size); 1279 ret = btrfs_grow_device(trans, device, new_size);
1253 btrfs_commit_transaction(trans, root); 1280 btrfs_commit_transaction(trans, root);
1254 } else { 1281 } else if (new_size < old_size) {
1255 ret = btrfs_shrink_device(device, new_size); 1282 ret = btrfs_shrink_device(device, new_size);
1256 } 1283 }
1257 1284
@@ -2587,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2587 return PTR_ERR(trans); 2614 return PTR_ERR(trans);
2588 } 2615 }
2589 2616
2590 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2617 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
2591 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 2618 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2592 dir_id, "default", 7, 1); 2619 dir_id, "default", 7, 1);
2593 if (IS_ERR_OR_NULL(di)) { 2620 if (IS_ERR_OR_NULL(di)) {
@@ -2603,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2603 btrfs_mark_buffer_dirty(path->nodes[0]); 2630 btrfs_mark_buffer_dirty(path->nodes[0]);
2604 btrfs_free_path(path); 2631 btrfs_free_path(path);
2605 2632
2606 disk_super = &root->fs_info->super_copy; 2633 disk_super = root->fs_info->super_copy;
2607 features = btrfs_super_incompat_flags(disk_super); 2634 features = btrfs_super_incompat_flags(disk_super);
2608 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { 2635 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2609 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; 2636 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2864,6 +2891,147 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
2864 return ret; 2891 return ret;
2865} 2892}
2866 2893
2894static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2895{
2896 int ret = 0;
2897 int i;
2898 u64 rel_ptr;
2899 int size;
2900 struct btrfs_ioctl_ino_path_args *ipa = NULL;
2901 struct inode_fs_paths *ipath = NULL;
2902 struct btrfs_path *path;
2903
2904 if (!capable(CAP_SYS_ADMIN))
2905 return -EPERM;
2906
2907 path = btrfs_alloc_path();
2908 if (!path) {
2909 ret = -ENOMEM;
2910 goto out;
2911 }
2912
2913 ipa = memdup_user(arg, sizeof(*ipa));
2914 if (IS_ERR(ipa)) {
2915 ret = PTR_ERR(ipa);
2916 ipa = NULL;
2917 goto out;
2918 }
2919
2920 size = min_t(u32, ipa->size, 4096);
2921 ipath = init_ipath(size, root, path);
2922 if (IS_ERR(ipath)) {
2923 ret = PTR_ERR(ipath);
2924 ipath = NULL;
2925 goto out;
2926 }
2927
2928 ret = paths_from_inode(ipa->inum, ipath);
2929 if (ret < 0)
2930 goto out;
2931
2932 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2933 rel_ptr = ipath->fspath->val[i] -
2934 (u64)(unsigned long)ipath->fspath->val;
2935 ipath->fspath->val[i] = rel_ptr;
2936 }
2937
2938 ret = copy_to_user((void *)(unsigned long)ipa->fspath,
2939 (void *)(unsigned long)ipath->fspath, size);
2940 if (ret) {
2941 ret = -EFAULT;
2942 goto out;
2943 }
2944
2945out:
2946 btrfs_free_path(path);
2947 free_ipath(ipath);
2948 kfree(ipa);
2949
2950 return ret;
2951}
2952
2953static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
2954{
2955 struct btrfs_data_container *inodes = ctx;
2956 const size_t c = 3 * sizeof(u64);
2957
2958 if (inodes->bytes_left >= c) {
2959 inodes->bytes_left -= c;
2960 inodes->val[inodes->elem_cnt] = inum;
2961 inodes->val[inodes->elem_cnt + 1] = offset;
2962 inodes->val[inodes->elem_cnt + 2] = root;
2963 inodes->elem_cnt += 3;
2964 } else {
2965 inodes->bytes_missing += c - inodes->bytes_left;
2966 inodes->bytes_left = 0;
2967 inodes->elem_missed += 3;
2968 }
2969
2970 return 0;
2971}
2972
2973static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2974 void __user *arg)
2975{
2976 int ret = 0;
2977 int size;
2978 u64 extent_offset;
2979 struct btrfs_ioctl_logical_ino_args *loi;
2980 struct btrfs_data_container *inodes = NULL;
2981 struct btrfs_path *path = NULL;
2982 struct btrfs_key key;
2983
2984 if (!capable(CAP_SYS_ADMIN))
2985 return -EPERM;
2986
2987 loi = memdup_user(arg, sizeof(*loi));
2988 if (IS_ERR(loi)) {
2989 ret = PTR_ERR(loi);
2990 loi = NULL;
2991 goto out;
2992 }
2993
2994 path = btrfs_alloc_path();
2995 if (!path) {
2996 ret = -ENOMEM;
2997 goto out;
2998 }
2999
3000 size = min_t(u32, loi->size, 4096);
3001 inodes = init_data_container(size);
3002 if (IS_ERR(inodes)) {
3003 ret = PTR_ERR(inodes);
3004 inodes = NULL;
3005 goto out;
3006 }
3007
3008 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3009
3010 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3011 ret = -ENOENT;
3012 if (ret < 0)
3013 goto out;
3014
3015 extent_offset = loi->logical - key.objectid;
3016 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3017 extent_offset, build_ino_list, inodes);
3018
3019 if (ret < 0)
3020 goto out;
3021
3022 ret = copy_to_user((void *)(unsigned long)loi->inodes,
3023 (void *)(unsigned long)inodes, size);
3024 if (ret)
3025 ret = -EFAULT;
3026
3027out:
3028 btrfs_free_path(path);
3029 kfree(inodes);
3030 kfree(loi);
3031
3032 return ret;
3033}
3034
2867long btrfs_ioctl(struct file *file, unsigned int 3035long btrfs_ioctl(struct file *file, unsigned int
2868 cmd, unsigned long arg) 3036 cmd, unsigned long arg)
2869{ 3037{
@@ -2921,6 +3089,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2921 return btrfs_ioctl_tree_search(file, argp); 3089 return btrfs_ioctl_tree_search(file, argp);
2922 case BTRFS_IOC_INO_LOOKUP: 3090 case BTRFS_IOC_INO_LOOKUP:
2923 return btrfs_ioctl_ino_lookup(file, argp); 3091 return btrfs_ioctl_ino_lookup(file, argp);
3092 case BTRFS_IOC_INO_PATHS:
3093 return btrfs_ioctl_ino_to_path(root, argp);
3094 case BTRFS_IOC_LOGICAL_INO:
3095 return btrfs_ioctl_logical_to_ino(root, argp);
2924 case BTRFS_IOC_SPACE_INFO: 3096 case BTRFS_IOC_SPACE_INFO:
2925 return btrfs_ioctl_space_info(root, argp); 3097 return btrfs_ioctl_space_info(root, argp);
2926 case BTRFS_IOC_SYNC: 3098 case BTRFS_IOC_SYNC:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea789fcb4..252ae9915de8 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,30 @@ struct btrfs_ioctl_space_args {
193 struct btrfs_ioctl_space_info spaces[0]; 193 struct btrfs_ioctl_space_info spaces[0];
194}; 194};
195 195
196struct btrfs_data_container {
197 __u32 bytes_left; /* out -- bytes not needed to deliver output */
198 __u32 bytes_missing; /* out -- additional bytes needed for result */
199 __u32 elem_cnt; /* out */
200 __u32 elem_missed; /* out */
201 __u64 val[0]; /* out */
202};
203
204struct btrfs_ioctl_ino_path_args {
205 __u64 inum; /* in */
206 __u32 size; /* in */
207 __u64 reserved[4];
208 /* struct btrfs_data_container *fspath; out */
209 __u64 fspath; /* out */
210};
211
212struct btrfs_ioctl_logical_ino_args {
213 __u64 logical; /* in */
214 __u32 size; /* in */
215 __u64 reserved[4];
216 /* struct btrfs_data_container *inodes; out */
217 __u64 inodes;
218};
219
196#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 220#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
197 struct btrfs_ioctl_vol_args) 221 struct btrfs_ioctl_vol_args)
198#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 222#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -248,4 +272,9 @@ struct btrfs_ioctl_space_args {
248 struct btrfs_ioctl_dev_info_args) 272 struct btrfs_ioctl_dev_info_args)
249#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 273#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
250 struct btrfs_ioctl_fs_info_args) 274 struct btrfs_ioctl_fs_info_args)
#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
					struct btrfs_ioctl_ino_path_args)
/*
 * Use the argument type the handler really reads.  Both argument structs
 * declare the same member sequence (__u64, __u32, __u64[4], __u64), so the
 * size encoded in the request number is expected to stay the same.
 */
#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
					struct btrfs_ioctl_logical_ino_args)
279
251#endif 280#endif
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e9..f38e452486b8 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
159{ 159{
160 int i; 160 int i;
161 u32 type; 161 u32 type, nr;
162 u32 nr = btrfs_header_nritems(l);
163 struct btrfs_item *item; 162 struct btrfs_item *item;
164 struct btrfs_root_item *ri; 163 struct btrfs_root_item *ri;
165 struct btrfs_dir_item *di; 164 struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
172 struct btrfs_key key; 171 struct btrfs_key key;
173 struct btrfs_key found_key; 172 struct btrfs_key found_key;
174 173
174 if (!l)
175 return;
176
177 nr = btrfs_header_nritems(l);
178
175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 179 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
176 (unsigned long long)btrfs_header_bytenr(l), nr, 180 (unsigned long long)btrfs_header_bytenr(l), nr,
177 btrfs_leaf_free_space(root, l)); 181 btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000000000000..2373b39a132b
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,951 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include "ctree.h"
27#include "volumes.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#undef DEBUG
32
33/*
34 * This is the implementation for the generic read ahead framework.
35 *
36 * To trigger a readahead, btrfs_reada_add must be called. It will start
37 * a read ahead for the given range [start, end) on tree root. The returned
38 * handle can either be used to wait on the readahead to finish
39 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
40 *
41 * The read ahead works as follows:
42 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
43 * reada_start_machine will then search for extents to prefetch and trigger
44 * some reads. When a read finishes for a node, all contained node/leaf
45 * pointers that lie in the given range will also be enqueued. The reads will
46 * be triggered in sequential order, thus giving a big win over a naive
47 * enumeration. It will also make use of multi-device layouts. Each disk
48 * will have its own read pointer and all disks will be utilized in parallel.
49 * Also, no two disks will read both sides of a mirror simultaneously, as this
50 * would waste seeking capacity. Instead both disks will read different parts
51 * of the filesystem.
52 * Any number of readaheads can be started in parallel. The read order will be
53 * determined globally, i.e. 2 parallel readaheads will normally finish faster
54 * than the 2 started one after another.
55 */
56
57#define MAX_MIRRORS 2
58#define MAX_IN_FLIGHT 6
59
60struct reada_extctl {
61 struct list_head list;
62 struct reada_control *rc;
63 u64 generation;
64};
65
66struct reada_extent {
67 u64 logical;
68 struct btrfs_key top;
69 u32 blocksize;
70 int err;
71 struct list_head extctl;
72 struct kref refcnt;
73 spinlock_t lock;
74 struct reada_zone *zones[MAX_MIRRORS];
75 int nzones;
76 struct btrfs_device *scheduled_for;
77};
78
79struct reada_zone {
80 u64 start;
81 u64 end;
82 u64 elems;
83 struct list_head list;
84 spinlock_t lock;
85 int locked;
86 struct btrfs_device *device;
87 struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
88 int ndevs;
89 struct kref refcnt;
90};
91
92struct reada_machine_work {
93 struct btrfs_work work;
94 struct btrfs_fs_info *fs_info;
95};
96
97static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
98static void reada_control_release(struct kref *kref);
99static void reada_zone_release(struct kref *kref);
100static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102
103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation);
105
106/* recurses */
107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
109 u64 start, int err)
110{
111 int level = 0;
112 int nritems;
113 int i;
114 u64 bytenr;
115 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121
122 if (eb)
123 level = btrfs_header_level(eb);
124
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 kref_get(&re->refcnt);
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock);
136 /*
137 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore
139 */
140 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock);
144
145 if (err == 0) {
146 nritems = level ? btrfs_header_nritems(eb) : 0;
147 generation = btrfs_header_generation(eb);
148 /*
149 * FIXME: currently we just set nritems to 0 if this is a leaf,
150 * effectively ignoring the content. In a next step we could
151 * trigger more readahead depending from the content, e.g.
152 * fetch the checksums for the extents in the leaf.
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
158 * just cleanup our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164
165 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec;
167 u64 n_gen;
168 struct btrfs_key key;
169 struct btrfs_key next_key;
170
171 btrfs_node_key_to_cpu(eb, &key, i);
172 if (i + 1 < nritems)
173 btrfs_node_key_to_cpu(eb, &next_key, i + 1);
174 else
175 next_key = re->top;
176 bytenr = btrfs_node_blockptr(eb, i);
177 n_gen = btrfs_node_ptr_generation(eb, i);
178
179 list_for_each_entry(rec, &list, list) {
180 struct reada_control *rc = rec->rc;
181
182 /*
183 * if the generation doesn't match, just ignore this
184 * extctl. This will probably cut off a branch from
185 * prefetch. Alternatively one could start a new (sub-)
186 * prefetch for this branch, starting again from root.
187 * FIXME: move the generation check out of this loop
188 */
189#ifdef DEBUG
190 if (rec->generation != generation) {
191 printk(KERN_DEBUG "generation mismatch for "
192 "(%llu,%d,%llu) %llu != %llu\n",
193 key.objectid, key.type, key.offset,
194 rec->generation, generation);
195 }
196#endif
197 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key,
201 level - 1, n_gen);
202 }
203 }
204 /*
205 * free extctl records
206 */
207 while (!list_empty(&list)) {
208 struct reada_control *rc;
209 struct reada_extctl *rec;
210
211 rec = list_first_entry(&list, struct reada_extctl, list);
212 list_del(&rec->list);
213 rc = rec->rc;
214 kfree(rec);
215
216 kref_get(&rc->refcnt);
217 if (atomic_dec_and_test(&rc->elems)) {
218 kref_put(&rc->refcnt, reada_control_release);
219 wake_up(&rc->wait);
220 }
221 kref_put(&rc->refcnt, reada_control_release);
222
223 reada_extent_put(fs_info, re); /* one ref for each entry */
224 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228
229 return 0;
230}
231
232/*
233 * start is passed separately in case eb in NULL, which may be the case with
234 * failed I/O
235 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
237 u64 start, int err)
238{
239 int ret;
240
241 ret = __readahead_hook(root, eb, start, err);
242
243 reada_start_machine(root->fs_info);
244
245 return ret;
246}
247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_bio *bbio)
251{
252 int ret;
253 int looped = 0;
254 struct reada_zone *zone;
255 struct btrfs_block_group_cache *cache = NULL;
256 u64 start;
257 u64 end;
258 int i;
259
260again:
261 zone = NULL;
262 spin_lock(&fs_info->reada_lock);
263 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
264 logical >> PAGE_CACHE_SHIFT, 1);
265 if (ret == 1)
266 kref_get(&zone->refcnt);
267 spin_unlock(&fs_info->reada_lock);
268
269 if (ret == 1) {
270 if (logical >= zone->start && logical < zone->end)
271 return zone;
272 spin_lock(&fs_info->reada_lock);
273 kref_put(&zone->refcnt, reada_zone_release);
274 spin_unlock(&fs_info->reada_lock);
275 }
276
277 if (looped)
278 return NULL;
279
280 cache = btrfs_lookup_block_group(fs_info, logical);
281 if (!cache)
282 return NULL;
283
284 start = cache->key.objectid;
285 end = start + cache->key.offset - 1;
286 btrfs_put_block_group(cache);
287
288 zone = kzalloc(sizeof(*zone), GFP_NOFS);
289 if (!zone)
290 return NULL;
291
292 zone->start = start;
293 zone->end = end;
294 INIT_LIST_HEAD(&zone->list);
295 spin_lock_init(&zone->lock);
296 zone->locked = 0;
297 kref_init(&zone->refcnt);
298 zone->elems = 0;
299 zone->device = dev; /* our device always sits at index 0 */
300 for (i = 0; i < bbio->num_stripes; ++i) {
301 /* bounds have already been checked */
302 zone->devs[i] = bbio->stripes[i].dev;
303 }
304 zone->ndevs = bbio->num_stripes;
305
306 spin_lock(&fs_info->reada_lock);
307 ret = radix_tree_insert(&dev->reada_zones,
308 (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
309 zone);
310 spin_unlock(&fs_info->reada_lock);
311
312 if (ret) {
313 kfree(zone);
314 looped = 1;
315 goto again;
316 }
317
318 return zone;
319}
320
321static struct reada_extent *reada_find_extent(struct btrfs_root *root,
322 u64 logical,
323 struct btrfs_key *top, int level)
324{
325 int ret;
326 int looped = 0;
327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_bio *bbio = NULL;
331 struct btrfs_device *dev;
332 u32 blocksize;
333 u64 length;
334 int nzones = 0;
335 int i;
336 unsigned long index = logical >> PAGE_CACHE_SHIFT;
337
338again:
339 spin_lock(&fs_info->reada_lock);
340 re = radix_tree_lookup(&fs_info->reada_tree, index);
341 if (re)
342 kref_get(&re->refcnt);
343 spin_unlock(&fs_info->reada_lock);
344
345 if (re || looped)
346 return re;
347
348 re = kzalloc(sizeof(*re), GFP_NOFS);
349 if (!re)
350 return NULL;
351
352 blocksize = btrfs_level_size(root, level);
353 re->logical = logical;
354 re->blocksize = blocksize;
355 re->top = *top;
356 INIT_LIST_HEAD(&re->extctl);
357 spin_lock_init(&re->lock);
358 kref_init(&re->refcnt);
359
360 /*
361 * map block
362 */
363 length = blocksize;
364 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
365 if (ret || !bbio || length < blocksize)
366 goto error;
367
368 if (bbio->num_stripes > MAX_MIRRORS) {
369 printk(KERN_ERR "btrfs readahead: more than %d copies not "
370 "supported", MAX_MIRRORS);
371 goto error;
372 }
373
374 for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
375 struct reada_zone *zone;
376
377 dev = bbio->stripes[nzones].dev;
378 zone = reada_find_zone(fs_info, dev, logical, bbio);
379 if (!zone)
380 break;
381
382 re->zones[nzones] = zone;
383 spin_lock(&zone->lock);
384 if (!zone->elems)
385 kref_get(&zone->refcnt);
386 ++zone->elems;
387 spin_unlock(&zone->lock);
388 spin_lock(&fs_info->reada_lock);
389 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock);
391 }
392 re->nzones = nzones;
393 if (nzones == 0) {
394 /* not a single zone found, error and out */
395 goto error;
396 }
397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 spin_lock(&fs_info->reada_lock);
400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
401 if (ret) {
402 spin_unlock(&fs_info->reada_lock);
403 if (ret != -ENOMEM) {
404 /* someone inserted the extent in the meantime */
405 looped = 1;
406 }
407 goto error;
408 }
409 for (i = 0; i < nzones; ++i) {
410 dev = bbio->stripes[i].dev;
411 ret = radix_tree_insert(&dev->reada_extents, index, re);
412 if (ret) {
413 while (--i >= 0) {
414 dev = bbio->stripes[i].dev;
415 BUG_ON(dev == NULL);
416 radix_tree_delete(&dev->reada_extents, index);
417 }
418 BUG_ON(fs_info == NULL);
419 radix_tree_delete(&fs_info->reada_tree, index);
420 spin_unlock(&fs_info->reada_lock);
421 goto error;
422 }
423 }
424 spin_unlock(&fs_info->reada_lock);
425
426 kfree(bbio);
427 return re;
428
429error:
430 while (nzones) {
431 struct reada_zone *zone;
432
433 --nzones;
434 zone = re->zones[nzones];
435 kref_get(&zone->refcnt);
436 spin_lock(&zone->lock);
437 --zone->elems;
438 if (zone->elems == 0) {
439 /*
440 * no fs_info->reada_lock needed, as this can't be
441 * the last ref
442 */
443 kref_put(&zone->refcnt, reada_zone_release);
444 }
445 spin_unlock(&zone->lock);
446
447 spin_lock(&fs_info->reada_lock);
448 kref_put(&zone->refcnt, reada_zone_release);
449 spin_unlock(&fs_info->reada_lock);
450 }
451 kfree(bbio);
452 kfree(re);
453 if (looped)
454 goto again;
455 return NULL;
456}
457
/*
 * Release callback that deliberately does nothing.  reada_extent_put()
 * passes it to kref_put() and performs the teardown itself based on
 * kref_put()'s return value.
 */
static void reada_kref_dummy(struct kref *kr)
{
	/* intentionally empty */
}
461
462static void reada_extent_put(struct btrfs_fs_info *fs_info,
463 struct reada_extent *re)
464{
465 int i;
466 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
467
468 spin_lock(&fs_info->reada_lock);
469 if (!kref_put(&re->refcnt, reada_kref_dummy)) {
470 spin_unlock(&fs_info->reada_lock);
471 return;
472 }
473
474 radix_tree_delete(&fs_info->reada_tree, index);
475 for (i = 0; i < re->nzones; ++i) {
476 struct reada_zone *zone = re->zones[i];
477
478 radix_tree_delete(&zone->device->reada_extents, index);
479 }
480
481 spin_unlock(&fs_info->reada_lock);
482
483 for (i = 0; i < re->nzones; ++i) {
484 struct reada_zone *zone = re->zones[i];
485
486 kref_get(&zone->refcnt);
487 spin_lock(&zone->lock);
488 --zone->elems;
489 if (zone->elems == 0) {
490 /* no fs_info->reada_lock needed, as this can't be
491 * the last ref */
492 kref_put(&zone->refcnt, reada_zone_release);
493 }
494 spin_unlock(&zone->lock);
495
496 spin_lock(&fs_info->reada_lock);
497 kref_put(&zone->refcnt, reada_zone_release);
498 spin_unlock(&fs_info->reada_lock);
499 }
500 if (re->scheduled_for)
501 atomic_dec(&re->scheduled_for->reada_in_flight);
502
503 kfree(re);
504}
505
506static void reada_zone_release(struct kref *kref)
507{
508 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
509
510 radix_tree_delete(&zone->device->reada_zones,
511 zone->end >> PAGE_CACHE_SHIFT);
512
513 kfree(zone);
514}
515
516static void reada_control_release(struct kref *kref)
517{
518 struct reada_control *rc = container_of(kref, struct reada_control,
519 refcnt);
520
521 kfree(rc);
522}
523
524static int reada_add_block(struct reada_control *rc, u64 logical,
525 struct btrfs_key *top, int level, u64 generation)
526{
527 struct btrfs_root *root = rc->root;
528 struct reada_extent *re;
529 struct reada_extctl *rec;
530
531 re = reada_find_extent(root, logical, top, level); /* takes one ref */
532 if (!re)
533 return -1;
534
535 rec = kzalloc(sizeof(*rec), GFP_NOFS);
536 if (!rec) {
537 reada_extent_put(root->fs_info, re);
538 return -1;
539 }
540
541 rec->rc = rc;
542 rec->generation = generation;
543 atomic_inc(&rc->elems);
544
545 spin_lock(&re->lock);
546 list_add_tail(&rec->list, &re->extctl);
547 spin_unlock(&re->lock);
548
549 /* leave the ref on the extent */
550
551 return 0;
552}
553
554/*
555 * called with fs_info->reada_lock held
556 */
557static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
558{
559 int i;
560 unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
561
562 for (i = 0; i < zone->ndevs; ++i) {
563 struct reada_zone *peer;
564 peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
565 if (peer && peer->device != zone->device)
566 peer->locked = lock;
567 }
568}
569
570/*
571 * called with fs_info->reada_lock held
572 */
573static int reada_pick_zone(struct btrfs_device *dev)
574{
575 struct reada_zone *top_zone = NULL;
576 struct reada_zone *top_locked_zone = NULL;
577 u64 top_elems = 0;
578 u64 top_locked_elems = 0;
579 unsigned long index = 0;
580 int ret;
581
582 if (dev->reada_curr_zone) {
583 reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
584 kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
585 dev->reada_curr_zone = NULL;
586 }
587 /* pick the zone with the most elements */
588 while (1) {
589 struct reada_zone *zone;
590
591 ret = radix_tree_gang_lookup(&dev->reada_zones,
592 (void **)&zone, index, 1);
593 if (ret == 0)
594 break;
595 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
596 if (zone->locked) {
597 if (zone->elems > top_locked_elems) {
598 top_locked_elems = zone->elems;
599 top_locked_zone = zone;
600 }
601 } else {
602 if (zone->elems > top_elems) {
603 top_elems = zone->elems;
604 top_zone = zone;
605 }
606 }
607 }
608 if (top_zone)
609 dev->reada_curr_zone = top_zone;
610 else if (top_locked_zone)
611 dev->reada_curr_zone = top_locked_zone;
612 else
613 return 0;
614
615 dev->reada_next = dev->reada_curr_zone->start;
616 kref_get(&dev->reada_curr_zone->refcnt);
617 reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
618
619 return 1;
620}
621
622static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
623 struct btrfs_device *dev)
624{
625 struct reada_extent *re = NULL;
626 int mirror_num = 0;
627 struct extent_buffer *eb = NULL;
628 u64 logical;
629 u32 blocksize;
630 int ret;
631 int i;
632 int need_kick = 0;
633
634 spin_lock(&fs_info->reada_lock);
635 if (dev->reada_curr_zone == NULL) {
636 ret = reada_pick_zone(dev);
637 if (!ret) {
638 spin_unlock(&fs_info->reada_lock);
639 return 0;
640 }
641 }
642 /*
643 * FIXME currently we issue the reads one extent at a time. If we have
644 * a contiguous block of extents, we could also coagulate them or use
645 * plugging to speed things up
646 */
647 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
648 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
649 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
650 ret = reada_pick_zone(dev);
651 if (!ret) {
652 spin_unlock(&fs_info->reada_lock);
653 return 0;
654 }
655 re = NULL;
656 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
657 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
658 }
659 if (ret == 0) {
660 spin_unlock(&fs_info->reada_lock);
661 return 0;
662 }
663 dev->reada_next = re->logical + re->blocksize;
664 kref_get(&re->refcnt);
665
666 spin_unlock(&fs_info->reada_lock);
667
668 /*
669 * find mirror num
670 */
671 for (i = 0; i < re->nzones; ++i) {
672 if (re->zones[i]->device == dev) {
673 mirror_num = i + 1;
674 break;
675 }
676 }
677 logical = re->logical;
678 blocksize = re->blocksize;
679
680 spin_lock(&re->lock);
681 if (re->scheduled_for == NULL) {
682 re->scheduled_for = dev;
683 need_kick = 1;
684 }
685 spin_unlock(&re->lock);
686
687 reada_extent_put(fs_info, re);
688
689 if (!need_kick)
690 return 0;
691
692 atomic_inc(&dev->reada_in_flight);
693 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
694 mirror_num, &eb);
695 if (ret)
696 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
697 else if (eb)
698 __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
699
700 if (eb)
701 free_extent_buffer(eb);
702
703 return 1;
704
705}
706
707static void reada_start_machine_worker(struct btrfs_work *work)
708{
709 struct reada_machine_work *rmw;
710 struct btrfs_fs_info *fs_info;
711
712 rmw = container_of(work, struct reada_machine_work, work);
713 fs_info = rmw->fs_info;
714
715 kfree(rmw);
716
717 __reada_start_machine(fs_info);
718}
719
720static void __reada_start_machine(struct btrfs_fs_info *fs_info)
721{
722 struct btrfs_device *device;
723 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
724 u64 enqueued;
725 u64 total = 0;
726 int i;
727
728 do {
729 enqueued = 0;
730 list_for_each_entry(device, &fs_devices->devices, dev_list) {
731 if (atomic_read(&device->reada_in_flight) <
732 MAX_IN_FLIGHT)
733 enqueued += reada_start_machine_dev(fs_info,
734 device);
735 }
736 total += enqueued;
737 } while (enqueued && total < 10000);
738
739 if (enqueued == 0)
740 return;
741
742 /*
743 * If everything is already in the cache, this is effectively single
744 * threaded. To a) not hold the caller for too long and b) to utilize
745 * more cores, we broke the loop above after 10000 iterations and now
746 * enqueue to workers to finish it. This will distribute the load to
747 * the cores.
748 */
749 for (i = 0; i < 2; ++i)
750 reada_start_machine(fs_info);
751}
752
753static void reada_start_machine(struct btrfs_fs_info *fs_info)
754{
755 struct reada_machine_work *rmw;
756
757 rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
758 if (!rmw) {
759 /* FIXME we cannot handle this properly right now */
760 BUG();
761 }
762 rmw->work.func = reada_start_machine_worker;
763 rmw->fs_info = fs_info;
764
765 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
766}
767
768#ifdef DEBUG
769static void dump_devs(struct btrfs_fs_info *fs_info, int all)
770{
771 struct btrfs_device *device;
772 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
773 unsigned long index;
774 int ret;
775 int i;
776 int j;
777 int cnt;
778
779 spin_lock(&fs_info->reada_lock);
780 list_for_each_entry(device, &fs_devices->devices, dev_list) {
781 printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
782 atomic_read(&device->reada_in_flight));
783 index = 0;
784 while (1) {
785 struct reada_zone *zone;
786 ret = radix_tree_gang_lookup(&device->reada_zones,
787 (void **)&zone, index, 1);
788 if (ret == 0)
789 break;
790 printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
791 "%d devs", zone->start, zone->end, zone->elems,
792 zone->locked);
793 for (j = 0; j < zone->ndevs; ++j) {
794 printk(KERN_CONT " %lld",
795 zone->devs[j]->devid);
796 }
797 if (device->reada_curr_zone == zone)
798 printk(KERN_CONT " curr off %llu",
799 device->reada_next - zone->start);
800 printk(KERN_CONT "\n");
801 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
802 }
803 cnt = 0;
804 index = 0;
805 while (all) {
806 struct reada_extent *re = NULL;
807
808 ret = radix_tree_gang_lookup(&device->reada_extents,
809 (void **)&re, index, 1);
810 if (ret == 0)
811 break;
812 printk(KERN_DEBUG
813 " re: logical %llu size %u empty %d for %lld",
814 re->logical, re->blocksize,
815 list_empty(&re->extctl), re->scheduled_for ?
816 re->scheduled_for->devid : -1);
817
818 for (i = 0; i < re->nzones; ++i) {
819 printk(KERN_CONT " zone %llu-%llu devs",
820 re->zones[i]->start,
821 re->zones[i]->end);
822 for (j = 0; j < re->zones[i]->ndevs; ++j) {
823 printk(KERN_CONT " %lld",
824 re->zones[i]->devs[j]->devid);
825 }
826 }
827 printk(KERN_CONT "\n");
828 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
829 if (++cnt > 15)
830 break;
831 }
832 }
833
834 index = 0;
835 cnt = 0;
836 while (all) {
837 struct reada_extent *re = NULL;
838
839 ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
840 index, 1);
841 if (ret == 0)
842 break;
843 if (!re->scheduled_for) {
844 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
845 continue;
846 }
847 printk(KERN_DEBUG
848 "re: logical %llu size %u list empty %d for %lld",
849 re->logical, re->blocksize, list_empty(&re->extctl),
850 re->scheduled_for ? re->scheduled_for->devid : -1);
851 for (i = 0; i < re->nzones; ++i) {
852 printk(KERN_CONT " zone %llu-%llu devs",
853 re->zones[i]->start,
854 re->zones[i]->end);
855 for (i = 0; i < re->nzones; ++i) {
856 printk(KERN_CONT " zone %llu-%llu devs",
857 re->zones[i]->start,
858 re->zones[i]->end);
859 for (j = 0; j < re->zones[i]->ndevs; ++j) {
860 printk(KERN_CONT " %lld",
861 re->zones[i]->devs[j]->devid);
862 }
863 }
864 }
865 printk(KERN_CONT "\n");
866 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
867 }
868 spin_unlock(&fs_info->reada_lock);
869}
870#endif
871
872/*
873 * interface
874 */
875struct reada_control *btrfs_reada_add(struct btrfs_root *root,
876 struct btrfs_key *key_start, struct btrfs_key *key_end)
877{
878 struct reada_control *rc;
879 u64 start;
880 u64 generation;
881 int level;
882 struct extent_buffer *node;
883 static struct btrfs_key max_key = {
884 .objectid = (u64)-1,
885 .type = (u8)-1,
886 .offset = (u64)-1
887 };
888
889 rc = kzalloc(sizeof(*rc), GFP_NOFS);
890 if (!rc)
891 return ERR_PTR(-ENOMEM);
892
893 rc->root = root;
894 rc->key_start = *key_start;
895 rc->key_end = *key_end;
896 atomic_set(&rc->elems, 0);
897 init_waitqueue_head(&rc->wait);
898 kref_init(&rc->refcnt);
899 kref_get(&rc->refcnt); /* one ref for having elements */
900
901 node = btrfs_root_node(root);
902 start = node->start;
903 level = btrfs_header_level(node);
904 generation = btrfs_header_generation(node);
905 free_extent_buffer(node);
906
907 reada_add_block(rc, start, &max_key, level, generation);
908
909 reada_start_machine(root->fs_info);
910
911 return rc;
912}
913
914#ifdef DEBUG
915int btrfs_reada_wait(void *handle)
916{
917 struct reada_control *rc = handle;
918
919 while (atomic_read(&rc->elems)) {
920 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
921 5 * HZ);
922 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
923 }
924
925 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
926
927 kref_put(&rc->refcnt, reada_control_release);
928
929 return 0;
930}
931#else
932int btrfs_reada_wait(void *handle)
933{
934 struct reada_control *rc = handle;
935
936 while (atomic_read(&rc->elems)) {
937 wait_event(rc->wait, atomic_read(&rc->elems) == 0);
938 }
939
940 kref_put(&rc->refcnt, reada_control_release);
941
942 return 0;
943}
944#endif
945
946void btrfs_reada_detach(void *handle)
947{
948 struct reada_control *rc = handle;
949
950 kref_put(&rc->refcnt, reada_control_release);
951}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273d..dff29d5e151a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1174,6 +1174,8 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1174 list_add_tail(&new_edge->list[UPPER], 1174 list_add_tail(&new_edge->list[UPPER],
1175 &new_node->lower); 1175 &new_node->lower);
1176 } 1176 }
1177 } else {
1178 list_add_tail(&new_node->lower, &cache->leaves);
1177 } 1179 }
1178 1180
1179 rb_node = tree_insert(&cache->rb_root, new_node->bytenr, 1181 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
@@ -2041,8 +2043,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2041 BUG_ON(IS_ERR(trans)); 2043 BUG_ON(IS_ERR(trans));
2042 trans->block_rsv = rc->block_rsv; 2044 trans->block_rsv = rc->block_rsv;
2043 2045
2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2046 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2045 min_reserved, 0);
2046 if (ret) { 2047 if (ret) {
2047 BUG_ON(ret != -EAGAIN); 2048 BUG_ON(ret != -EAGAIN);
2048 ret = btrfs_commit_transaction(trans, root); 2049 ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2153,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2152again: 2153again:
2153 if (!err) { 2154 if (!err) {
2154 num_bytes = rc->merging_rsv_size; 2155 num_bytes = rc->merging_rsv_size;
2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2156 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2156 num_bytes);
2157 if (ret) 2157 if (ret)
2158 err = ret; 2158 err = ret;
2159 } 2159 }
@@ -2427,7 +2427,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2428 2428
2429 trans->block_rsv = rc->block_rsv; 2429 trans->block_rsv = rc->block_rsv;
2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2430 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2431 if (ret) { 2431 if (ret) {
2432 if (ret == -EAGAIN) 2432 if (ret == -EAGAIN)
2433 rc->commit_transaction = 1; 2433 rc->commit_transaction = 1;
@@ -2922,6 +2922,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2922 unsigned long last_index; 2922 unsigned long last_index;
2923 struct page *page; 2923 struct page *page;
2924 struct file_ra_state *ra; 2924 struct file_ra_state *ra;
2925 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2925 int nr = 0; 2926 int nr = 0;
2926 int ret = 0; 2927 int ret = 0;
2927 2928
@@ -2956,7 +2957,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2956 ra, NULL, index, 2957 ra, NULL, index,
2957 last_index + 1 - index); 2958 last_index + 1 - index);
2958 page = find_or_create_page(inode->i_mapping, index, 2959 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS); 2960 mask);
2960 if (!page) { 2961 if (!page) {
2961 btrfs_delalloc_release_metadata(inode, 2962 btrfs_delalloc_release_metadata(inode,
2962 PAGE_CACHE_SIZE); 2963 PAGE_CACHE_SIZE);
@@ -3323,8 +3324,11 @@ static int find_data_references(struct reloc_control *rc,
3323 } 3324 }
3324 3325
3325 key.objectid = ref_objectid; 3326 key.objectid = ref_objectid;
3326 key.offset = ref_offset;
3327 key.type = BTRFS_EXTENT_DATA_KEY; 3327 key.type = BTRFS_EXTENT_DATA_KEY;
3328 if (ref_offset > ((u64)-1 << 32))
3329 key.offset = 0;
3330 else
3331 key.offset = ref_offset;
3328 3332
3329 path->search_commit_root = 1; 3333 path->search_commit_root = 1;
3330 path->skip_locking = 1; 3334 path->skip_locking = 1;
@@ -3645,14 +3649,11 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 * btrfs_init_reloc_root will use them when there 3649 * btrfs_init_reloc_root will use them when there
3646 * is no reservation in transaction handle. 3650 * is no reservation in transaction handle.
3647 */ 3651 */
3648 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3652 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3649 rc->extent_root->nodesize * 256); 3653 rc->extent_root->nodesize * 256);
3650 if (ret) 3654 if (ret)
3651 return ret; 3655 return ret;
3652 3656
3653 rc->block_rsv->refill_used = 1;
3654 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3655
3656 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3657 memset(&rc->cluster, 0, sizeof(rc->cluster));
3657 rc->search_start = rc->block_group->key.objectid; 3658 rc->search_start = rc->block_group->key.objectid;
3658 rc->extents_found = 0; 3659 rc->extents_found = 0;
@@ -3777,8 +3778,7 @@ restart:
3777 } 3778 }
3778 } 3779 }
3779 3780
3780 ret = btrfs_block_rsv_check(trans, rc->extent_root, 3781 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3781 rc->block_rsv, 0, 5);
3782 if (ret < 0) { 3782 if (ret < 0) {
3783 if (ret != -EAGAIN) { 3783 if (ret != -EAGAIN) {
3784 err = ret; 3784 err = ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..c27bcb67f330 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
17 */ 17 */
18 18
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "volumes.h" 22#include "volumes.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "ordered-data.h" 24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
24 28
25/* 29/*
26 * This is only the first step towards a full-features scrub. It reads all 30 * This is only the first step towards a full-features scrub. It reads all
@@ -29,15 +33,12 @@
29 * any can be found. 33 * any can be found.
30 * 34 *
31 * Future enhancements: 35 * Future enhancements:
32 * - To enhance the performance, better read-ahead strategies for the
33 * extent-tree can be employed.
34 * - In case an unrepairable extent is encountered, track which files are 36 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them 37 * affected and report them
36 * - In case of a read error on files with nodatasum, map the file and read 38 * - In case of a read error on files with nodatasum, map the file and read
37 * the extent to trigger a writeback of the good copy 39 * the extent to trigger a writeback of the good copy
38 * - track and record media errors, throw out bad devices 40 * - track and record media errors, throw out bad devices
39 * - add a mode to also read unallocated space 41 * - add a mode to also read unallocated space
40 * - make the prefetch cancellable
41 */ 42 */
42 43
43struct scrub_bio; 44struct scrub_bio;
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
63struct scrub_page { 64struct scrub_page {
64 u64 flags; /* extent flags */ 65 u64 flags; /* extent flags */
65 u64 generation; 66 u64 generation;
66 u64 mirror_num; 67 int mirror_num;
67 int have_csum; 68 int have_csum;
68 u8 csum[BTRFS_CSUM_SIZE]; 69 u8 csum[BTRFS_CSUM_SIZE];
69}; 70};
@@ -87,6 +88,7 @@ struct scrub_dev {
87 int first_free; 88 int first_free;
88 int curr; 89 int curr;
89 atomic_t in_flight; 90 atomic_t in_flight;
91 atomic_t fixup_cnt;
90 spinlock_t list_lock; 92 spinlock_t list_lock;
91 wait_queue_head_t list_wait; 93 wait_queue_head_t list_wait;
92 u16 csum_size; 94 u16 csum_size;
@@ -100,6 +102,27 @@ struct scrub_dev {
100 spinlock_t stat_lock; 102 spinlock_t stat_lock;
101}; 103};
102 104
105struct scrub_fixup_nodatasum {
106 struct scrub_dev *sdev;
107 u64 logical;
108 struct btrfs_root *root;
109 struct btrfs_work work;
110 int mirror_num;
111};
112
113struct scrub_warning {
114 struct btrfs_path *path;
115 u64 extent_item_size;
116 char *scratch_buf;
117 char *msg_buf;
118 const char *errstr;
119 sector_t sector;
120 u64 logical;
121 struct btrfs_device *dev;
122 int msg_bufsize;
123 int scratch_bufsize;
124};
125
103static void scrub_free_csums(struct scrub_dev *sdev) 126static void scrub_free_csums(struct scrub_dev *sdev)
104{ 127{
105 while (!list_empty(&sdev->csum_list)) { 128 while (!list_empty(&sdev->csum_list)) {
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
175 198
176 if (i != SCRUB_BIOS_PER_DEV-1) 199 if (i != SCRUB_BIOS_PER_DEV-1)
177 sdev->bios[i]->next_free = i + 1; 200 sdev->bios[i]->next_free = i + 1;
178 else 201 else
179 sdev->bios[i]->next_free = -1; 202 sdev->bios[i]->next_free = -1;
180 } 203 }
181 sdev->first_free = 0; 204 sdev->first_free = 0;
182 sdev->curr = -1; 205 sdev->curr = -1;
183 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
207 atomic_set(&sdev->fixup_cnt, 0);
184 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
185 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
186 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
187 211
188 spin_lock_init(&sdev->list_lock); 212 spin_lock_init(&sdev->list_lock);
@@ -195,24 +219,366 @@ nomem:
195 return ERR_PTR(-ENOMEM); 219 return ERR_PTR(-ENOMEM);
196} 220}
197 221
222static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
223{
224 u64 isize;
225 u32 nlink;
226 int ret;
227 int i;
228 struct extent_buffer *eb;
229 struct btrfs_inode_item *inode_item;
230 struct scrub_warning *swarn = ctx;
231 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
232 struct inode_fs_paths *ipath = NULL;
233 struct btrfs_root *local_root;
234 struct btrfs_key root_key;
235
236 root_key.objectid = root;
237 root_key.type = BTRFS_ROOT_ITEM_KEY;
238 root_key.offset = (u64)-1;
239 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
240 if (IS_ERR(local_root)) {
241 ret = PTR_ERR(local_root);
242 goto err;
243 }
244
245 ret = inode_item_info(inum, 0, local_root, swarn->path);
246 if (ret) {
247 btrfs_release_path(swarn->path);
248 goto err;
249 }
250
251 eb = swarn->path->nodes[0];
252 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
253 struct btrfs_inode_item);
254 isize = btrfs_inode_size(eb, inode_item);
255 nlink = btrfs_inode_nlink(eb, inode_item);
256 btrfs_release_path(swarn->path);
257
258 ipath = init_ipath(4096, local_root, swarn->path);
259 if (IS_ERR(ipath)) {
260 ret = PTR_ERR(ipath);
261 ipath = NULL;
262 goto err;
263 }
264 ret = paths_from_inode(inum, ipath);
265
266 if (ret < 0)
267 goto err;
268
269 /*
270 * we deliberately ignore the bit ipath might have been too small to
271 * hold all of the paths here
272 */
273 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
274 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
275 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
276 "length %llu, links %u (path: %s)\n", swarn->errstr,
277 swarn->logical, swarn->dev->name,
278 (unsigned long long)swarn->sector, root, inum, offset,
279 min(isize - offset, (u64)PAGE_SIZE), nlink,
280 (char *)(unsigned long)ipath->fspath->val[i]);
281
282 free_ipath(ipath);
283 return 0;
284
285err:
286 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
287 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
288 "resolving failed with ret=%d\n", swarn->errstr,
289 swarn->logical, swarn->dev->name,
290 (unsigned long long)swarn->sector, root, inum, offset, ret);
291
292 free_ipath(ipath);
293 return 0;
294}
295
296static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
297 int ix)
298{
299 struct btrfs_device *dev = sbio->sdev->dev;
300 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
301 struct btrfs_path *path;
302 struct btrfs_key found_key;
303 struct extent_buffer *eb;
304 struct btrfs_extent_item *ei;
305 struct scrub_warning swarn;
306 u32 item_size;
307 int ret;
308 u64 ref_root;
309 u8 ref_level;
310 unsigned long ptr = 0;
311 const int bufsize = 4096;
312 u64 extent_offset;
313
314 path = btrfs_alloc_path();
315
316 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
317 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
318 swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
319 swarn.logical = sbio->logical + ix * PAGE_SIZE;
320 swarn.errstr = errstr;
321 swarn.dev = dev;
322 swarn.msg_bufsize = bufsize;
323 swarn.scratch_bufsize = bufsize;
324
325 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
326 goto out;
327
328 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
329 if (ret < 0)
330 goto out;
331
332 extent_offset = swarn.logical - found_key.objectid;
333 swarn.extent_item_size = found_key.offset;
334
335 eb = path->nodes[0];
336 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
337 item_size = btrfs_item_size_nr(eb, path->slots[0]);
338
339 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
340 do {
341 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
342 &ref_root, &ref_level);
343 printk(KERN_WARNING "%s at logical %llu on dev %s, "
344 "sector %llu: metadata %s (level %d) in tree "
345 "%llu\n", errstr, swarn.logical, dev->name,
346 (unsigned long long)swarn.sector,
347 ref_level ? "node" : "leaf",
348 ret < 0 ? -1 : ref_level,
349 ret < 0 ? -1 : ref_root);
350 } while (ret != 1);
351 } else {
352 swarn.path = path;
353 iterate_extent_inodes(fs_info, path, found_key.objectid,
354 extent_offset,
355 scrub_print_warning_inode, &swarn);
356 }
357
358out:
359 btrfs_free_path(path);
360 kfree(swarn.scratch_buf);
361 kfree(swarn.msg_buf);
362}
363
364static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
365{
366 struct page *page = NULL;
367 unsigned long index;
368 struct scrub_fixup_nodatasum *fixup = ctx;
369 int ret;
370 int corrected = 0;
371 struct btrfs_key key;
372 struct inode *inode = NULL;
373 u64 end = offset + PAGE_SIZE - 1;
374 struct btrfs_root *local_root;
375
376 key.objectid = root;
377 key.type = BTRFS_ROOT_ITEM_KEY;
378 key.offset = (u64)-1;
379 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
380 if (IS_ERR(local_root))
381 return PTR_ERR(local_root);
382
383 key.type = BTRFS_INODE_ITEM_KEY;
384 key.objectid = inum;
385 key.offset = 0;
386 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
387 if (IS_ERR(inode))
388 return PTR_ERR(inode);
389
390 index = offset >> PAGE_CACHE_SHIFT;
391
392 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
393 if (!page) {
394 ret = -ENOMEM;
395 goto out;
396 }
397
398 if (PageUptodate(page)) {
399 struct btrfs_mapping_tree *map_tree;
400 if (PageDirty(page)) {
401 /*
402 * we need to write the data to the defect sector. the
403 * data that was in that sector is not in memory,
404 * because the page was modified. we must not write the
405 * modified page to that sector.
406 *
407 * TODO: what could be done here: wait for the delalloc
408 * runner to write out that page (might involve
409 * COW) and see whether the sector is still
410 * referenced afterwards.
411 *
412 * For the meantime, we'll treat this error
413 * incorrectable, although there is a chance that a
414 * later scrub will find the bad sector again and that
415 * there's no dirty page in memory, then.
416 */
417 ret = -EIO;
418 goto out;
419 }
420 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
421 ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
422 fixup->logical, page,
423 fixup->mirror_num);
424 unlock_page(page);
425 corrected = !ret;
426 } else {
427 /*
428 * we need to get good data first. the general readpage path
429 * will call repair_io_failure for us, we just have to make
430 * sure we read the bad mirror.
431 */
432 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
433 EXTENT_DAMAGED, GFP_NOFS);
434 if (ret) {
435 /* set_extent_bits should give proper error */
436 WARN_ON(ret > 0);
437 if (ret > 0)
438 ret = -EFAULT;
439 goto out;
440 }
441
442 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
443 btrfs_get_extent,
444 fixup->mirror_num);
445 wait_on_page_locked(page);
446
447 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
448 end, EXTENT_DAMAGED, 0, NULL);
449 if (!corrected)
450 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
451 EXTENT_DAMAGED, GFP_NOFS);
452 }
453
454out:
455 if (page)
456 put_page(page);
457 if (inode)
458 iput(inode);
459
460 if (ret < 0)
461 return ret;
462
463 if (ret == 0 && corrected) {
464 /*
465 * we only need to call readpage for one of the inodes belonging
466 * to this extent. so make iterate_extent_inodes stop
467 */
468 return 1;
469 }
470
471 return -EIO;
472}
473
474static void scrub_fixup_nodatasum(struct btrfs_work *work)
475{
476 int ret;
477 struct scrub_fixup_nodatasum *fixup;
478 struct scrub_dev *sdev;
479 struct btrfs_trans_handle *trans = NULL;
480 struct btrfs_fs_info *fs_info;
481 struct btrfs_path *path;
482 int uncorrectable = 0;
483
484 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
485 sdev = fixup->sdev;
486 fs_info = fixup->root->fs_info;
487
488 path = btrfs_alloc_path();
489 if (!path) {
490 spin_lock(&sdev->stat_lock);
491 ++sdev->stat.malloc_errors;
492 spin_unlock(&sdev->stat_lock);
493 uncorrectable = 1;
494 goto out;
495 }
496
497 trans = btrfs_join_transaction(fixup->root);
498 if (IS_ERR(trans)) {
499 uncorrectable = 1;
500 goto out;
501 }
502
503 /*
504 * the idea is to trigger a regular read through the standard path. we
505 * read a page from the (failed) logical address by specifying the
506 * corresponding copynum of the failed sector. thus, that readpage is
507 * expected to fail.
508 * that is the point where on-the-fly error correction will kick in
509 * (once it's finished) and rewrite the failed sector if a good copy
510 * can be found.
511 */
512 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
513 path, scrub_fixup_readpage,
514 fixup);
515 if (ret < 0) {
516 uncorrectable = 1;
517 goto out;
518 }
519 WARN_ON(ret != 1);
520
521 spin_lock(&sdev->stat_lock);
522 ++sdev->stat.corrected_errors;
523 spin_unlock(&sdev->stat_lock);
524
525out:
526 if (trans && !IS_ERR(trans))
527 btrfs_end_transaction(trans, fixup->root);
528 if (uncorrectable) {
529 spin_lock(&sdev->stat_lock);
530 ++sdev->stat.uncorrectable_errors;
531 spin_unlock(&sdev->stat_lock);
532 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
533 "(nodatasum) error at logical %llu\n",
534 fixup->logical);
535 }
536
537 btrfs_free_path(path);
538 kfree(fixup);
539
540 /* see caller why we're pretending to be paused in the scrub counters */
541 mutex_lock(&fs_info->scrub_lock);
542 atomic_dec(&fs_info->scrubs_running);
543 atomic_dec(&fs_info->scrubs_paused);
544 mutex_unlock(&fs_info->scrub_lock);
545 atomic_dec(&sdev->fixup_cnt);
546 wake_up(&fs_info->scrub_pause_wait);
547 wake_up(&sdev->list_wait);
548}
549
198/* 550/*
199 * scrub_recheck_error gets called when either verification of the page 551 * scrub_recheck_error gets called when either verification of the page
200 * failed or the bio failed to read, e.g. with EIO. In the latter case, 552 * failed or the bio failed to read, e.g. with EIO. In the latter case,
201 * recheck_error gets called for every page in the bio, even though only 553 * recheck_error gets called for every page in the bio, even though only
202 * one may be bad 554 * one may be bad
203 */ 555 */
204static void scrub_recheck_error(struct scrub_bio *sbio, int ix) 556static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
205{ 557{
558 struct scrub_dev *sdev = sbio->sdev;
559 u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
560 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
561 DEFAULT_RATELIMIT_BURST);
562
206 if (sbio->err) { 563 if (sbio->err) {
207 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, 564 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
208 (sbio->physical + ix * PAGE_SIZE) >> 9,
209 sbio->bio->bi_io_vec[ix].bv_page) == 0) { 565 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
210 if (scrub_fixup_check(sbio, ix) == 0) 566 if (scrub_fixup_check(sbio, ix) == 0)
211 return; 567 return 0;
212 } 568 }
569 if (__ratelimit(&_rs))
570 scrub_print_warning("i/o error", sbio, ix);
571 } else {
572 if (__ratelimit(&_rs))
573 scrub_print_warning("checksum error", sbio, ix);
213 } 574 }
214 575
576 spin_lock(&sdev->stat_lock);
577 ++sdev->stat.read_errors;
578 spin_unlock(&sdev->stat_lock);
579
215 scrub_fixup(sbio, ix); 580 scrub_fixup(sbio, ix);
581 return 1;
216} 582}
217 583
218static int scrub_fixup_check(struct scrub_bio *sbio, int ix) 584static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -250,7 +616,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
250 struct scrub_dev *sdev = sbio->sdev; 616 struct scrub_dev *sdev = sbio->sdev;
251 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 617 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
252 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 618 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
253 struct btrfs_multi_bio *multi = NULL; 619 struct btrfs_bio *bbio = NULL;
620 struct scrub_fixup_nodatasum *fixup;
254 u64 logical = sbio->logical + ix * PAGE_SIZE; 621 u64 logical = sbio->logical + ix * PAGE_SIZE;
255 u64 length; 622 u64 length;
256 int i; 623 int i;
@@ -259,38 +626,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
259 626
260 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && 627 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
261 (sbio->spag[ix].have_csum == 0)) { 628 (sbio->spag[ix].have_csum == 0)) {
629 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
630 if (!fixup)
631 goto uncorrectable;
632 fixup->sdev = sdev;
633 fixup->logical = logical;
634 fixup->root = fs_info->extent_root;
635 fixup->mirror_num = sbio->spag[ix].mirror_num;
262 /* 636 /*
263 * nodatasum, don't try to fix anything 637 * increment scrubs_running to prevent cancel requests from
264 * FIXME: we can do better, open the inode and trigger a 638 * completing as long as a fixup worker is running. we must also
265 * writeback 639 * increment scrubs_paused to prevent deadlocking on pause
640 * requests used for transactions commits (as the worker uses a
641 * transaction context). it is safe to regard the fixup worker
642 * as paused for all matters practical. effectively, we only
643 * avoid cancellation requests from completing.
266 */ 644 */
267 goto uncorrectable; 645 mutex_lock(&fs_info->scrub_lock);
646 atomic_inc(&fs_info->scrubs_running);
647 atomic_inc(&fs_info->scrubs_paused);
648 mutex_unlock(&fs_info->scrub_lock);
649 atomic_inc(&sdev->fixup_cnt);
650 fixup->work.func = scrub_fixup_nodatasum;
651 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
652 return;
268 } 653 }
269 654
270 length = PAGE_SIZE; 655 length = PAGE_SIZE;
271 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, 656 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
272 &multi, 0); 657 &bbio, 0);
273 if (ret || !multi || length < PAGE_SIZE) { 658 if (ret || !bbio || length < PAGE_SIZE) {
274 printk(KERN_ERR 659 printk(KERN_ERR
275 "scrub_fixup: btrfs_map_block failed us for %llu\n", 660 "scrub_fixup: btrfs_map_block failed us for %llu\n",
276 (unsigned long long)logical); 661 (unsigned long long)logical);
277 WARN_ON(1); 662 WARN_ON(1);
663 kfree(bbio);
278 return; 664 return;
279 } 665 }
280 666
281 if (multi->num_stripes == 1) 667 if (bbio->num_stripes == 1)
282 /* there aren't any replicas */ 668 /* there aren't any replicas */
283 goto uncorrectable; 669 goto uncorrectable;
284 670
285 /* 671 /*
286 * first find a good copy 672 * first find a good copy
287 */ 673 */
288 for (i = 0; i < multi->num_stripes; ++i) { 674 for (i = 0; i < bbio->num_stripes; ++i) {
289 if (i == sbio->spag[ix].mirror_num) 675 if (i + 1 == sbio->spag[ix].mirror_num)
290 continue; 676 continue;
291 677
292 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, 678 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
293 multi->stripes[i].physical >> 9, 679 bbio->stripes[i].physical >> 9,
294 sbio->bio->bi_io_vec[ix].bv_page)) { 680 sbio->bio->bi_io_vec[ix].bv_page)) {
295 /* I/O-error, this is not a good copy */ 681 /* I/O-error, this is not a good copy */
296 continue; 682 continue;
@@ -299,7 +685,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
299 if (scrub_fixup_check(sbio, ix) == 0) 685 if (scrub_fixup_check(sbio, ix) == 0)
300 break; 686 break;
301 } 687 }
302 if (i == multi->num_stripes) 688 if (i == bbio->num_stripes)
303 goto uncorrectable; 689 goto uncorrectable;
304 690
305 if (!sdev->readonly) { 691 if (!sdev->readonly) {
@@ -314,25 +700,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
314 } 700 }
315 } 701 }
316 702
317 kfree(multi); 703 kfree(bbio);
318 spin_lock(&sdev->stat_lock); 704 spin_lock(&sdev->stat_lock);
319 ++sdev->stat.corrected_errors; 705 ++sdev->stat.corrected_errors;
320 spin_unlock(&sdev->stat_lock); 706 spin_unlock(&sdev->stat_lock);
321 707
322 if (printk_ratelimit()) 708 printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
323 printk(KERN_ERR "btrfs: fixed up at %llu\n", 709 (unsigned long long)logical);
324 (unsigned long long)logical);
325 return; 710 return;
326 711
327uncorrectable: 712uncorrectable:
328 kfree(multi); 713 kfree(bbio);
329 spin_lock(&sdev->stat_lock); 714 spin_lock(&sdev->stat_lock);
330 ++sdev->stat.uncorrectable_errors; 715 ++sdev->stat.uncorrectable_errors;
331 spin_unlock(&sdev->stat_lock); 716 spin_unlock(&sdev->stat_lock);
332 717
333 if (printk_ratelimit()) 718 printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
334 printk(KERN_ERR "btrfs: unable to fixup at %llu\n", 719 "logical %llu\n", (unsigned long long)logical);
335 (unsigned long long)logical);
336} 720}
337 721
338static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, 722static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@ -382,8 +766,14 @@ static void scrub_checksum(struct btrfs_work *work)
382 int ret; 766 int ret;
383 767
384 if (sbio->err) { 768 if (sbio->err) {
769 ret = 0;
385 for (i = 0; i < sbio->count; ++i) 770 for (i = 0; i < sbio->count; ++i)
386 scrub_recheck_error(sbio, i); 771 ret |= scrub_recheck_error(sbio, i);
772 if (!ret) {
773 spin_lock(&sdev->stat_lock);
774 ++sdev->stat.unverified_errors;
775 spin_unlock(&sdev->stat_lock);
776 }
387 777
388 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 778 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
389 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 779 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -396,10 +786,6 @@ static void scrub_checksum(struct btrfs_work *work)
396 bi->bv_offset = 0; 786 bi->bv_offset = 0;
397 bi->bv_len = PAGE_SIZE; 787 bi->bv_len = PAGE_SIZE;
398 } 788 }
399
400 spin_lock(&sdev->stat_lock);
401 ++sdev->stat.read_errors;
402 spin_unlock(&sdev->stat_lock);
403 goto out; 789 goto out;
404 } 790 }
405 for (i = 0; i < sbio->count; ++i) { 791 for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +806,14 @@ static void scrub_checksum(struct btrfs_work *work)
420 WARN_ON(1); 806 WARN_ON(1);
421 } 807 }
422 kunmap_atomic(buffer, KM_USER0); 808 kunmap_atomic(buffer, KM_USER0);
423 if (ret) 809 if (ret) {
424 scrub_recheck_error(sbio, i); 810 ret = scrub_recheck_error(sbio, i);
811 if (!ret) {
812 spin_lock(&sdev->stat_lock);
813 ++sdev->stat.unverified_errors;
814 spin_unlock(&sdev->stat_lock);
815 }
816 }
425 } 817 }
426 818
427out: 819out:
@@ -557,57 +949,27 @@ static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
557static int scrub_submit(struct scrub_dev *sdev) 949static int scrub_submit(struct scrub_dev *sdev)
558{ 950{
559 struct scrub_bio *sbio; 951 struct scrub_bio *sbio;
560 struct bio *bio;
561 int i;
562 952
563 if (sdev->curr == -1) 953 if (sdev->curr == -1)
564 return 0; 954 return 0;
565 955
566 sbio = sdev->bios[sdev->curr]; 956 sbio = sdev->bios[sdev->curr];
567
568 bio = bio_alloc(GFP_NOFS, sbio->count);
569 if (!bio)
570 goto nomem;
571
572 bio->bi_private = sbio;
573 bio->bi_end_io = scrub_bio_end_io;
574 bio->bi_bdev = sdev->dev->bdev;
575 bio->bi_sector = sbio->physical >> 9;
576
577 for (i = 0; i < sbio->count; ++i) {
578 struct page *page;
579 int ret;
580
581 page = alloc_page(GFP_NOFS);
582 if (!page)
583 goto nomem;
584
585 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
586 if (!ret) {
587 __free_page(page);
588 goto nomem;
589 }
590 }
591
592 sbio->err = 0; 957 sbio->err = 0;
593 sdev->curr = -1; 958 sdev->curr = -1;
594 atomic_inc(&sdev->in_flight); 959 atomic_inc(&sdev->in_flight);
595 960
596 submit_bio(READ, bio); 961 submit_bio(READ, sbio->bio);
597 962
598 return 0; 963 return 0;
599
600nomem:
601 scrub_free_bio(bio);
602
603 return -ENOMEM;
604} 964}
605 965
606static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 966static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
607 u64 physical, u64 flags, u64 gen, u64 mirror_num, 967 u64 physical, u64 flags, u64 gen, int mirror_num,
608 u8 *csum, int force) 968 u8 *csum, int force)
609{ 969{
610 struct scrub_bio *sbio; 970 struct scrub_bio *sbio;
971 struct page *page;
972 int ret;
611 973
612again: 974again:
613 /* 975 /*
@@ -628,12 +990,22 @@ again:
628 } 990 }
629 sbio = sdev->bios[sdev->curr]; 991 sbio = sdev->bios[sdev->curr];
630 if (sbio->count == 0) { 992 if (sbio->count == 0) {
993 struct bio *bio;
994
631 sbio->physical = physical; 995 sbio->physical = physical;
632 sbio->logical = logical; 996 sbio->logical = logical;
997 bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
998 if (!bio)
999 return -ENOMEM;
1000
1001 bio->bi_private = sbio;
1002 bio->bi_end_io = scrub_bio_end_io;
1003 bio->bi_bdev = sdev->dev->bdev;
1004 bio->bi_sector = sbio->physical >> 9;
1005 sbio->err = 0;
1006 sbio->bio = bio;
633 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical || 1007 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
634 sbio->logical + sbio->count * PAGE_SIZE != logical) { 1008 sbio->logical + sbio->count * PAGE_SIZE != logical) {
635 int ret;
636
637 ret = scrub_submit(sdev); 1009 ret = scrub_submit(sdev);
638 if (ret) 1010 if (ret)
639 return ret; 1011 return ret;
@@ -643,6 +1015,20 @@ again:
643 sbio->spag[sbio->count].generation = gen; 1015 sbio->spag[sbio->count].generation = gen;
644 sbio->spag[sbio->count].have_csum = 0; 1016 sbio->spag[sbio->count].have_csum = 0;
645 sbio->spag[sbio->count].mirror_num = mirror_num; 1017 sbio->spag[sbio->count].mirror_num = mirror_num;
1018
1019 page = alloc_page(GFP_NOFS);
1020 if (!page)
1021 return -ENOMEM;
1022
1023 ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
1024 if (!ret) {
1025 __free_page(page);
1026 ret = scrub_submit(sdev);
1027 if (ret)
1028 return ret;
1029 goto again;
1030 }
1031
646 if (csum) { 1032 if (csum) {
647 sbio->spag[sbio->count].have_csum = 1; 1033 sbio->spag[sbio->count].have_csum = 1;
648 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size); 1034 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
@@ -701,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
701 1087
702/* scrub extent tries to collect up to 64 kB for each bio */ 1088/* scrub extent tries to collect up to 64 kB for each bio */
703static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 1089static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
704 u64 physical, u64 flags, u64 gen, u64 mirror_num) 1090 u64 physical, u64 flags, u64 gen, int mirror_num)
705{ 1091{
706 int ret; 1092 int ret;
707 u8 csum[BTRFS_CSUM_SIZE]; 1093 u8 csum[BTRFS_CSUM_SIZE];
@@ -741,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
741 int slot; 1127 int slot;
742 int i; 1128 int i;
743 u64 nstripes; 1129 u64 nstripes;
744 int start_stripe;
745 struct extent_buffer *l; 1130 struct extent_buffer *l;
746 struct btrfs_key key; 1131 struct btrfs_key key;
747 u64 physical; 1132 u64 physical;
748 u64 logical; 1133 u64 logical;
749 u64 generation; 1134 u64 generation;
750 u64 mirror_num; 1135 int mirror_num;
1136 struct reada_control *reada1;
1137 struct reada_control *reada2;
1138 struct btrfs_key key_start;
1139 struct btrfs_key key_end;
751 1140
752 u64 increment = map->stripe_len; 1141 u64 increment = map->stripe_len;
753 u64 offset; 1142 u64 offset;
@@ -758,102 +1147,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1147 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num; 1148 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes; 1149 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0; 1150 mirror_num = 1;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1151 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes; 1152 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes); 1153 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor; 1154 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes; 1155 mirror_num = num % map->sub_stripes + 1;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1156 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len; 1157 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes; 1158 mirror_num = num % map->num_stripes + 1;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1159 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len; 1160 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes; 1161 mirror_num = num % map->num_stripes + 1;
773 } else { 1162 } else {
774 increment = map->stripe_len; 1163 increment = map->stripe_len;
775 mirror_num = 0; 1164 mirror_num = 1;
776 } 1165 }
777 1166
778 path = btrfs_alloc_path(); 1167 path = btrfs_alloc_path();
779 if (!path) 1168 if (!path)
780 return -ENOMEM; 1169 return -ENOMEM;
781 1170
782 path->reada = 2;
783 path->search_commit_root = 1; 1171 path->search_commit_root = 1;
784 path->skip_locking = 1; 1172 path->skip_locking = 1;
785 1173
786 /* 1174 /*
787 * find all extents for each stripe and just read them to get 1175 * trigger the readahead for extent tree csum tree and wait for
788 * them into the page cache 1176 * completion. During readahead, the scrub is officially paused
789 * FIXME: we can do better. build a more intelligent prefetching 1177 * to not hold off transaction commits
790 */ 1178 */
791 logical = base + offset; 1179 logical = base + offset;
792 physical = map->stripes[num].physical;
793 ret = 0;
794 for (i = 0; i < nstripes; ++i) {
795 key.objectid = logical;
796 key.type = BTRFS_EXTENT_ITEM_KEY;
797 key.offset = (u64)0;
798
799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
800 if (ret < 0)
801 goto out_noplug;
802
803 /*
804 * we might miss half an extent here, but that doesn't matter,
805 * as it's only the prefetch
806 */
807 while (1) {
808 l = path->nodes[0];
809 slot = path->slots[0];
810 if (slot >= btrfs_header_nritems(l)) {
811 ret = btrfs_next_leaf(root, path);
812 if (ret == 0)
813 continue;
814 if (ret < 0)
815 goto out_noplug;
816 1180
817 break; 1181 wait_event(sdev->list_wait,
818 } 1182 atomic_read(&sdev->in_flight) == 0);
819 btrfs_item_key_to_cpu(l, &key, slot); 1183 atomic_inc(&fs_info->scrubs_paused);
1184 wake_up(&fs_info->scrub_pause_wait);
820 1185
821 if (key.objectid >= logical + map->stripe_len) 1186 /* FIXME it might be better to start readahead at commit root */
822 break; 1187 key_start.objectid = logical;
1188 key_start.type = BTRFS_EXTENT_ITEM_KEY;
1189 key_start.offset = (u64)0;
1190 key_end.objectid = base + offset + nstripes * increment;
1191 key_end.type = BTRFS_EXTENT_ITEM_KEY;
1192 key_end.offset = (u64)0;
1193 reada1 = btrfs_reada_add(root, &key_start, &key_end);
1194
1195 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1196 key_start.type = BTRFS_EXTENT_CSUM_KEY;
1197 key_start.offset = logical;
1198 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1199 key_end.type = BTRFS_EXTENT_CSUM_KEY;
1200 key_end.offset = base + offset + nstripes * increment;
1201 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1202
1203 if (!IS_ERR(reada1))
1204 btrfs_reada_wait(reada1);
1205 if (!IS_ERR(reada2))
1206 btrfs_reada_wait(reada2);
823 1207
824 path->slots[0]++; 1208 mutex_lock(&fs_info->scrub_lock);
825 } 1209 while (atomic_read(&fs_info->scrub_pause_req)) {
826 btrfs_release_path(path); 1210 mutex_unlock(&fs_info->scrub_lock);
827 logical += increment; 1211 wait_event(fs_info->scrub_pause_wait,
828 physical += map->stripe_len; 1212 atomic_read(&fs_info->scrub_pause_req) == 0);
829 cond_resched(); 1213 mutex_lock(&fs_info->scrub_lock);
830 } 1214 }
1215 atomic_dec(&fs_info->scrubs_paused);
1216 mutex_unlock(&fs_info->scrub_lock);
1217 wake_up(&fs_info->scrub_pause_wait);
831 1218
832 /* 1219 /*
833 * collect all data csums for the stripe to avoid seeking during 1220 * collect all data csums for the stripe to avoid seeking during
834 * the scrub. This might currently (crc32) end up to be about 1MB 1221 * the scrub. This might currently (crc32) end up to be about 1MB
835 */ 1222 */
836 start_stripe = 0;
837 blk_start_plug(&plug); 1223 blk_start_plug(&plug);
838again:
839 logical = base + offset + start_stripe * increment;
840 for (i = start_stripe; i < nstripes; ++i) {
841 ret = btrfs_lookup_csums_range(csum_root, logical,
842 logical + map->stripe_len - 1,
843 &sdev->csum_list, 1);
844 if (ret)
845 goto out;
846 1224
847 logical += increment;
848 cond_resched();
849 }
850 /* 1225 /*
851 * now find all extents for each stripe and scrub them 1226 * now find all extents for each stripe and scrub them
852 */ 1227 */
853 logical = base + offset + start_stripe * increment; 1228 logical = base + offset;
854 physical = map->stripes[num].physical + start_stripe * map->stripe_len; 1229 physical = map->stripes[num].physical;
855 ret = 0; 1230 ret = 0;
856 for (i = start_stripe; i < nstripes; ++i) { 1231 for (i = 0; i < nstripes; ++i) {
857 /* 1232 /*
858 * canceled? 1233 * canceled?
859 */ 1234 */
@@ -882,11 +1257,14 @@ again:
882 atomic_dec(&fs_info->scrubs_paused); 1257 atomic_dec(&fs_info->scrubs_paused);
883 mutex_unlock(&fs_info->scrub_lock); 1258 mutex_unlock(&fs_info->scrub_lock);
884 wake_up(&fs_info->scrub_pause_wait); 1259 wake_up(&fs_info->scrub_pause_wait);
885 scrub_free_csums(sdev);
886 start_stripe = i;
887 goto again;
888 } 1260 }
889 1261
1262 ret = btrfs_lookup_csums_range(csum_root, logical,
1263 logical + map->stripe_len - 1,
1264 &sdev->csum_list, 1);
1265 if (ret)
1266 goto out;
1267
890 key.objectid = logical; 1268 key.objectid = logical;
891 key.type = BTRFS_EXTENT_ITEM_KEY; 1269 key.type = BTRFS_EXTENT_ITEM_KEY;
892 key.offset = (u64)0; 1270 key.offset = (u64)0;
@@ -982,7 +1360,6 @@ next:
982 1360
983out: 1361out:
984 blk_finish_plug(&plug); 1362 blk_finish_plug(&plug);
985out_noplug:
986 btrfs_free_path(path); 1363 btrfs_free_path(path);
987 return ret < 0 ? ret : 0; 1364 return ret < 0 ? ret : 0;
988} 1365}
@@ -1253,10 +1630,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1253 ret = scrub_enumerate_chunks(sdev, start, end); 1630 ret = scrub_enumerate_chunks(sdev, start, end);
1254 1631
1255 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 1632 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1256
1257 atomic_dec(&fs_info->scrubs_running); 1633 atomic_dec(&fs_info->scrubs_running);
1258 wake_up(&fs_info->scrub_pause_wait); 1634 wake_up(&fs_info->scrub_pause_wait);
1259 1635
1636 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1637
1260 if (progress) 1638 if (progress)
1261 memcpy(progress, &sdev->stat, sizeof(*progress)); 1639 memcpy(progress, &sdev->stat, sizeof(*progress));
1262 1640
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d7..e28ad4baf483 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
43#include "compat.h" 44#include "compat.h"
44#include "delayed-inode.h" 45#include "delayed-inode.h"
45#include "ctree.h" 46#include "ctree.h"
@@ -58,6 +59,7 @@
58#include <trace/events/btrfs.h> 59#include <trace/events/btrfs.h>
59 60
60static const struct super_operations btrfs_super_ops; 61static const struct super_operations btrfs_super_ops;
62static struct file_system_type btrfs_fs_type;
61 63
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 64static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16]) 65 char nbuf[16])
@@ -162,7 +164,7 @@ enum {
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err, 167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
166}; 168};
167 169
168static match_table_t tokens = { 170static match_table_t tokens = {
@@ -195,6 +197,8 @@ static match_table_t tokens = {
195 {Opt_subvolrootid, "subvolrootid=%d"}, 197 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"}, 198 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"}, 199 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "nospace_cache"},
201 {Opt_recovery, "recovery"},
198 {Opt_err, NULL}, 202 {Opt_err, NULL},
199}; 203};
200 204
@@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206{ 210{
207 struct btrfs_fs_info *info = root->fs_info; 211 struct btrfs_fs_info *info = root->fs_info;
208 substring_t args[MAX_OPT_ARGS]; 212 substring_t args[MAX_OPT_ARGS];
209 char *p, *num, *orig; 213 char *p, *num, *orig = NULL;
214 u64 cache_gen;
210 int intarg; 215 int intarg;
211 int ret = 0; 216 int ret = 0;
212 char *compress_type; 217 char *compress_type;
213 bool compress_force = false; 218 bool compress_force = false;
214 219
220 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
221 if (cache_gen)
222 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
223
215 if (!options) 224 if (!options)
216 return 0; 225 goto out;
217 226
218 /* 227 /*
219 * strsep changes the string, duplicate it because parse_options 228 * strsep changes the string, duplicate it because parse_options
@@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
360 btrfs_set_opt(info->mount_opt, DISCARD); 369 btrfs_set_opt(info->mount_opt, DISCARD);
361 break; 370 break;
362 case Opt_space_cache: 371 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 372 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break; 373 break;
374 case Opt_no_space_cache:
375 printk(KERN_INFO "btrfs: disabling disk space caching\n");
376 btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
377 break;
366 case Opt_inode_cache: 378 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 379 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 380 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
381 printk(KERN_INFO "btrfs: enabling auto defrag"); 393 printk(KERN_INFO "btrfs: enabling auto defrag");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 394 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break; 395 break;
396 case Opt_recovery:
397 printk(KERN_INFO "btrfs: enabling auto recovery");
398 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break;
384 case Opt_err: 400 case Opt_err:
385 printk(KERN_INFO "btrfs: unrecognized mount option " 401 printk(KERN_INFO "btrfs: unrecognized mount option "
386 "'%s'\n", p); 402 "'%s'\n", p);
@@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
391 } 407 }
392 } 408 }
393out: 409out:
410 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
411 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
394 kfree(orig); 412 kfree(orig);
395 return ret; 413 return ret;
396} 414}
@@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) 424 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
407{ 425{
408 substring_t args[MAX_OPT_ARGS]; 426 substring_t args[MAX_OPT_ARGS];
409 char *opts, *orig, *p; 427 char *device_name, *opts, *orig, *p;
410 int error = 0; 428 int error = 0;
411 int intarg; 429 int intarg;
412 430
413 if (!options) 431 if (!options)
414 goto out; 432 return 0;
415 433
416 /* 434 /*
417 * strsep changes the string, duplicate it because parse_options 435 * strsep changes the string, duplicate it because parse_options
@@ -430,6 +448,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
430 token = match_token(p, tokens, args); 448 token = match_token(p, tokens, args);
431 switch (token) { 449 switch (token) {
432 case Opt_subvol: 450 case Opt_subvol:
451 kfree(*subvol_name);
433 *subvol_name = match_strdup(&args[0]); 452 *subvol_name = match_strdup(&args[0]);
434 break; 453 break;
435 case Opt_subvolid: 454 case Opt_subvolid:
@@ -457,29 +476,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
457 } 476 }
458 break; 477 break;
459 case Opt_device: 478 case Opt_device:
460 error = btrfs_scan_one_device(match_strdup(&args[0]), 479 device_name = match_strdup(&args[0]);
480 if (!device_name) {
481 error = -ENOMEM;
482 goto out;
483 }
484 error = btrfs_scan_one_device(device_name,
461 flags, holder, fs_devices); 485 flags, holder, fs_devices);
486 kfree(device_name);
462 if (error) 487 if (error)
463 goto out_free_opts; 488 goto out;
464 break; 489 break;
465 default: 490 default:
466 break; 491 break;
467 } 492 }
468 } 493 }
469 494
470 out_free_opts: 495out:
471 kfree(orig); 496 kfree(orig);
472 out:
473 /*
474 * If no subvolume name is specified we use the default one. Allocate
475 * a copy of the string "." here so that code later in the
476 * mount path doesn't care if it's the default volume or another one.
477 */
478 if (!*subvol_name) {
479 *subvol_name = kstrdup(".", GFP_KERNEL);
480 if (!*subvol_name)
481 return -ENOMEM;
482 }
483 return error; 497 return error;
484} 498}
485 499
@@ -492,7 +506,6 @@ static struct dentry *get_default_root(struct super_block *sb,
492 struct btrfs_path *path; 506 struct btrfs_path *path;
493 struct btrfs_key location; 507 struct btrfs_key location;
494 struct inode *inode; 508 struct inode *inode;
495 struct dentry *dentry;
496 u64 dir_id; 509 u64 dir_id;
497 int new = 0; 510 int new = 0;
498 511
@@ -517,7 +530,7 @@ static struct dentry *get_default_root(struct super_block *sb,
517 * will mount by default if we haven't been given a specific subvolume 530 * will mount by default if we haven't been given a specific subvolume
518 * to mount. 531 * to mount.
519 */ 532 */
520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 533 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 534 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
522 if (IS_ERR(di)) { 535 if (IS_ERR(di)) {
523 btrfs_free_path(path); 536 btrfs_free_path(path);
@@ -566,29 +579,7 @@ setup_root:
566 return dget(sb->s_root); 579 return dget(sb->s_root);
567 } 580 }
568 581
569 if (new) { 582 return d_obtain_alias(inode);
570 const struct qstr name = { .name = "/", .len = 1 };
571
572 /*
573 * New inode, we need to make the dentry a sibling of s_root so
574 * everything gets cleaned up properly on unmount.
575 */
576 dentry = d_alloc(sb->s_root, &name);
577 if (!dentry) {
578 iput(inode);
579 return ERR_PTR(-ENOMEM);
580 }
581 d_splice_alias(inode, dentry);
582 } else {
583 /*
584 * We found the inode in cache, just find a dentry for it and
585 * put the reference to the inode we just got.
586 */
587 dentry = d_find_alias(inode);
588 iput(inode);
589 }
590
591 return dentry;
592} 583}
593 584
594static int btrfs_fill_super(struct super_block *sb, 585static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +710,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
719 seq_puts(seq, ",noacl"); 710 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE)) 711 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache"); 712 seq_puts(seq, ",space_cache");
713 else
714 seq_puts(seq, ",nospace_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE)) 715 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache"); 716 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 717 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +746,111 @@ static int btrfs_set_super(struct super_block *s, void *data)
753 return set_anon_super(s, data); 746 return set_anon_super(s, data);
754} 747}
755 748
749/*
750 * subvolumes are identified by ino 256
751 */
752static inline int is_subvolume_inode(struct inode *inode)
753{
754 if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
755 return 1;
756 return 0;
757}
758
759/*
760 * This will strip out the subvol=%s argument for an argument string and add
761 * subvolid=0 to make sure we get the actual tree root for path walking to the
762 * subvol we want.
763 */
764static char *setup_root_args(char *args)
765{
766 unsigned copied = 0;
767 unsigned len = strlen(args) + 2;
768 char *pos;
769 char *ret;
770
771 /*
772 * We need the same args as before, but minus
773 *
774 * subvol=a
775 *
776 * and add
777 *
778 * subvolid=0
779 *
780 * which is a difference of 2 characters, so we allocate strlen(args) +
781 * 2 characters.
782 */
783 ret = kzalloc(len * sizeof(char), GFP_NOFS);
784 if (!ret)
785 return NULL;
786 pos = strstr(args, "subvol=");
787
788 /* This shouldn't happen, but just in case.. */
789 if (!pos) {
790 kfree(ret);
791 return NULL;
792 }
793
794 /*
795 * The subvol=<> arg is not at the front of the string, copy everybody
796 * up to that into ret.
797 */
798 if (pos != args) {
799 *pos = '\0';
800 strcpy(ret, args);
801 copied += strlen(args);
802 pos++;
803 }
804
805 strncpy(ret + copied, "subvolid=0", len - copied);
806
807 /* Length of subvolid=0 */
808 copied += 10;
809
810 /*
811 * If there is no , after the subvol= option then we know there's no
812 * other options and we can just return.
813 */
814 pos = strchr(pos, ',');
815 if (!pos)
816 return ret;
817
818 /* Copy the rest of the arguments into our buffer */
819 strncpy(ret + copied, pos, len - copied);
820 copied += strlen(pos);
821
822 return ret;
823}
824
825static struct dentry *mount_subvol(const char *subvol_name, int flags,
826 const char *device_name, char *data)
827{
828 struct dentry *root;
829 struct vfsmount *mnt;
830 char *newargs;
831
832 newargs = setup_root_args(data);
833 if (!newargs)
834 return ERR_PTR(-ENOMEM);
835 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
836 newargs);
837 kfree(newargs);
838 if (IS_ERR(mnt))
839 return ERR_CAST(mnt);
840
841 root = mount_subtree(mnt, subvol_name);
842
843 if (!IS_ERR(root) && !is_subvolume_inode(root->d_inode)) {
844 struct super_block *s = root->d_sb;
845 dput(root);
846 root = ERR_PTR(-EINVAL);
847 deactivate_locked_super(s);
848 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
849 subvol_name);
850 }
851
852 return root;
853}
756 854
757/* 855/*
758 * Find a superblock for the given device / mount point. 856 * Find a superblock for the given device / mount point.
@@ -767,7 +865,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
767 struct super_block *s; 865 struct super_block *s;
768 struct dentry *root; 866 struct dentry *root;
769 struct btrfs_fs_devices *fs_devices = NULL; 867 struct btrfs_fs_devices *fs_devices = NULL;
770 struct btrfs_root *tree_root = NULL;
771 struct btrfs_fs_info *fs_info = NULL; 868 struct btrfs_fs_info *fs_info = NULL;
772 fmode_t mode = FMODE_READ; 869 fmode_t mode = FMODE_READ;
773 char *subvol_name = NULL; 870 char *subvol_name = NULL;
@@ -781,21 +878,20 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
781 error = btrfs_parse_early_options(data, mode, fs_type, 878 error = btrfs_parse_early_options(data, mode, fs_type,
782 &subvol_name, &subvol_objectid, 879 &subvol_name, &subvol_objectid,
783 &subvol_rootid, &fs_devices); 880 &subvol_rootid, &fs_devices);
784 if (error) 881 if (error) {
882 kfree(subvol_name);
785 return ERR_PTR(error); 883 return ERR_PTR(error);
884 }
786 885
787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 886 if (subvol_name) {
788 if (error) 887 root = mount_subvol(subvol_name, flags, device_name, data);
789 goto error_free_subvol_name; 888 kfree(subvol_name);
889 return root;
890 }
790 891
791 error = btrfs_open_devices(fs_devices, mode, fs_type); 892 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
792 if (error) 893 if (error)
793 goto error_free_subvol_name; 894 return ERR_PTR(error);
794
795 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
796 error = -EACCES;
797 goto error_close_devices;
798 }
799 895
800 /* 896 /*
801 * Setup a dummy root and fs_info for test/set super. This is because 897 * Setup a dummy root and fs_info for test/set super. This is because
@@ -804,19 +900,40 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
804 * then open_ctree will properly initialize everything later. 900 * then open_ctree will properly initialize everything later.
805 */ 901 */
806 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); 902 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
807 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS); 903 if (!fs_info)
808 if (!fs_info || !tree_root) { 904 return ERR_PTR(-ENOMEM);
905
906 fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
907 if (!fs_info->tree_root) {
809 error = -ENOMEM; 908 error = -ENOMEM;
810 goto error_close_devices; 909 goto error_fs_info;
811 } 910 }
812 fs_info->tree_root = tree_root; 911 fs_info->tree_root->fs_info = fs_info;
813 fs_info->fs_devices = fs_devices; 912 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info; 913
914 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
915 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
916 if (!fs_info->super_copy || !fs_info->super_for_commit) {
917 error = -ENOMEM;
918 goto error_fs_info;
919 }
920
921 error = btrfs_open_devices(fs_devices, mode, fs_type);
922 if (error)
923 goto error_fs_info;
924
925 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
926 error = -EACCES;
927 goto error_close_devices;
928 }
815 929
816 bdev = fs_devices->latest_bdev; 930 bdev = fs_devices->latest_bdev;
817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 931 s = sget(fs_type, btrfs_test_super, btrfs_set_super,
818 if (IS_ERR(s)) 932 fs_info->tree_root);
819 goto error_s; 933 if (IS_ERR(s)) {
934 error = PTR_ERR(s);
935 goto error_close_devices;
936 }
820 937
821 if (s->s_root) { 938 if (s->s_root) {
822 if ((flags ^ s->s_flags) & MS_RDONLY) { 939 if ((flags ^ s->s_flags) & MS_RDONLY) {
@@ -826,75 +943,35 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
826 } 943 }
827 944
828 btrfs_close_devices(fs_devices); 945 btrfs_close_devices(fs_devices);
829 kfree(fs_info); 946 free_fs_info(fs_info);
830 kfree(tree_root);
831 } else { 947 } else {
832 char b[BDEVNAME_SIZE]; 948 char b[BDEVNAME_SIZE];
833 949
834 s->s_flags = flags | MS_NOSEC; 950 s->s_flags = flags | MS_NOSEC;
835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 951 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
952 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
836 error = btrfs_fill_super(s, fs_devices, data, 953 error = btrfs_fill_super(s, fs_devices, data,
837 flags & MS_SILENT ? 1 : 0); 954 flags & MS_SILENT ? 1 : 0);
838 if (error) { 955 if (error) {
839 deactivate_locked_super(s); 956 deactivate_locked_super(s);
840 goto error_free_subvol_name; 957 return ERR_PTR(error);
841 } 958 }
842 959
843 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
844 s->s_flags |= MS_ACTIVE; 960 s->s_flags |= MS_ACTIVE;
845 } 961 }
846 962
847 /* if they gave us a subvolume name bind mount into that */ 963 root = get_default_root(s, subvol_objectid);
848 if (strcmp(subvol_name, ".")) { 964 if (IS_ERR(root)) {
849 struct dentry *new_root; 965 deactivate_locked_super(s);
850 966 return root;
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
858 mutex_lock(&root->d_inode->i_mutex);
859 new_root = lookup_one_len(subvol_name, root,
860 strlen(subvol_name));
861 mutex_unlock(&root->d_inode->i_mutex);
862
863 if (IS_ERR(new_root)) {
864 dput(root);
865 deactivate_locked_super(s);
866 error = PTR_ERR(new_root);
867 goto error_free_subvol_name;
868 }
869 if (!new_root->d_inode) {
870 dput(root);
871 dput(new_root);
872 deactivate_locked_super(s);
873 error = -ENXIO;
874 goto error_free_subvol_name;
875 }
876 dput(root);
877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
885 } 967 }
886 968
887 kfree(subvol_name);
888 return root; 969 return root;
889 970
890error_s:
891 error = PTR_ERR(s);
892error_close_devices: 971error_close_devices:
893 btrfs_close_devices(fs_devices); 972 btrfs_close_devices(fs_devices);
894 kfree(fs_info); 973error_fs_info:
895 kfree(tree_root); 974 free_fs_info(fs_info);
896error_free_subvol_name:
897 kfree(subvol_name);
898 return ERR_PTR(error); 975 return ERR_PTR(error);
899} 976}
900 977
@@ -919,7 +996,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
919 if (root->fs_info->fs_devices->rw_devices == 0) 996 if (root->fs_info->fs_devices->rw_devices == 0)
920 return -EACCES; 997 return -EACCES;
921 998
922 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 999 if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
923 return -EINVAL; 1000 return -EINVAL;
924 1001
925 ret = btrfs_cleanup_fs_roots(root->fs_info); 1002 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -980,7 +1057,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
980 int i = 0, nr_devices; 1057 int i = 0, nr_devices;
981 int ret; 1058 int ret;
982 1059
983 nr_devices = fs_info->fs_devices->rw_devices; 1060 nr_devices = fs_info->fs_devices->open_devices;
984 BUG_ON(!nr_devices); 1061 BUG_ON(!nr_devices);
985 1062
986 devices_info = kmalloc(sizeof(*devices_info) * nr_devices, 1063 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
@@ -1002,8 +1079,8 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1002 else 1079 else
1003 min_stripe_size = BTRFS_STRIPE_LEN; 1080 min_stripe_size = BTRFS_STRIPE_LEN;
1004 1081
1005 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 1082 list_for_each_entry(device, &fs_devices->devices, dev_list) {
1006 if (!device->in_fs_metadata) 1083 if (!device->in_fs_metadata || !device->bdev)
1007 continue; 1084 continue;
1008 1085
1009 avail_space = device->total_bytes - device->bytes_used; 1086 avail_space = device->total_bytes - device->bytes_used;
@@ -1085,7 +1162,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1162static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1086{ 1163{
1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1164 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
1088 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1165 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1089 struct list_head *head = &root->fs_info->space_info; 1166 struct list_head *head = &root->fs_info->space_info;
1090 struct btrfs_space_info *found; 1167 struct btrfs_space_info *found;
1091 u64 total_used = 0; 1168 u64 total_used = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a155..81376d94cd3c 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
55 struct btrfs_transaction *cur_trans; 55 struct btrfs_transaction *cur_trans;
56 56
57 spin_lock(&root->fs_info->trans_lock); 57 spin_lock(&root->fs_info->trans_lock);
58loop:
58 if (root->fs_info->trans_no_join) { 59 if (root->fs_info->trans_no_join) {
59 if (!nofail) { 60 if (!nofail) {
60 spin_unlock(&root->fs_info->trans_lock); 61 spin_unlock(&root->fs_info->trans_lock);
@@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
75 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS); 76 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
76 if (!cur_trans) 77 if (!cur_trans)
77 return -ENOMEM; 78 return -ENOMEM;
79
78 spin_lock(&root->fs_info->trans_lock); 80 spin_lock(&root->fs_info->trans_lock);
79 if (root->fs_info->running_transaction) { 81 if (root->fs_info->running_transaction) {
82 /*
83 * someone started a transaction after we unlocked. Make sure
84 * to redo the trans_no_join checks above
85 */
80 kmem_cache_free(btrfs_transaction_cachep, cur_trans); 86 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
81 cur_trans = root->fs_info->running_transaction; 87 cur_trans = root->fs_info->running_transaction;
82 atomic_inc(&cur_trans->use_count); 88 goto loop;
83 atomic_inc(&cur_trans->num_writers);
84 cur_trans->num_joined++;
85 spin_unlock(&root->fs_info->trans_lock);
86 return 0;
87 } 89 }
90
88 atomic_set(&cur_trans->num_writers, 1); 91 atomic_set(&cur_trans->num_writers, 1);
89 cur_trans->num_joined = 0; 92 cur_trans->num_joined = 0;
90 init_waitqueue_head(&cur_trans->writer_wait); 93 init_waitqueue_head(&cur_trans->writer_wait);
@@ -275,7 +278,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
275 */ 278 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) { 279 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 280 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root, 281 ret = btrfs_block_rsv_add(root,
279 &root->fs_info->trans_block_rsv, 282 &root->fs_info->trans_block_rsv,
280 num_bytes); 283 num_bytes);
281 if (ret) 284 if (ret)
@@ -418,8 +421,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 421 struct btrfs_root *root)
419{ 422{
420 int ret; 423 int ret;
421 ret = btrfs_block_rsv_check(trans, root, 424
422 &root->fs_info->global_block_rsv, 0, 5); 425 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
423 return ret ? 1 : 0; 426 return ret ? 1 : 0;
424} 427}
425 428
@@ -427,17 +430,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root) 430 struct btrfs_root *root)
428{ 431{
429 struct btrfs_transaction *cur_trans = trans->transaction; 432 struct btrfs_transaction *cur_trans = trans->transaction;
433 struct btrfs_block_rsv *rsv = trans->block_rsv;
430 int updates; 434 int updates;
431 435
432 smp_mb(); 436 smp_mb();
433 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 437 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
434 return 1; 438 return 1;
435 439
440 /*
441 * We need to do this in case we're deleting csums so the global block
442 * rsv get's used instead of the csum block rsv.
443 */
444 trans->block_rsv = NULL;
445
436 updates = trans->delayed_ref_updates; 446 updates = trans->delayed_ref_updates;
437 trans->delayed_ref_updates = 0; 447 trans->delayed_ref_updates = 0;
438 if (updates) 448 if (updates)
439 btrfs_run_delayed_refs(trans, root, updates); 449 btrfs_run_delayed_refs(trans, root, updates);
440 450
451 trans->block_rsv = rsv;
452
441 return should_end_transaction(trans, root); 453 return should_end_transaction(trans, root);
442} 454}
443 455
@@ -453,6 +465,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
453 return 0; 465 return 0;
454 } 466 }
455 467
468 btrfs_trans_release_metadata(trans, root);
469 trans->block_rsv = NULL;
456 while (count < 4) { 470 while (count < 4) {
457 unsigned long cur = trans->delayed_ref_updates; 471 unsigned long cur = trans->delayed_ref_updates;
458 trans->delayed_ref_updates = 0; 472 trans->delayed_ref_updates = 0;
@@ -473,8 +487,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
473 count++; 487 count++;
474 } 488 }
475 489
476 btrfs_trans_release_metadata(trans, root);
477
478 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 490 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
479 should_end_transaction(trans, root)) { 491 should_end_transaction(trans, root)) {
480 trans->transaction->blocked = 1; 492 trans->transaction->blocked = 1;
@@ -562,50 +574,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562int btrfs_write_marked_extents(struct btrfs_root *root, 574int btrfs_write_marked_extents(struct btrfs_root *root,
563 struct extent_io_tree *dirty_pages, int mark) 575 struct extent_io_tree *dirty_pages, int mark)
564{ 576{
565 int ret;
566 int err = 0; 577 int err = 0;
567 int werr = 0; 578 int werr = 0;
568 struct page *page; 579 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
569 struct inode *btree_inode = root->fs_info->btree_inode;
570 u64 start = 0; 580 u64 start = 0;
571 u64 end; 581 u64 end;
572 unsigned long index;
573
574 while (1) {
575 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
576 mark);
577 if (ret)
578 break;
579 while (start <= end) {
580 cond_resched();
581
582 index = start >> PAGE_CACHE_SHIFT;
583 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
584 page = find_get_page(btree_inode->i_mapping, index);
585 if (!page)
586 continue;
587
588 btree_lock_page_hook(page);
589 if (!page->mapping) {
590 unlock_page(page);
591 page_cache_release(page);
592 continue;
593 }
594 582
595 if (PageWriteback(page)) { 583 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
596 if (PageDirty(page)) 584 mark)) {
597 wait_on_page_writeback(page); 585 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
598 else { 586 GFP_NOFS);
599 unlock_page(page); 587 err = filemap_fdatawrite_range(mapping, start, end);
600 page_cache_release(page); 588 if (err)
601 continue; 589 werr = err;
602 } 590 cond_resched();
603 } 591 start = end + 1;
604 err = write_one_page(page, 0);
605 if (err)
606 werr = err;
607 page_cache_release(page);
608 }
609 } 592 }
610 if (err) 593 if (err)
611 werr = err; 594 werr = err;
@@ -621,39 +604,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
621int btrfs_wait_marked_extents(struct btrfs_root *root, 604int btrfs_wait_marked_extents(struct btrfs_root *root,
622 struct extent_io_tree *dirty_pages, int mark) 605 struct extent_io_tree *dirty_pages, int mark)
623{ 606{
624 int ret;
625 int err = 0; 607 int err = 0;
626 int werr = 0; 608 int werr = 0;
627 struct page *page; 609 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
628 struct inode *btree_inode = root->fs_info->btree_inode;
629 u64 start = 0; 610 u64 start = 0;
630 u64 end; 611 u64 end;
631 unsigned long index;
632 612
633 while (1) { 613 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
634 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 614 EXTENT_NEED_WAIT)) {
635 mark); 615 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
636 if (ret) 616 err = filemap_fdatawait_range(mapping, start, end);
637 break; 617 if (err)
638 618 werr = err;
639 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 619 cond_resched();
640 while (start <= end) { 620 start = end + 1;
641 index = start >> PAGE_CACHE_SHIFT;
642 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
643 page = find_get_page(btree_inode->i_mapping, index);
644 if (!page)
645 continue;
646 if (PageDirty(page)) {
647 btree_lock_page_hook(page);
648 wait_on_page_writeback(page);
649 err = write_one_page(page, 0);
650 if (err)
651 werr = err;
652 }
653 wait_on_page_writeback(page);
654 page_cache_release(page);
655 cond_resched();
656 }
657 } 621 }
658 if (err) 622 if (err)
659 werr = err; 623 werr = err;
@@ -673,7 +637,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
673 637
674 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 638 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
675 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 639 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
676 return ret || ret2; 640
641 if (ret)
642 return ret;
643 if (ret2)
644 return ret2;
645 return 0;
677} 646}
678 647
679int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 648int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -816,6 +785,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
816 785
817 btrfs_save_ino_cache(root, trans); 786 btrfs_save_ino_cache(root, trans);
818 787
788 /* see comments in should_cow_block() */
789 root->force_cow = 0;
790 smp_wmb();
791
819 if (root->commit_root != root->node) { 792 if (root->commit_root != root->node) {
820 mutex_lock(&root->fs_commit_mutex); 793 mutex_lock(&root->fs_commit_mutex);
821 switch_commit_root(root); 794 switch_commit_root(root);
@@ -911,11 +884,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
911 } 884 }
912 885
913 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 886 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
914 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
915 887
916 if (to_reserve > 0) { 888 if (to_reserve > 0) {
917 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 889 ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
918 to_reserve); 890 to_reserve);
919 if (ret) { 891 if (ret) {
920 pending->error = ret; 892 pending->error = ret;
921 goto fail; 893 goto fail;
@@ -979,6 +951,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
979 btrfs_tree_unlock(old); 951 btrfs_tree_unlock(old);
980 free_extent_buffer(old); 952 free_extent_buffer(old);
981 953
954 /* see comments in should_cow_block() */
955 root->force_cow = 1;
956 smp_wmb();
957
982 btrfs_set_root_node(new_root_item, tmp); 958 btrfs_set_root_node(new_root_item, tmp);
983 /* record when the snapshot was created in key.offset */ 959 /* record when the snapshot was created in key.offset */
984 key.offset = trans->transid; 960 key.offset = trans->transid;
@@ -1002,7 +978,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 BUG_ON(IS_ERR(pending->snap)); 978 BUG_ON(IS_ERR(pending->snap));
1003 979
1004 btrfs_reloc_post_snapshot(trans, pending); 980 btrfs_reloc_post_snapshot(trans, pending);
1005 btrfs_orphan_post_snapshot(trans, pending);
1006fail: 981fail:
1007 kfree(new_root_item); 982 kfree(new_root_item);
1008 trans->block_rsv = rsv; 983 trans->block_rsv = rsv;
@@ -1032,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root)
1032 struct btrfs_root_item *root_item; 1007 struct btrfs_root_item *root_item;
1033 struct btrfs_super_block *super; 1008 struct btrfs_super_block *super;
1034 1009
1035 super = &root->fs_info->super_copy; 1010 super = root->fs_info->super_copy;
1036 1011
1037 root_item = &root->fs_info->chunk_root->root_item; 1012 root_item = &root->fs_info->chunk_root->root_item;
1038 super->chunk_root = root_item->bytenr; 1013 super->chunk_root = root_item->bytenr;
@@ -1043,7 +1018,7 @@ static void update_super_roots(struct btrfs_root *root)
1043 super->root = root_item->bytenr; 1018 super->root = root_item->bytenr;
1044 super->generation = root_item->generation; 1019 super->generation = root_item->generation;
1045 super->root_level = root_item->level; 1020 super->root_level = root_item->level;
1046 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 1021 if (btrfs_test_opt(root, SPACE_CACHE))
1047 super->cache_generation = root_item->generation; 1022 super->cache_generation = root_item->generation;
1048} 1023}
1049 1024
@@ -1168,14 +1143,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1168 1143
1169 btrfs_run_ordered_operations(root, 0); 1144 btrfs_run_ordered_operations(root, 0);
1170 1145
1146 btrfs_trans_release_metadata(trans, root);
1147 trans->block_rsv = NULL;
1148
1171 /* make a pass through all the delayed refs we have so far 1149 /* make a pass through all the delayed refs we have so far
1172 * any runnings procs may add more while we are here 1150 * any runnings procs may add more while we are here
1173 */ 1151 */
1174 ret = btrfs_run_delayed_refs(trans, root, 0); 1152 ret = btrfs_run_delayed_refs(trans, root, 0);
1175 BUG_ON(ret); 1153 BUG_ON(ret);
1176 1154
1177 btrfs_trans_release_metadata(trans, root);
1178
1179 cur_trans = trans->transaction; 1155 cur_trans = trans->transaction;
1180 /* 1156 /*
1181 * set the flushing flag so procs in this transaction have to 1157 * set the flushing flag so procs in this transaction have to
@@ -1341,12 +1317,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1341 update_super_roots(root); 1317 update_super_roots(root);
1342 1318
1343 if (!root->fs_info->log_root_recovering) { 1319 if (!root->fs_info->log_root_recovering) {
1344 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1320 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
1345 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); 1321 btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1346 } 1322 }
1347 1323
1348 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1324 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1349 sizeof(root->fs_info->super_copy)); 1325 sizeof(*root->fs_info->super_copy));
1350 1326
1351 trans->transaction->blocked = 0; 1327 trans->transaction->blocked = 0;
1352 spin_lock(&root->fs_info->trans_lock); 1328 spin_lock(&root->fs_info->trans_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 786639fca067..3568374d419d 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
276 struct walk_control *wc, u64 gen) 276 struct walk_control *wc, u64 gen)
277{ 277{
278 if (wc->pin) 278 if (wc->pin)
279 btrfs_pin_extent(log->fs_info->extent_root, 279 btrfs_pin_extent_for_log_replay(wc->trans,
280 eb->start, eb->len, 0); 280 log->fs_info->extent_root,
281 eb->start, eb->len);
281 282
282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (btrfs_buffer_uptodate(eb, gen)) {
283 if (wc->write) 284 if (wc->write)
@@ -1030,7 +1031,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1030 } 1031 }
1031 btrfs_release_path(path); 1032 btrfs_release_path(path);
1032 if (nlink != inode->i_nlink) { 1033 if (nlink != inode->i_nlink) {
1033 inode->i_nlink = nlink; 1034 set_nlink(inode, nlink);
1034 btrfs_update_inode(trans, root, inode); 1035 btrfs_update_inode(trans, root, inode);
1035 } 1036 }
1036 BTRFS_I(inode)->index_cnt = (u64)-1; 1037 BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1760 1761
1761 WARN_ON(root_owner != 1762 WARN_ON(root_owner !=
1762 BTRFS_TREE_LOG_OBJECTID); 1763 BTRFS_TREE_LOG_OBJECTID);
1763 ret = btrfs_free_reserved_extent(root, 1764 ret = btrfs_free_and_pin_reserved_extent(root,
1764 bytenr, blocksize); 1765 bytenr, blocksize);
1765 BUG_ON(ret); 1766 BUG_ON(ret);
1766 } 1767 }
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1828 btrfs_tree_unlock(next); 1829 btrfs_tree_unlock(next);
1829 1830
1830 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1831 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1831 ret = btrfs_free_reserved_extent(root, 1832 ret = btrfs_free_and_pin_reserved_extent(root,
1832 path->nodes[*level]->start, 1833 path->nodes[*level]->start,
1833 path->nodes[*level]->len); 1834 path->nodes[*level]->len);
1834 BUG_ON(ret); 1835 BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1897 1898
1898 WARN_ON(log->root_key.objectid != 1899 WARN_ON(log->root_key.objectid !=
1899 BTRFS_TREE_LOG_OBJECTID); 1900 BTRFS_TREE_LOG_OBJECTID);
1900 ret = btrfs_free_reserved_extent(log, next->start, 1901 ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1901 next->len); 1902 next->len);
1902 BUG_ON(ret); 1903 BUG_ON(ret);
1903 } 1904 }
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2013 /* wait for previous tree log sync to complete */ 2014 /* wait for previous tree log sync to complete */
2014 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2015 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2015 wait_log_commit(trans, root, root->log_transid - 1); 2016 wait_log_commit(trans, root, root->log_transid - 1);
2016
2017 while (1) { 2017 while (1) {
2018 unsigned long batch = root->log_batch; 2018 unsigned long batch = root->log_batch;
2019 if (root->log_multiple_pids) { 2019 /* when we're on an ssd, just kick the log commit out */
2020 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2020 mutex_unlock(&root->log_mutex); 2021 mutex_unlock(&root->log_mutex);
2021 schedule_timeout_uninterruptible(1); 2022 schedule_timeout_uninterruptible(1);
2022 mutex_lock(&root->log_mutex); 2023 mutex_lock(&root->log_mutex);
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2117 BUG_ON(ret); 2118 BUG_ON(ret);
2118 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2119 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2119 2120
2120 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2121 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2121 log_root_tree->node->start); 2122 log_root_tree->node->start);
2122 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2123 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2123 btrfs_header_level(log_root_tree->node)); 2124 btrfs_header_level(log_root_tree->node));
2124 2125
2125 log_root_tree->log_batch = 0; 2126 log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..c37433d3cd82 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,
366 } 366 }
367 INIT_LIST_HEAD(&device->dev_alloc_list); 367 INIT_LIST_HEAD(&device->dev_alloc_list);
368 368
369 /* init readahead state */
370 spin_lock_init(&device->reada_lock);
371 device->reada_curr_zone = NULL;
372 atomic_set(&device->reada_in_flight, 0);
373 device->reada_next = 0;
374 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
375 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
376
369 mutex_lock(&fs_devices->device_list_mutex); 377 mutex_lock(&fs_devices->device_list_mutex);
370 list_add_rcu(&device->dev_list, &fs_devices->devices); 378 list_add_rcu(&device->dev_list, &fs_devices->devices);
371 mutex_unlock(&fs_devices->device_list_mutex); 379 mutex_unlock(&fs_devices->device_list_mutex);
@@ -597,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
597 set_blocksize(bdev, 4096); 605 set_blocksize(bdev, 4096);
598 606
599 bh = btrfs_read_dev_super(bdev); 607 bh = btrfs_read_dev_super(bdev);
600 if (!bh) { 608 if (!bh)
601 ret = -EINVAL;
602 goto error_close; 609 goto error_close;
603 }
604 610
605 disk_super = (struct btrfs_super_block *)bh->b_data; 611 disk_super = (struct btrfs_super_block *)bh->b_data;
606 devid = btrfs_stack_device_id(&disk_super->dev_item); 612 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +661,7 @@ error:
655 continue; 661 continue;
656 } 662 }
657 if (fs_devices->open_devices == 0) { 663 if (fs_devices->open_devices == 0) {
658 ret = -EIO; 664 ret = -EINVAL;
659 goto out; 665 goto out;
660 } 666 }
661 fs_devices->seeding = seeding; 667 fs_devices->seeding = seeding;
@@ -993,7 +999,7 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
993 key.objectid = device->devid; 999 key.objectid = device->devid;
994 key.offset = start; 1000 key.offset = start;
995 key.type = BTRFS_DEV_EXTENT_KEY; 1001 key.type = BTRFS_DEV_EXTENT_KEY;
996 1002again:
997 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1003 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
998 if (ret > 0) { 1004 if (ret > 0) {
999 ret = btrfs_previous_item(root, path, key.objectid, 1005 ret = btrfs_previous_item(root, path, key.objectid,
@@ -1006,6 +1012,9 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1006 struct btrfs_dev_extent); 1012 struct btrfs_dev_extent);
1007 BUG_ON(found_key.offset > start || found_key.offset + 1013 BUG_ON(found_key.offset > start || found_key.offset +
1008 btrfs_dev_extent_length(leaf, extent) < start); 1014 btrfs_dev_extent_length(leaf, extent) < start);
1015 key = found_key;
1016 btrfs_release_path(path);
1017 goto again;
1009 } else if (ret == 0) { 1018 } else if (ret == 0) {
1010 leaf = path->nodes[0]; 1019 leaf = path->nodes[0];
1011 extent = btrfs_item_ptr(leaf, path->slots[0], 1020 extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1013,8 +1022,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1022 }
1014 BUG_ON(ret); 1023 BUG_ON(ret);
1015 1024
1016 if (device->bytes_used > 0) 1025 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1026 u64 len = btrfs_dev_extent_length(leaf, extent);
1027 device->bytes_used -= len;
1028 spin_lock(&root->fs_info->free_chunk_lock);
1029 root->fs_info->free_chunk_space += len;
1030 spin_unlock(&root->fs_info->free_chunk_lock);
1031 }
1018 ret = btrfs_del_item(trans, root, path); 1032 ret = btrfs_del_item(trans, root, path);
1019 1033
1020out: 1034out:
@@ -1356,6 +1370,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1370 if (ret)
1357 goto error_undo; 1371 goto error_undo;
1358 1372
1373 spin_lock(&root->fs_info->free_chunk_lock);
1374 root->fs_info->free_chunk_space = device->total_bytes -
1375 device->bytes_used;
1376 spin_unlock(&root->fs_info->free_chunk_lock);
1377
1359 device->in_fs_metadata = 0; 1378 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1379 btrfs_scrub_cancel_dev(root, device);
1361 1380
@@ -1387,8 +1406,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1387 call_rcu(&device->rcu, free_device); 1406 call_rcu(&device->rcu, free_device);
1388 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1407 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1389 1408
1390 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1409 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1391 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1410 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1392 1411
1393 if (cur_devices->open_devices == 0) { 1412 if (cur_devices->open_devices == 0) {
1394 struct btrfs_fs_devices *fs_devices; 1413 struct btrfs_fs_devices *fs_devices;
@@ -1450,7 +1469,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1450 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1469 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1451 struct btrfs_fs_devices *old_devices; 1470 struct btrfs_fs_devices *old_devices;
1452 struct btrfs_fs_devices *seed_devices; 1471 struct btrfs_fs_devices *seed_devices;
1453 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1472 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1454 struct btrfs_device *device; 1473 struct btrfs_device *device;
1455 u64 super_flags; 1474 u64 super_flags;
1456 1475
@@ -1691,15 +1710,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1710 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1711 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1712
1713 spin_lock(&root->fs_info->free_chunk_lock);
1714 root->fs_info->free_chunk_space += device->total_bytes;
1715 spin_unlock(&root->fs_info->free_chunk_lock);
1716
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1717 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1718 root->fs_info->fs_devices->rotating = 1;
1696 1719
1697 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1720 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1698 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1721 btrfs_set_super_total_bytes(root->fs_info->super_copy,
1699 total_bytes + device->total_bytes); 1722 total_bytes + device->total_bytes);
1700 1723
1701 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1724 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1702 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1725 btrfs_set_super_num_devices(root->fs_info->super_copy,
1703 total_bytes + 1); 1726 total_bytes + 1);
1704 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1727 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1705 1728
@@ -1790,7 +1813,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1790 struct btrfs_device *device, u64 new_size) 1813 struct btrfs_device *device, u64 new_size)
1791{ 1814{
1792 struct btrfs_super_block *super_copy = 1815 struct btrfs_super_block *super_copy =
1793 &device->dev_root->fs_info->super_copy; 1816 device->dev_root->fs_info->super_copy;
1794 u64 old_total = btrfs_super_total_bytes(super_copy); 1817 u64 old_total = btrfs_super_total_bytes(super_copy);
1795 u64 diff = new_size - device->total_bytes; 1818 u64 diff = new_size - device->total_bytes;
1796 1819
@@ -1849,7 +1872,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1872static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1850 chunk_offset) 1873 chunk_offset)
1851{ 1874{
1852 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1875 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1853 struct btrfs_disk_key *disk_key; 1876 struct btrfs_disk_key *disk_key;
1854 struct btrfs_chunk *chunk; 1877 struct btrfs_chunk *chunk;
1855 u8 *ptr; 1878 u8 *ptr;
@@ -2175,7 +2198,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2175 bool retried = false; 2198 bool retried = false;
2176 struct extent_buffer *l; 2199 struct extent_buffer *l;
2177 struct btrfs_key key; 2200 struct btrfs_key key;
2178 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2201 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2179 u64 old_total = btrfs_super_total_bytes(super_copy); 2202 u64 old_total = btrfs_super_total_bytes(super_copy);
2180 u64 old_size = device->total_bytes; 2203 u64 old_size = device->total_bytes;
2181 u64 diff = device->total_bytes - new_size; 2204 u64 diff = device->total_bytes - new_size;
@@ -2192,8 +2215,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2215 lock_chunks(root);
2193 2216
2194 device->total_bytes = new_size; 2217 device->total_bytes = new_size;
2195 if (device->writeable) 2218 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2219 device->fs_devices->total_rw_bytes -= diff;
2220 spin_lock(&root->fs_info->free_chunk_lock);
2221 root->fs_info->free_chunk_space -= diff;
2222 spin_unlock(&root->fs_info->free_chunk_lock);
2223 }
2197 unlock_chunks(root); 2224 unlock_chunks(root);
2198 2225
2199again: 2226again:
@@ -2257,6 +2284,9 @@ again:
2257 device->total_bytes = old_size; 2284 device->total_bytes = old_size;
2258 if (device->writeable) 2285 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2286 device->fs_devices->total_rw_bytes += diff;
2287 spin_lock(&root->fs_info->free_chunk_lock);
2288 root->fs_info->free_chunk_space += diff;
2289 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2290 unlock_chunks(root);
2261 goto done; 2291 goto done;
2262 } 2292 }
@@ -2292,7 +2322,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2292 struct btrfs_key *key, 2322 struct btrfs_key *key,
2293 struct btrfs_chunk *chunk, int item_size) 2323 struct btrfs_chunk *chunk, int item_size)
2294{ 2324{
2295 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2325 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2296 struct btrfs_disk_key disk_key; 2326 struct btrfs_disk_key disk_key;
2297 u32 array_size; 2327 u32 array_size;
2298 u8 *ptr; 2328 u8 *ptr;
@@ -2615,6 +2645,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2645 index++;
2616 } 2646 }
2617 2647
2648 spin_lock(&extent_root->fs_info->free_chunk_lock);
2649 extent_root->fs_info->free_chunk_space -= (stripe_size *
2650 map->num_stripes);
2651 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2652
2618 index = 0; 2653 index = 0;
2619 stripe = &chunk->stripe; 2654 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2655 while (index < map->num_stripes) {
@@ -2848,7 +2883,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2848 2883
2849static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2884static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2850 u64 logical, u64 *length, 2885 u64 logical, u64 *length,
2851 struct btrfs_multi_bio **multi_ret, 2886 struct btrfs_bio **bbio_ret,
2852 int mirror_num) 2887 int mirror_num)
2853{ 2888{
2854 struct extent_map *em; 2889 struct extent_map *em;
@@ -2866,18 +2901,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2866 int i; 2901 int i;
2867 int num_stripes; 2902 int num_stripes;
2868 int max_errors = 0; 2903 int max_errors = 0;
2869 struct btrfs_multi_bio *multi = NULL; 2904 struct btrfs_bio *bbio = NULL;
2870 2905
2871 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2906 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2872 stripes_allocated = 1; 2907 stripes_allocated = 1;
2873again: 2908again:
2874 if (multi_ret) { 2909 if (bbio_ret) {
2875 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2910 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2876 GFP_NOFS); 2911 GFP_NOFS);
2877 if (!multi) 2912 if (!bbio)
2878 return -ENOMEM; 2913 return -ENOMEM;
2879 2914
2880 atomic_set(&multi->error, 0); 2915 atomic_set(&bbio->error, 0);
2881 } 2916 }
2882 2917
2883 read_lock(&em_tree->lock); 2918 read_lock(&em_tree->lock);
@@ -2898,7 +2933,7 @@ again:
2898 if (mirror_num > map->num_stripes) 2933 if (mirror_num > map->num_stripes)
2899 mirror_num = 0; 2934 mirror_num = 0;
2900 2935
2901 /* if our multi bio struct is too small, back off and try again */ 2936 /* if our btrfs_bio struct is too small, back off and try again */
2902 if (rw & REQ_WRITE) { 2937 if (rw & REQ_WRITE) {
2903 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2938 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2904 BTRFS_BLOCK_GROUP_DUP)) { 2939 BTRFS_BLOCK_GROUP_DUP)) {
@@ -2917,11 +2952,11 @@ again:
2917 stripes_required = map->num_stripes; 2952 stripes_required = map->num_stripes;
2918 } 2953 }
2919 } 2954 }
2920 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2955 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2921 stripes_allocated < stripes_required) { 2956 stripes_allocated < stripes_required) {
2922 stripes_allocated = map->num_stripes; 2957 stripes_allocated = map->num_stripes;
2923 free_extent_map(em); 2958 free_extent_map(em);
2924 kfree(multi); 2959 kfree(bbio);
2925 goto again; 2960 goto again;
2926 } 2961 }
2927 stripe_nr = offset; 2962 stripe_nr = offset;
@@ -2950,7 +2985,7 @@ again:
2950 *length = em->len - offset; 2985 *length = em->len - offset;
2951 } 2986 }
2952 2987
2953 if (!multi_ret) 2988 if (!bbio_ret)
2954 goto out; 2989 goto out;
2955 2990
2956 num_stripes = 1; 2991 num_stripes = 1;
@@ -2975,13 +3010,17 @@ again:
2975 stripe_index = find_live_mirror(map, 0, 3010 stripe_index = find_live_mirror(map, 0,
2976 map->num_stripes, 3011 map->num_stripes,
2977 current->pid % map->num_stripes); 3012 current->pid % map->num_stripes);
3013 mirror_num = stripe_index + 1;
2978 } 3014 }
2979 3015
2980 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3016 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2981 if (rw & (REQ_WRITE | REQ_DISCARD)) 3017 if (rw & (REQ_WRITE | REQ_DISCARD)) {
2982 num_stripes = map->num_stripes; 3018 num_stripes = map->num_stripes;
2983 else if (mirror_num) 3019 } else if (mirror_num) {
2984 stripe_index = mirror_num - 1; 3020 stripe_index = mirror_num - 1;
3021 } else {
3022 mirror_num = 1;
3023 }
2985 3024
2986 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3025 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2987 int factor = map->num_stripes / map->sub_stripes; 3026 int factor = map->num_stripes / map->sub_stripes;
@@ -3001,6 +3040,7 @@ again:
3001 stripe_index = find_live_mirror(map, stripe_index, 3040 stripe_index = find_live_mirror(map, stripe_index,
3002 map->sub_stripes, stripe_index + 3041 map->sub_stripes, stripe_index +
3003 current->pid % map->sub_stripes); 3042 current->pid % map->sub_stripes);
3043 mirror_num = stripe_index + 1;
3004 } 3044 }
3005 } else { 3045 } else {
3006 /* 3046 /*
@@ -3009,15 +3049,16 @@ again:
3009 * stripe_index is the number of our device in the stripe array 3049 * stripe_index is the number of our device in the stripe array
3010 */ 3050 */
3011 stripe_index = do_div(stripe_nr, map->num_stripes); 3051 stripe_index = do_div(stripe_nr, map->num_stripes);
3052 mirror_num = stripe_index + 1;
3012 } 3053 }
3013 BUG_ON(stripe_index >= map->num_stripes); 3054 BUG_ON(stripe_index >= map->num_stripes);
3014 3055
3015 if (rw & REQ_DISCARD) { 3056 if (rw & REQ_DISCARD) {
3016 for (i = 0; i < num_stripes; i++) { 3057 for (i = 0; i < num_stripes; i++) {
3017 multi->stripes[i].physical = 3058 bbio->stripes[i].physical =
3018 map->stripes[stripe_index].physical + 3059 map->stripes[stripe_index].physical +
3019 stripe_offset + stripe_nr * map->stripe_len; 3060 stripe_offset + stripe_nr * map->stripe_len;
3020 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3061 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3021 3062
3022 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3063 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3023 u64 stripes; 3064 u64 stripes;
@@ -3038,16 +3079,16 @@ again:
3038 } 3079 }
3039 stripes = stripe_nr_end - 1 - j; 3080 stripes = stripe_nr_end - 1 - j;
3040 do_div(stripes, map->num_stripes); 3081 do_div(stripes, map->num_stripes);
3041 multi->stripes[i].length = map->stripe_len * 3082 bbio->stripes[i].length = map->stripe_len *
3042 (stripes - stripe_nr + 1); 3083 (stripes - stripe_nr + 1);
3043 3084
3044 if (i == 0) { 3085 if (i == 0) {
3045 multi->stripes[i].length -= 3086 bbio->stripes[i].length -=
3046 stripe_offset; 3087 stripe_offset;
3047 stripe_offset = 0; 3088 stripe_offset = 0;
3048 } 3089 }
3049 if (stripe_index == last_stripe) 3090 if (stripe_index == last_stripe)
3050 multi->stripes[i].length -= 3091 bbio->stripes[i].length -=
3051 stripe_end_offset; 3092 stripe_end_offset;
3052 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3093 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3053 u64 stripes; 3094 u64 stripes;
@@ -3072,11 +3113,11 @@ again:
3072 } 3113 }
3073 stripes = stripe_nr_end - 1 - j; 3114 stripes = stripe_nr_end - 1 - j;
3074 do_div(stripes, factor); 3115 do_div(stripes, factor);
3075 multi->stripes[i].length = map->stripe_len * 3116 bbio->stripes[i].length = map->stripe_len *
3076 (stripes - stripe_nr + 1); 3117 (stripes - stripe_nr + 1);
3077 3118
3078 if (i < map->sub_stripes) { 3119 if (i < map->sub_stripes) {
3079 multi->stripes[i].length -= 3120 bbio->stripes[i].length -=
3080 stripe_offset; 3121 stripe_offset;
3081 if (i == map->sub_stripes - 1) 3122 if (i == map->sub_stripes - 1)
3082 stripe_offset = 0; 3123 stripe_offset = 0;
@@ -3084,11 +3125,11 @@ again:
3084 if (stripe_index >= last_stripe && 3125 if (stripe_index >= last_stripe &&
3085 stripe_index <= (last_stripe + 3126 stripe_index <= (last_stripe +
3086 map->sub_stripes - 1)) { 3127 map->sub_stripes - 1)) {
3087 multi->stripes[i].length -= 3128 bbio->stripes[i].length -=
3088 stripe_end_offset; 3129 stripe_end_offset;
3089 } 3130 }
3090 } else 3131 } else
3091 multi->stripes[i].length = *length; 3132 bbio->stripes[i].length = *length;
3092 3133
3093 stripe_index++; 3134 stripe_index++;
3094 if (stripe_index == map->num_stripes) { 3135 if (stripe_index == map->num_stripes) {
@@ -3099,19 +3140,20 @@ again:
3099 } 3140 }
3100 } else { 3141 } else {
3101 for (i = 0; i < num_stripes; i++) { 3142 for (i = 0; i < num_stripes; i++) {
3102 multi->stripes[i].physical = 3143 bbio->stripes[i].physical =
3103 map->stripes[stripe_index].physical + 3144 map->stripes[stripe_index].physical +
3104 stripe_offset + 3145 stripe_offset +
3105 stripe_nr * map->stripe_len; 3146 stripe_nr * map->stripe_len;
3106 multi->stripes[i].dev = 3147 bbio->stripes[i].dev =
3107 map->stripes[stripe_index].dev; 3148 map->stripes[stripe_index].dev;
3108 stripe_index++; 3149 stripe_index++;
3109 } 3150 }
3110 } 3151 }
3111 if (multi_ret) { 3152 if (bbio_ret) {
3112 *multi_ret = multi; 3153 *bbio_ret = bbio;
3113 multi->num_stripes = num_stripes; 3154 bbio->num_stripes = num_stripes;
3114 multi->max_errors = max_errors; 3155 bbio->max_errors = max_errors;
3156 bbio->mirror_num = mirror_num;
3115 } 3157 }
3116out: 3158out:
3117 free_extent_map(em); 3159 free_extent_map(em);
@@ -3120,9 +3162,9 @@ out:
3120 3162
3121int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3163int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3122 u64 logical, u64 *length, 3164 u64 logical, u64 *length,
3123 struct btrfs_multi_bio **multi_ret, int mirror_num) 3165 struct btrfs_bio **bbio_ret, int mirror_num)
3124{ 3166{
3125 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3167 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3126 mirror_num); 3168 mirror_num);
3127} 3169}
3128 3170
@@ -3191,28 +3233,30 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3191 return 0; 3233 return 0;
3192} 3234}
3193 3235
3194static void end_bio_multi_stripe(struct bio *bio, int err) 3236static void btrfs_end_bio(struct bio *bio, int err)
3195{ 3237{
3196 struct btrfs_multi_bio *multi = bio->bi_private; 3238 struct btrfs_bio *bbio = bio->bi_private;
3197 int is_orig_bio = 0; 3239 int is_orig_bio = 0;
3198 3240
3199 if (err) 3241 if (err)
3200 atomic_inc(&multi->error); 3242 atomic_inc(&bbio->error);
3201 3243
3202 if (bio == multi->orig_bio) 3244 if (bio == bbio->orig_bio)
3203 is_orig_bio = 1; 3245 is_orig_bio = 1;
3204 3246
3205 if (atomic_dec_and_test(&multi->stripes_pending)) { 3247 if (atomic_dec_and_test(&bbio->stripes_pending)) {
3206 if (!is_orig_bio) { 3248 if (!is_orig_bio) {
3207 bio_put(bio); 3249 bio_put(bio);
3208 bio = multi->orig_bio; 3250 bio = bbio->orig_bio;
3209 } 3251 }
3210 bio->bi_private = multi->private; 3252 bio->bi_private = bbio->private;
3211 bio->bi_end_io = multi->end_io; 3253 bio->bi_end_io = bbio->end_io;
3254 bio->bi_bdev = (struct block_device *)
3255 (unsigned long)bbio->mirror_num;
3212 /* only send an error to the higher layers if it is 3256 /* only send an error to the higher layers if it is
3213 * beyond the tolerance of the multi-bio 3257 * beyond the tolerance of the multi-bio
3214 */ 3258 */
3215 if (atomic_read(&multi->error) > multi->max_errors) { 3259 if (atomic_read(&bbio->error) > bbio->max_errors) {
3216 err = -EIO; 3260 err = -EIO;
3217 } else if (err) { 3261 } else if (err) {
3218 /* 3262 /*
@@ -3222,7 +3266,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
3222 set_bit(BIO_UPTODATE, &bio->bi_flags); 3266 set_bit(BIO_UPTODATE, &bio->bi_flags);
3223 err = 0; 3267 err = 0;
3224 } 3268 }
3225 kfree(multi); 3269 kfree(bbio);
3226 3270
3227 bio_endio(bio, err); 3271 bio_endio(bio, err);
3228 } else if (!is_orig_bio) { 3272 } else if (!is_orig_bio) {
@@ -3302,20 +3346,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3302 u64 logical = (u64)bio->bi_sector << 9; 3346 u64 logical = (u64)bio->bi_sector << 9;
3303 u64 length = 0; 3347 u64 length = 0;
3304 u64 map_length; 3348 u64 map_length;
3305 struct btrfs_multi_bio *multi = NULL;
3306 int ret; 3349 int ret;
3307 int dev_nr = 0; 3350 int dev_nr = 0;
3308 int total_devs = 1; 3351 int total_devs = 1;
3352 struct btrfs_bio *bbio = NULL;
3309 3353
3310 length = bio->bi_size; 3354 length = bio->bi_size;
3311 map_tree = &root->fs_info->mapping_tree; 3355 map_tree = &root->fs_info->mapping_tree;
3312 map_length = length; 3356 map_length = length;
3313 3357
3314 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3358 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
3315 mirror_num); 3359 mirror_num);
3316 BUG_ON(ret); 3360 BUG_ON(ret);
3317 3361
3318 total_devs = multi->num_stripes; 3362 total_devs = bbio->num_stripes;
3319 if (map_length < length) { 3363 if (map_length < length) {
3320 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3364 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3321 "len %llu\n", (unsigned long long)logical, 3365 "len %llu\n", (unsigned long long)logical,
@@ -3323,25 +3367,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3323 (unsigned long long)map_length); 3367 (unsigned long long)map_length);
3324 BUG(); 3368 BUG();
3325 } 3369 }
3326 multi->end_io = first_bio->bi_end_io; 3370
3327 multi->private = first_bio->bi_private; 3371 bbio->orig_bio = first_bio;
3328 multi->orig_bio = first_bio; 3372 bbio->private = first_bio->bi_private;
3329 atomic_set(&multi->stripes_pending, multi->num_stripes); 3373 bbio->end_io = first_bio->bi_end_io;
3374 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
3330 3375
3331 while (dev_nr < total_devs) { 3376 while (dev_nr < total_devs) {
3332 if (total_devs > 1) { 3377 if (dev_nr < total_devs - 1) {
3333 if (dev_nr < total_devs - 1) { 3378 bio = bio_clone(first_bio, GFP_NOFS);
3334 bio = bio_clone(first_bio, GFP_NOFS); 3379 BUG_ON(!bio);
3335 BUG_ON(!bio); 3380 } else {
3336 } else { 3381 bio = first_bio;
3337 bio = first_bio;
3338 }
3339 bio->bi_private = multi;
3340 bio->bi_end_io = end_bio_multi_stripe;
3341 } 3382 }
3342 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3383 bio->bi_private = bbio;
3343 dev = multi->stripes[dev_nr].dev; 3384 bio->bi_end_io = btrfs_end_bio;
3385 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
3386 dev = bbio->stripes[dev_nr].dev;
3344 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3387 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3388 pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
3389 "(%s id %llu), size=%u\n", rw,
3390 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
3391 dev->name, dev->devid, bio->bi_size);
3345 bio->bi_bdev = dev->bdev; 3392 bio->bi_bdev = dev->bdev;
3346 if (async_submit) 3393 if (async_submit)
3347 schedule_bio(root, dev, rw, bio); 3394 schedule_bio(root, dev, rw, bio);
@@ -3354,8 +3401,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3354 } 3401 }
3355 dev_nr++; 3402 dev_nr++;
3356 } 3403 }
3357 if (total_devs == 1)
3358 kfree(multi);
3359 return 0; 3404 return 0;
3360} 3405}
3361 3406
@@ -3616,15 +3661,20 @@ static int read_one_dev(struct btrfs_root *root,
3616 fill_device_from_item(leaf, dev_item, device); 3661 fill_device_from_item(leaf, dev_item, device);
3617 device->dev_root = root->fs_info->dev_root; 3662 device->dev_root = root->fs_info->dev_root;
3618 device->in_fs_metadata = 1; 3663 device->in_fs_metadata = 1;
3619 if (device->writeable) 3664 if (device->writeable) {
3620 device->fs_devices->total_rw_bytes += device->total_bytes; 3665 device->fs_devices->total_rw_bytes += device->total_bytes;
3666 spin_lock(&root->fs_info->free_chunk_lock);
3667 root->fs_info->free_chunk_space += device->total_bytes -
3668 device->bytes_used;
3669 spin_unlock(&root->fs_info->free_chunk_lock);
3670 }
3621 ret = 0; 3671 ret = 0;
3622 return ret; 3672 return ret;
3623} 3673}
3624 3674
3625int btrfs_read_sys_array(struct btrfs_root *root) 3675int btrfs_read_sys_array(struct btrfs_root *root)
3626{ 3676{
3627 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3677 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3628 struct extent_buffer *sb; 3678 struct extent_buffer *sb;
3629 struct btrfs_disk_key *disk_key; 3679 struct btrfs_disk_key *disk_key;
3630 struct btrfs_chunk *chunk; 3680 struct btrfs_chunk *chunk;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db4e177..78f2d4d4f37f 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,20 @@ struct btrfs_device {
92 struct btrfs_work work; 92 struct btrfs_work work;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 struct work_struct rcu_work; 94 struct work_struct rcu_work;
95
96 /* readahead state */
97 spinlock_t reada_lock;
98 atomic_t reada_in_flight;
99 u64 reada_next;
100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents;
103
104 /* for sending down flush barriers */
105 struct bio *flush_bio;
106 struct completion flush_wait;
107 int nobarriers;
108
95}; 109};
96 110
97struct btrfs_fs_devices { 111struct btrfs_fs_devices {
@@ -136,7 +150,10 @@ struct btrfs_bio_stripe {
136 u64 length; /* only used for discard mappings */ 150 u64 length; /* only used for discard mappings */
137}; 151};
138 152
139struct btrfs_multi_bio { 153struct btrfs_bio;
154typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
155
156struct btrfs_bio {
140 atomic_t stripes_pending; 157 atomic_t stripes_pending;
141 bio_end_io_t *end_io; 158 bio_end_io_t *end_io;
142 struct bio *orig_bio; 159 struct bio *orig_bio;
@@ -144,6 +161,7 @@ struct btrfs_multi_bio {
144 atomic_t error; 161 atomic_t error;
145 int max_errors; 162 int max_errors;
146 int num_stripes; 163 int num_stripes;
164 int mirror_num;
147 struct btrfs_bio_stripe stripes[]; 165 struct btrfs_bio_stripe stripes[];
148}; 166};
149 167
@@ -171,7 +189,7 @@ struct map_lookup {
171int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 189int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
172 u64 end, u64 *length); 190 u64 end, u64 *length);
173 191
174#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 192#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
175 (sizeof(struct btrfs_bio_stripe) * (n))) 193 (sizeof(struct btrfs_bio_stripe) * (n)))
176 194
177int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 195int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -180,7 +198,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
180 u64 chunk_offset, u64 start, u64 num_bytes); 198 u64 chunk_offset, u64 start, u64 num_bytes);
181int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 199int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
182 u64 logical, u64 *length, 200 u64 logical, u64 *length,
183 struct btrfs_multi_bio **multi_ret, int mirror_num); 201 struct btrfs_bio **bbio_ret, int mirror_num);
184int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 202int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
185 u64 chunk_start, u64 physical, u64 devid, 203 u64 chunk_start, u64 physical, u64 devid,
186 u64 **logical, int *naddrs, int *stripe_len); 204 u64 **logical, int *naddrs, int *stripe_len);
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 426aa464f1af..3848b04e310e 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
127again: 127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), 128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size); 129 name, name_len, value, size);
130 /*
131 * If we're setting an xattr to a new value but the new value is say
132 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
133 * back from split_leaf. This is because it thinks we'll be extending
134 * the existing item size, but we're asking for enough space to add the
135 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
136 * the rest of the function figure it out.
137 */
138 if (ret == -EOVERFLOW)
139 ret = -EEXIST;
140
130 if (ret == -EEXIST) { 141 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE) 142 if (flags & XATTR_CREATE)
132 goto out; 143 goto out;