aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/backref.c776
-rw-r--r--fs/btrfs/backref.h62
-rw-r--r--fs/btrfs/disk-io.c4
-rw-r--r--fs/btrfs/extent-tree.c10
-rw-r--r--fs/btrfs/extent_io.c393
-rw-r--r--fs/btrfs/extent_io.h13
-rw-r--r--fs/btrfs/inode.c157
-rw-r--r--fs/btrfs/ioctl.c143
-rw-r--r--fs/btrfs/ioctl.h30
-rw-r--r--fs/btrfs/reada.c4
-rw-r--r--fs/btrfs/scrub.c476
-rw-r--r--fs/btrfs/volumes.c130
-rw-r--r--fs/btrfs/volumes.h10
14 files changed, 1930 insertions, 280 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index bdd6fb238ce1..c0ddfd29c5e5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,6 +8,6 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ 8 extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \ 9 export.o tree-log.o free-space-cache.o zlib.o lzo.o \
10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ 10 compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
11 reada.o 11 reada.o backref.o
12 12
13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o 13btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 000000000000..2351df0de450
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,776 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include "ctree.h"
20#include "disk-io.h"
21#include "backref.h"
22
23struct __data_ref {
24 struct list_head list;
25 u64 inum;
26 u64 root;
27 u64 extent_data_item_offset;
28};
29
30struct __shared_ref {
31 struct list_head list;
32 u64 disk_byte;
33};
34
35static int __inode_info(u64 inum, u64 ioff, u8 key_type,
36 struct btrfs_root *fs_root, struct btrfs_path *path,
37 struct btrfs_key *found_key)
38{
39 int ret;
40 struct btrfs_key key;
41 struct extent_buffer *eb;
42
43 key.type = key_type;
44 key.objectid = inum;
45 key.offset = ioff;
46
47 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
48 if (ret < 0)
49 return ret;
50
51 eb = path->nodes[0];
52 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
53 ret = btrfs_next_leaf(fs_root, path);
54 if (ret)
55 return ret;
56 eb = path->nodes[0];
57 }
58
59 btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
60 if (found_key->type != key.type || found_key->objectid != key.objectid)
61 return 1;
62
63 return 0;
64}
65
66/*
67 * this makes the path point to (inum INODE_ITEM ioff)
68 */
69int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
70 struct btrfs_path *path)
71{
72 struct btrfs_key key;
73 return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
74 &key);
75}
76
77static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
78 struct btrfs_path *path,
79 struct btrfs_key *found_key)
80{
81 return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
82 found_key);
83}
84
85/*
86 * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
87 * of the path are separated by '/' and the path is guaranteed to be
88 * 0-terminated. the path is only given within the current file system.
89 * Therefore, it never starts with a '/'. the caller is responsible to provide
90 * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
91 * the start point of the resulting string is returned. this pointer is within
92 * dest, normally.
93 * in case the path buffer would overflow, the pointer is decremented further
94 * as if output was written to the buffer, though no more output is actually
95 * generated. that way, the caller can determine how much space would be
96 * required for the path to fit into the buffer. in that case, the returned
97 * value will be smaller than dest. callers must check this!
98 */
99static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
100 struct btrfs_inode_ref *iref,
101 struct extent_buffer *eb_in, u64 parent,
102 char *dest, u32 size)
103{
104 u32 len;
105 int slot;
106 u64 next_inum;
107 int ret;
108 s64 bytes_left = size - 1;
109 struct extent_buffer *eb = eb_in;
110 struct btrfs_key found_key;
111
112 if (bytes_left >= 0)
113 dest[bytes_left] = '\0';
114
115 while (1) {
116 len = btrfs_inode_ref_name_len(eb, iref);
117 bytes_left -= len;
118 if (bytes_left >= 0)
119 read_extent_buffer(eb, dest + bytes_left,
120 (unsigned long)(iref + 1), len);
121 if (eb != eb_in)
122 free_extent_buffer(eb);
123 ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
124 if (ret)
125 break;
126 next_inum = found_key.offset;
127
128 /* regular exit ahead */
129 if (parent == next_inum)
130 break;
131
132 slot = path->slots[0];
133 eb = path->nodes[0];
134 /* make sure we can use eb after releasing the path */
135 if (eb != eb_in)
136 atomic_inc(&eb->refs);
137 btrfs_release_path(path);
138
139 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
140 parent = next_inum;
141 --bytes_left;
142 if (bytes_left >= 0)
143 dest[bytes_left] = '/';
144 }
145
146 btrfs_release_path(path);
147
148 if (ret)
149 return ERR_PTR(ret);
150
151 return dest + bytes_left;
152}
153
154/*
155 * this makes the path point to (logical EXTENT_ITEM *)
156 * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
157 * tree blocks and <0 on error.
158 */
159int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
160 struct btrfs_path *path, struct btrfs_key *found_key)
161{
162 int ret;
163 u64 flags;
164 u32 item_size;
165 struct extent_buffer *eb;
166 struct btrfs_extent_item *ei;
167 struct btrfs_key key;
168
169 key.type = BTRFS_EXTENT_ITEM_KEY;
170 key.objectid = logical;
171 key.offset = (u64)-1;
172
173 ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
174 if (ret < 0)
175 return ret;
176 ret = btrfs_previous_item(fs_info->extent_root, path,
177 0, BTRFS_EXTENT_ITEM_KEY);
178 if (ret < 0)
179 return ret;
180
181 btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
182 if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
183 found_key->objectid > logical ||
184 found_key->objectid + found_key->offset <= logical)
185 return -ENOENT;
186
187 eb = path->nodes[0];
188 item_size = btrfs_item_size_nr(eb, path->slots[0]);
189 BUG_ON(item_size < sizeof(*ei));
190
191 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
192 flags = btrfs_extent_flags(eb, ei);
193
194 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
195 return BTRFS_EXTENT_FLAG_TREE_BLOCK;
196 if (flags & BTRFS_EXTENT_FLAG_DATA)
197 return BTRFS_EXTENT_FLAG_DATA;
198
199 return -EIO;
200}
201
202/*
203 * helper function to iterate extent inline refs. ptr must point to a 0 value
204 * for the first call and may be modified. it is used to track state.
205 * if more refs exist, 0 is returned and the next call to
206 * __get_extent_inline_ref must pass the modified ptr parameter to get the
207 * next ref. after the last ref was processed, 1 is returned.
208 * returns <0 on error
209 */
210static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
211 struct btrfs_extent_item *ei, u32 item_size,
212 struct btrfs_extent_inline_ref **out_eiref,
213 int *out_type)
214{
215 unsigned long end;
216 u64 flags;
217 struct btrfs_tree_block_info *info;
218
219 if (!*ptr) {
220 /* first call */
221 flags = btrfs_extent_flags(eb, ei);
222 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
223 info = (struct btrfs_tree_block_info *)(ei + 1);
224 *out_eiref =
225 (struct btrfs_extent_inline_ref *)(info + 1);
226 } else {
227 *out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
228 }
229 *ptr = (unsigned long)*out_eiref;
230 if ((void *)*ptr >= (void *)ei + item_size)
231 return -ENOENT;
232 }
233
234 end = (unsigned long)ei + item_size;
235 *out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
236 *out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
237
238 *ptr += btrfs_extent_inline_ref_size(*out_type);
239 WARN_ON(*ptr > end);
240 if (*ptr == end)
241 return 1; /* last */
242
243 return 0;
244}
245
246/*
247 * reads the tree block backref for an extent. tree level and root are returned
248 * through out_level and out_root. ptr must point to a 0 value for the first
249 * call and may be modified (see __get_extent_inline_ref comment).
250 * returns 0 if data was provided, 1 if there was no more data to provide or
251 * <0 on error.
252 */
253int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
254 struct btrfs_extent_item *ei, u32 item_size,
255 u64 *out_root, u8 *out_level)
256{
257 int ret;
258 int type;
259 struct btrfs_tree_block_info *info;
260 struct btrfs_extent_inline_ref *eiref;
261
262 if (*ptr == (unsigned long)-1)
263 return 1;
264
265 while (1) {
266 ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
267 &eiref, &type);
268 if (ret < 0)
269 return ret;
270
271 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
272 type == BTRFS_SHARED_BLOCK_REF_KEY)
273 break;
274
275 if (ret == 1)
276 return 1;
277 }
278
279 /* we can treat both ref types equally here */
280 info = (struct btrfs_tree_block_info *)(ei + 1);
281 *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
282 *out_level = btrfs_tree_block_level(eb, info);
283
284 if (ret == 1)
285 *ptr = (unsigned long)-1;
286
287 return 0;
288}
289
290static int __data_list_add(struct list_head *head, u64 inum,
291 u64 extent_data_item_offset, u64 root)
292{
293 struct __data_ref *ref;
294
295 ref = kmalloc(sizeof(*ref), GFP_NOFS);
296 if (!ref)
297 return -ENOMEM;
298
299 ref->inum = inum;
300 ref->extent_data_item_offset = extent_data_item_offset;
301 ref->root = root;
302 list_add_tail(&ref->list, head);
303
304 return 0;
305}
306
307static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
308 struct btrfs_extent_data_ref *dref)
309{
310 return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
311 btrfs_extent_data_ref_offset(eb, dref),
312 btrfs_extent_data_ref_root(eb, dref));
313}
314
315static int __shared_list_add(struct list_head *head, u64 disk_byte)
316{
317 struct __shared_ref *ref;
318
319 ref = kmalloc(sizeof(*ref), GFP_NOFS);
320 if (!ref)
321 return -ENOMEM;
322
323 ref->disk_byte = disk_byte;
324 list_add_tail(&ref->list, head);
325
326 return 0;
327}
328
329static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
330 u64 logical, u64 inum,
331 u64 extent_data_item_offset,
332 u64 extent_offset,
333 struct btrfs_path *path,
334 struct list_head *data_refs,
335 iterate_extent_inodes_t *iterate,
336 void *ctx)
337{
338 u64 ref_root;
339 u32 item_size;
340 struct btrfs_key key;
341 struct extent_buffer *eb;
342 struct btrfs_extent_item *ei;
343 struct btrfs_extent_inline_ref *eiref;
344 struct __data_ref *ref;
345 int ret;
346 int type;
347 int last;
348 unsigned long ptr = 0;
349
350 WARN_ON(!list_empty(data_refs));
351 ret = extent_from_logical(fs_info, logical, path, &key);
352 if (ret & BTRFS_EXTENT_FLAG_DATA)
353 ret = -EIO;
354 if (ret < 0)
355 goto out;
356
357 eb = path->nodes[0];
358 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
359 item_size = btrfs_item_size_nr(eb, path->slots[0]);
360
361 ret = 0;
362 ref_root = 0;
363 /*
364 * as done in iterate_extent_inodes, we first build a list of refs to
365 * iterate, then free the path and then iterate them to avoid deadlocks.
366 */
367 do {
368 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
369 &eiref, &type);
370 if (last < 0) {
371 ret = last;
372 goto out;
373 }
374 if (type == BTRFS_TREE_BLOCK_REF_KEY ||
375 type == BTRFS_SHARED_BLOCK_REF_KEY) {
376 ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
377 ret = __data_list_add(data_refs, inum,
378 extent_data_item_offset,
379 ref_root);
380 }
381 } while (!ret && !last);
382
383 btrfs_release_path(path);
384
385 if (ref_root == 0) {
386 printk(KERN_ERR "btrfs: failed to find tree block ref "
387 "for shared data backref %llu\n", logical);
388 WARN_ON(1);
389 ret = -EIO;
390 }
391
392out:
393 while (!list_empty(data_refs)) {
394 ref = list_first_entry(data_refs, struct __data_ref, list);
395 list_del(&ref->list);
396 if (!ret)
397 ret = iterate(ref->inum, extent_offset +
398 ref->extent_data_item_offset,
399 ref->root, ctx);
400 kfree(ref);
401 }
402
403 return ret;
404}
405
406static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
407 u64 logical, u64 orig_extent_item_objectid,
408 u64 extent_offset, struct btrfs_path *path,
409 struct list_head *data_refs,
410 iterate_extent_inodes_t *iterate,
411 void *ctx)
412{
413 u64 disk_byte;
414 struct btrfs_key key;
415 struct btrfs_file_extent_item *fi;
416 struct extent_buffer *eb;
417 int slot;
418 int nritems;
419 int ret;
420 int found = 0;
421
422 eb = read_tree_block(fs_info->tree_root, logical,
423 fs_info->tree_root->leafsize, 0);
424 if (!eb)
425 return -EIO;
426
427 /*
428 * from the shared data ref, we only have the leaf but we need
429 * the key. thus, we must look into all items and see that we
430 * find one (some) with a reference to our extent item.
431 */
432 nritems = btrfs_header_nritems(eb);
433 for (slot = 0; slot < nritems; ++slot) {
434 btrfs_item_key_to_cpu(eb, &key, slot);
435 if (key.type != BTRFS_EXTENT_DATA_KEY)
436 continue;
437 fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
438 if (!fi) {
439 free_extent_buffer(eb);
440 return -EIO;
441 }
442 disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
443 if (disk_byte != orig_extent_item_objectid) {
444 if (found)
445 break;
446 else
447 continue;
448 }
449 ++found;
450 ret = __iter_shared_inline_ref_inodes(fs_info, logical,
451 key.objectid,
452 key.offset,
453 extent_offset, path,
454 data_refs,
455 iterate, ctx);
456 if (ret)
457 break;
458 }
459
460 if (!found) {
461 printk(KERN_ERR "btrfs: failed to follow shared data backref "
462 "to parent %llu\n", logical);
463 WARN_ON(1);
464 ret = -EIO;
465 }
466
467 free_extent_buffer(eb);
468 return ret;
469}
470
471/*
472 * calls iterate() for every inode that references the extent identified by
473 * the given parameters. will use the path given as a parameter and return it
474 * released.
475 * when the iterator function returns a non-zero value, iteration stops.
476 */
477int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
478 struct btrfs_path *path,
479 u64 extent_item_objectid,
480 u64 extent_offset,
481 iterate_extent_inodes_t *iterate, void *ctx)
482{
483 unsigned long ptr = 0;
484 int last;
485 int ret;
486 int type;
487 u64 logical;
488 u32 item_size;
489 struct btrfs_extent_inline_ref *eiref;
490 struct btrfs_extent_data_ref *dref;
491 struct extent_buffer *eb;
492 struct btrfs_extent_item *ei;
493 struct btrfs_key key;
494 struct list_head data_refs = LIST_HEAD_INIT(data_refs);
495 struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
496 struct __data_ref *ref_d;
497 struct __shared_ref *ref_s;
498
499 eb = path->nodes[0];
500 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
501 item_size = btrfs_item_size_nr(eb, path->slots[0]);
502
503 /* first we iterate the inline refs, ... */
504 do {
505 last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
506 &eiref, &type);
507 if (last == -ENOENT) {
508 ret = 0;
509 break;
510 }
511 if (last < 0) {
512 ret = last;
513 break;
514 }
515
516 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
517 dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
518 ret = __data_list_add_eb(&data_refs, eb, dref);
519 } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
520 logical = btrfs_extent_inline_ref_offset(eb, eiref);
521 ret = __shared_list_add(&shared_refs, logical);
522 }
523 } while (!ret && !last);
524
525 /* ... then we proceed to in-tree references and ... */
526 while (!ret) {
527 ++path->slots[0];
528 if (path->slots[0] > btrfs_header_nritems(eb)) {
529 ret = btrfs_next_leaf(fs_info->extent_root, path);
530 if (ret) {
531 if (ret == 1)
532 ret = 0; /* we're done */
533 break;
534 }
535 eb = path->nodes[0];
536 }
537 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
538 if (key.objectid != extent_item_objectid)
539 break;
540 if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
541 dref = btrfs_item_ptr(eb, path->slots[0],
542 struct btrfs_extent_data_ref);
543 ret = __data_list_add_eb(&data_refs, eb, dref);
544 } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
545 ret = __shared_list_add(&shared_refs, key.offset);
546 }
547 }
548
549 btrfs_release_path(path);
550
551 /*
552 * ... only at the very end we can process the refs we found. this is
553 * because the iterator function we call is allowed to make tree lookups
554 * and we have to avoid deadlocks. additionally, we need more tree
555 * lookups ourselves for shared data refs.
556 */
557 while (!list_empty(&data_refs)) {
558 ref_d = list_first_entry(&data_refs, struct __data_ref, list);
559 list_del(&ref_d->list);
560 if (!ret)
561 ret = iterate(ref_d->inum, extent_offset +
562 ref_d->extent_data_item_offset,
563 ref_d->root, ctx);
564 kfree(ref_d);
565 }
566
567 while (!list_empty(&shared_refs)) {
568 ref_s = list_first_entry(&shared_refs, struct __shared_ref,
569 list);
570 list_del(&ref_s->list);
571 if (!ret)
572 ret = __iter_shared_inline_ref(fs_info,
573 ref_s->disk_byte,
574 extent_item_objectid,
575 extent_offset, path,
576 &data_refs,
577 iterate, ctx);
578 kfree(ref_s);
579 }
580
581 return ret;
582}
583
584int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
585 struct btrfs_path *path,
586 iterate_extent_inodes_t *iterate, void *ctx)
587{
588 int ret;
589 u64 offset;
590 struct btrfs_key found_key;
591
592 ret = extent_from_logical(fs_info, logical, path,
593 &found_key);
594 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
595 ret = -EINVAL;
596 if (ret < 0)
597 return ret;
598
599 offset = logical - found_key.objectid;
600 ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
601 offset, iterate, ctx);
602
603 return ret;
604}
605
606static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
607 struct btrfs_path *path,
608 iterate_irefs_t *iterate, void *ctx)
609{
610 int ret;
611 int slot;
612 u32 cur;
613 u32 len;
614 u32 name_len;
615 u64 parent = 0;
616 int found = 0;
617 struct extent_buffer *eb;
618 struct btrfs_item *item;
619 struct btrfs_inode_ref *iref;
620 struct btrfs_key found_key;
621
622 while (1) {
623 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
624 &found_key);
625 if (ret < 0)
626 break;
627 if (ret) {
628 ret = found ? 0 : -ENOENT;
629 break;
630 }
631 ++found;
632
633 parent = found_key.offset;
634 slot = path->slots[0];
635 eb = path->nodes[0];
636 /* make sure we can use eb after releasing the path */
637 atomic_inc(&eb->refs);
638 btrfs_release_path(path);
639
640 item = btrfs_item_nr(eb, slot);
641 iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
642
643 for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
644 name_len = btrfs_inode_ref_name_len(eb, iref);
645 /* path must be released before calling iterate()! */
646 ret = iterate(parent, iref, eb, ctx);
647 if (ret) {
648 free_extent_buffer(eb);
649 break;
650 }
651 len = sizeof(*iref) + name_len;
652 iref = (struct btrfs_inode_ref *)((char *)iref + len);
653 }
654 free_extent_buffer(eb);
655 }
656
657 btrfs_release_path(path);
658
659 return ret;
660}
661
662/*
663 * returns 0 if the path could be dumped (probably truncated)
664 * returns <0 in case of an error
665 */
666static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
667 struct extent_buffer *eb, void *ctx)
668{
669 struct inode_fs_paths *ipath = ctx;
670 char *fspath;
671 char *fspath_min;
672 int i = ipath->fspath->elem_cnt;
673 const int s_ptr = sizeof(char *);
674 u32 bytes_left;
675
676 bytes_left = ipath->fspath->bytes_left > s_ptr ?
677 ipath->fspath->bytes_left - s_ptr : 0;
678
679 fspath_min = (char *)ipath->fspath->str + (i + 1) * s_ptr;
680 fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
681 inum, fspath_min, bytes_left);
682 if (IS_ERR(fspath))
683 return PTR_ERR(fspath);
684
685 if (fspath > fspath_min) {
686 ipath->fspath->str[i] = fspath;
687 ++ipath->fspath->elem_cnt;
688 ipath->fspath->bytes_left = fspath - fspath_min;
689 } else {
690 ++ipath->fspath->elem_missed;
691 ipath->fspath->bytes_missing += fspath_min - fspath;
692 ipath->fspath->bytes_left = 0;
693 }
694
695 return 0;
696}
697
698/*
699 * this dumps all file system paths to the inode into the ipath struct, provided
700 * is has been created large enough. each path is zero-terminated and accessed
701 * from ipath->fspath->str[i].
702 * when it returns, there are ipath->fspath->elem_cnt number of paths available
703 * in ipath->fspath->str[]. when the allocated space wasn't sufficient, the
704 * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
705 * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
706 * have been needed to return all paths.
707 */
708int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
709{
710 return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
711 inode_to_path, ipath);
712}
713
714/*
715 * allocates space to return multiple file system paths for an inode.
716 * total_bytes to allocate are passed, note that space usable for actual path
717 * information will be total_bytes - sizeof(struct inode_fs_paths).
718 * the returned pointer must be freed with free_ipath() in the end.
719 */
720struct btrfs_data_container *init_data_container(u32 total_bytes)
721{
722 struct btrfs_data_container *data;
723 size_t alloc_bytes;
724
725 alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
726 data = kmalloc(alloc_bytes, GFP_NOFS);
727 if (!data)
728 return ERR_PTR(-ENOMEM);
729
730 if (total_bytes >= sizeof(*data)) {
731 data->bytes_left = total_bytes - sizeof(*data);
732 data->bytes_missing = 0;
733 } else {
734 data->bytes_missing = sizeof(*data) - total_bytes;
735 data->bytes_left = 0;
736 }
737
738 data->elem_cnt = 0;
739 data->elem_missed = 0;
740
741 return data;
742}
743
744/*
745 * allocates space to return multiple file system paths for an inode.
746 * total_bytes to allocate are passed, note that space usable for actual path
747 * information will be total_bytes - sizeof(struct inode_fs_paths).
748 * the returned pointer must be freed with free_ipath() in the end.
749 */
750struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
751 struct btrfs_path *path)
752{
753 struct inode_fs_paths *ifp;
754 struct btrfs_data_container *fspath;
755
756 fspath = init_data_container(total_bytes);
757 if (IS_ERR(fspath))
758 return (void *)fspath;
759
760 ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
761 if (!ifp) {
762 kfree(fspath);
763 return ERR_PTR(-ENOMEM);
764 }
765
766 ifp->btrfs_path = path;
767 ifp->fspath = fspath;
768 ifp->fs_root = fs_root;
769
770 return ifp;
771}
772
773void free_ipath(struct inode_fs_paths *ipath)
774{
775 kfree(ipath);
776}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 000000000000..92618837cb8f
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,62 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#ifndef __BTRFS_BACKREF__
20#define __BTRFS_BACKREF__
21
22#include "ioctl.h"
23
/*
 * state needed to resolve an inode number to its file system paths.
 */
struct inode_fs_paths {
	struct btrfs_path		*btrfs_path;	/* path used for tree lookups */
	struct btrfs_root		*fs_root;	/* root the lookups are done in */
	struct btrfs_data_container	*fspath;	/* receives the paths found */
};
29
30typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
31 void *ctx);
32typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
33 struct extent_buffer *eb, void *ctx);
34
35int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36 struct btrfs_path *path);
37
38int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39 struct btrfs_path *path, struct btrfs_key *found_key);
40
41int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
42 struct btrfs_extent_item *ei, u32 item_size,
43 u64 *out_root, u8 *out_level);
44
45int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
46 struct btrfs_path *path,
47 u64 extent_item_objectid,
48 u64 extent_offset,
49 iterate_extent_inodes_t *iterate, void *ctx);
50
51int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
52 struct btrfs_path *path,
53 iterate_extent_inodes_t *iterate, void *ctx);
54
55int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56
57struct btrfs_data_container *init_data_container(u32 total_bytes);
58struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
59 struct btrfs_path *path);
60void free_ipath(struct inode_fs_paths *ipath);
61
62#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index cedfbfb278eb..0eb1f0951251 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -620,7 +620,7 @@ out:
620 620
621static int btree_io_failed_hook(struct bio *failed_bio, 621static int btree_io_failed_hook(struct bio *failed_bio,
622 struct page *page, u64 start, u64 end, 622 struct page *page, u64 start, u64 end,
623 struct extent_state *state) 623 u64 mirror_num, struct extent_state *state)
624{ 624{
625 struct extent_io_tree *tree; 625 struct extent_io_tree *tree;
626 unsigned long len; 626 unsigned long len;
@@ -944,7 +944,7 @@ static int btree_readpage(struct file *file, struct page *page)
944{ 944{
945 struct extent_io_tree *tree; 945 struct extent_io_tree *tree;
946 tree = &BTRFS_I(page->mapping->host)->io_tree; 946 tree = &BTRFS_I(page->mapping->host)->io_tree;
947 return extent_read_full_page(tree, page, btree_get_extent); 947 return extent_read_full_page(tree, page, btree_get_extent, 0);
948} 948}
949 949
950static int btree_releasepage(struct page *page, gfp_t gfp_flags) 950static int btree_releasepage(struct page *page, gfp_t gfp_flags)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 23e936c3de76..18ea90c8943b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1788,18 +1788,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1788{ 1788{
1789 int ret; 1789 int ret;
1790 u64 discarded_bytes = 0; 1790 u64 discarded_bytes = 0;
1791 struct btrfs_multi_bio *multi = NULL; 1791 struct btrfs_bio *bbio = NULL;
1792 1792
1793 1793
1794 /* Tell the block device(s) that the sectors can be discarded */ 1794 /* Tell the block device(s) that the sectors can be discarded */
1795 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD, 1795 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1796 bytenr, &num_bytes, &multi, 0); 1796 bytenr, &num_bytes, &bbio, 0);
1797 if (!ret) { 1797 if (!ret) {
1798 struct btrfs_bio_stripe *stripe = multi->stripes; 1798 struct btrfs_bio_stripe *stripe = bbio->stripes;
1799 int i; 1799 int i;
1800 1800
1801 1801
1802 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1802 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1803 if (!stripe->dev->can_discard) 1803 if (!stripe->dev->can_discard)
1804 continue; 1804 continue;
1805 1805
@@ -1818,7 +1818,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1818 */ 1818 */
1819 ret = 0; 1819 ret = 0;
1820 } 1820 }
1821 kfree(multi); 1821 kfree(bbio);
1822 } 1822 }
1823 1823
1824 if (actual_bytes) 1824 if (actual_bytes)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c12705682c65..1f87c4d0e7a0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -1787,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1787 return 0; 1788 return 0;
1788} 1789}
1789 1790
1791/*
1792 * When IO fails, either with EIO or csum verification fails, we
1793 * try other mirrors that might have a good copy of the data. This
1794 * io_failure_record is used to record state as we go through all the
1795 * mirrors. If another mirror has good data, the page is set up to date
1796 * and things continue. If a good mirror can't be found, the original
1797 * bio end_io callback is called to indicate things have failed.
1798 */
1799struct io_failure_record {
1800 struct page *page;
1801 u64 start;
1802 u64 len;
1803 u64 logical;
1804 unsigned long bio_flags;
1805 int this_mirror;
1806 int failed_mirror;
1807 int in_validation;
1808};
1809
1810static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1811 int did_repair)
1812{
1813 int ret;
1814 int err = 0;
1815 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1816
1817 set_state_private(failure_tree, rec->start, 0);
1818 ret = clear_extent_bits(failure_tree, rec->start,
1819 rec->start + rec->len - 1,
1820 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1821 if (ret)
1822 err = ret;
1823
1824 if (did_repair) {
1825 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1826 rec->start + rec->len - 1,
1827 EXTENT_DAMAGED, GFP_NOFS);
1828 if (ret && !err)
1829 err = ret;
1830 }
1831
1832 kfree(rec);
1833 return err;
1834}
1835
1836static void repair_io_failure_callback(struct bio *bio, int err)
1837{
1838 complete(bio->bi_private);
1839}
1840
1841/*
1842 * this bypasses the standard btrfs submit functions deliberately, as
1843 * the standard behavior is to write all copies in a raid setup. here we only
1844 * want to write the one bad copy. so we do the mapping for ourselves and issue
1845 * submit_bio directly.
1846 * to avoid any synchronization issues, wait for the data after writing, which
1847 * actually prevents the read that triggered the error from finishing.
1848 * currently, there can be no more than two copies of every data bit. thus,
1849 * exactly one rewrite is required.
1850 */
1851int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1852 u64 length, u64 logical, struct page *page,
1853 int mirror_num)
1854{
1855 struct bio *bio;
1856 struct btrfs_device *dev;
1857 DECLARE_COMPLETION_ONSTACK(compl);
1858 u64 map_length = 0;
1859 u64 sector;
1860 struct btrfs_bio *bbio = NULL;
1861 int ret;
1862
1863 BUG_ON(!mirror_num);
1864
1865 bio = bio_alloc(GFP_NOFS, 1);
1866 if (!bio)
1867 return -EIO;
1868 bio->bi_private = &compl;
1869 bio->bi_end_io = repair_io_failure_callback;
1870 bio->bi_size = 0;
1871 map_length = length;
1872
1873 ret = btrfs_map_block(map_tree, WRITE, logical,
1874 &map_length, &bbio, mirror_num);
1875 if (ret) {
1876 bio_put(bio);
1877 return -EIO;
1878 }
1879 BUG_ON(mirror_num != bbio->mirror_num);
1880 sector = bbio->stripes[mirror_num-1].physical >> 9;
1881 bio->bi_sector = sector;
1882 dev = bbio->stripes[mirror_num-1].dev;
1883 kfree(bbio);
1884 if (!dev || !dev->bdev || !dev->writeable) {
1885 bio_put(bio);
1886 return -EIO;
1887 }
1888 bio->bi_bdev = dev->bdev;
1889 bio_add_page(bio, page, length, start-page_offset(page));
1890 submit_bio(WRITE_SYNC, bio);
1891 wait_for_completion(&compl);
1892
1893 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1894 /* try to remap that extent elsewhere? */
1895 bio_put(bio);
1896 return -EIO;
1897 }
1898
1899 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1900 "sector %llu)\n", page->mapping->host->i_ino, start,
1901 dev->name, sector);
1902
1903 bio_put(bio);
1904 return 0;
1905}
1906
1907/*
1908 * each time an IO finishes, we do a fast check in the IO failure tree
1909 * to see if we need to process or clean up an io_failure_record
1910 */
1911static int clean_io_failure(u64 start, struct page *page)
1912{
1913 u64 private;
1914 u64 private_failure;
1915 struct io_failure_record *failrec;
1916 struct btrfs_mapping_tree *map_tree;
1917 struct extent_state *state;
1918 int num_copies;
1919 int did_repair = 0;
1920 int ret;
1921 struct inode *inode = page->mapping->host;
1922
1923 private = 0;
1924 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1925 (u64)-1, 1, EXTENT_DIRTY, 0);
1926 if (!ret)
1927 return 0;
1928
1929 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1930 &private_failure);
1931 if (ret)
1932 return 0;
1933
1934 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1935 BUG_ON(!failrec->this_mirror);
1936
1937 if (failrec->in_validation) {
1938 /* there was no real error, just free the record */
1939 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1940 failrec->start);
1941 did_repair = 1;
1942 goto out;
1943 }
1944
1945 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1946 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1947 failrec->start,
1948 EXTENT_LOCKED);
1949 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1950
1951 if (state && state->start == failrec->start) {
1952 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1953 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1954 failrec->len);
1955 if (num_copies > 1) {
1956 ret = repair_io_failure(map_tree, start, failrec->len,
1957 failrec->logical, page,
1958 failrec->failed_mirror);
1959 did_repair = !ret;
1960 }
1961 }
1962
1963out:
1964 if (!ret)
1965 ret = free_io_failure(inode, failrec, did_repair);
1966
1967 return ret;
1968}
1969
1970/*
1971 * this is a generic handler for readpage errors (default
1972 * readpage_io_failed_hook). if other copies exist, read those and write back
1973 * good data to the failed position. does not look into remapping the
1974 * failed extent elsewhere, hoping the device will be smart enough to do this as
1975 * needed
1976 */
1977
1978static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1979 u64 start, u64 end, int failed_mirror,
1980 struct extent_state *state)
1981{
1982 struct io_failure_record *failrec = NULL;
1983 u64 private;
1984 struct extent_map *em;
1985 struct inode *inode = page->mapping->host;
1986 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1987 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1988 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1989 struct bio *bio;
1990 int num_copies;
1991 int ret;
1992 int read_mode;
1993 u64 logical;
1994
1995 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
1996
1997 ret = get_state_private(failure_tree, start, &private);
1998 if (ret) {
1999 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2000 if (!failrec)
2001 return -ENOMEM;
2002 failrec->start = start;
2003 failrec->len = end - start + 1;
2004 failrec->this_mirror = 0;
2005 failrec->bio_flags = 0;
2006 failrec->in_validation = 0;
2007
2008 read_lock(&em_tree->lock);
2009 em = lookup_extent_mapping(em_tree, start, failrec->len);
2010 if (!em) {
2011 read_unlock(&em_tree->lock);
2012 kfree(failrec);
2013 return -EIO;
2014 }
2015
2016 if (em->start > start || em->start + em->len < start) {
2017 free_extent_map(em);
2018 em = NULL;
2019 }
2020 read_unlock(&em_tree->lock);
2021
2022 if (!em || IS_ERR(em)) {
2023 kfree(failrec);
2024 return -EIO;
2025 }
2026 logical = start - em->start;
2027 logical = em->block_start + logical;
2028 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2029 logical = em->block_start;
2030 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2031 extent_set_compress_type(&failrec->bio_flags,
2032 em->compress_type);
2033 }
2034 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2035 "len=%llu\n", logical, start, failrec->len);
2036 failrec->logical = logical;
2037 free_extent_map(em);
2038
2039 /* set the bits in the private failure tree */
2040 ret = set_extent_bits(failure_tree, start, end,
2041 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2042 if (ret >= 0)
2043 ret = set_state_private(failure_tree, start,
2044 (u64)(unsigned long)failrec);
2045 /* set the bits in the inode's tree */
2046 if (ret >= 0)
2047 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2048 GFP_NOFS);
2049 if (ret < 0) {
2050 kfree(failrec);
2051 return ret;
2052 }
2053 } else {
2054 failrec = (struct io_failure_record *)(unsigned long)private;
2055 pr_debug("bio_readpage_error: (found) logical=%llu, "
2056 "start=%llu, len=%llu, validation=%d\n",
2057 failrec->logical, failrec->start, failrec->len,
2058 failrec->in_validation);
2059 /*
2060 * when data can be on disk more than twice, add to failrec here
2061 * (e.g. with a list for failed_mirror) to make
2062 * clean_io_failure() clean all those errors at once.
2063 */
2064 }
2065 num_copies = btrfs_num_copies(
2066 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2067 failrec->logical, failrec->len);
2068 if (num_copies == 1) {
2069 /*
2070 * we only have a single copy of the data, so don't bother with
2071 * all the retry and error correction code that follows. no
2072 * matter what the error is, it is very likely to persist.
2073 */
2074 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2075 "state=%p, num_copies=%d, next_mirror %d, "
2076 "failed_mirror %d\n", state, num_copies,
2077 failrec->this_mirror, failed_mirror);
2078 free_io_failure(inode, failrec, 0);
2079 return -EIO;
2080 }
2081
2082 if (!state) {
2083 spin_lock(&tree->lock);
2084 state = find_first_extent_bit_state(tree, failrec->start,
2085 EXTENT_LOCKED);
2086 if (state && state->start != failrec->start)
2087 state = NULL;
2088 spin_unlock(&tree->lock);
2089 }
2090
2091 /*
2092 * there are two premises:
2093 * a) deliver good data to the caller
2094 * b) correct the bad sectors on disk
2095 */
2096 if (failed_bio->bi_vcnt > 1) {
2097 /*
2098 * to fulfill b), we need to know the exact failing sectors, as
2099 * we don't want to rewrite any more than the failed ones. thus,
2100 * we need separate read requests for the failed bio
2101 *
2102 * if the following BUG_ON triggers, our validation request got
2103 * merged. we need separate requests for our algorithm to work.
2104 */
2105 BUG_ON(failrec->in_validation);
2106 failrec->in_validation = 1;
2107 failrec->this_mirror = failed_mirror;
2108 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2109 } else {
2110 /*
2111 * we're ready to fulfill a) and b) alongside. get a good copy
2112 * of the failed sector and if we succeed, we have set up
2113 * everything for repair_io_failure to do the rest for us.
2114 */
2115 if (failrec->in_validation) {
2116 BUG_ON(failrec->this_mirror != failed_mirror);
2117 failrec->in_validation = 0;
2118 failrec->this_mirror = 0;
2119 }
2120 failrec->failed_mirror = failed_mirror;
2121 failrec->this_mirror++;
2122 if (failrec->this_mirror == failed_mirror)
2123 failrec->this_mirror++;
2124 read_mode = READ_SYNC;
2125 }
2126
2127 if (!state || failrec->this_mirror > num_copies) {
2128 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2129 "next_mirror %d, failed_mirror %d\n", state,
2130 num_copies, failrec->this_mirror, failed_mirror);
2131 free_io_failure(inode, failrec, 0);
2132 return -EIO;
2133 }
2134
2135 bio = bio_alloc(GFP_NOFS, 1);
2136 bio->bi_private = state;
2137 bio->bi_end_io = failed_bio->bi_end_io;
2138 bio->bi_sector = failrec->logical >> 9;
2139 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2140 bio->bi_size = 0;
2141
2142 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2143
2144 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2145 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2146 failrec->this_mirror, num_copies, failrec->in_validation);
2147
2148 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2149 failrec->bio_flags, 0);
2150 return 0;
2151}
2152
1790/* lots and lots of room for performance fixes in the end_bio funcs */ 2153/* lots and lots of room for performance fixes in the end_bio funcs */
1791 2154
1792/* 2155/*
@@ -1885,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1885 struct extent_state *cached = NULL; 2248 struct extent_state *cached = NULL;
1886 struct extent_state *state; 2249 struct extent_state *state;
1887 2250
2251 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2252 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2253 (long int)bio->bi_bdev);
1888 tree = &BTRFS_I(page->mapping->host)->io_tree; 2254 tree = &BTRFS_I(page->mapping->host)->io_tree;
1889 2255
1890 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2256 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1915,11 +2281,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1915 state); 2281 state);
1916 if (ret) 2282 if (ret)
1917 uptodate = 0; 2283 uptodate = 0;
2284 else
2285 clean_io_failure(start, page);
1918 } 2286 }
1919 if (!uptodate && tree->ops && 2287 if (!uptodate) {
1920 tree->ops->readpage_io_failed_hook) { 2288 u64 failed_mirror;
1921 ret = tree->ops->readpage_io_failed_hook(bio, page, 2289 failed_mirror = (u64)bio->bi_bdev;
1922 start, end, state); 2290 if (tree->ops && tree->ops->readpage_io_failed_hook)
2291 ret = tree->ops->readpage_io_failed_hook(
2292 bio, page, start, end,
2293 failed_mirror, state);
2294 else
2295 ret = bio_readpage_error(bio, page, start, end,
2296 failed_mirror, NULL);
1923 if (ret == 0) { 2297 if (ret == 0) {
1924 uptodate = 2298 uptodate =
1925 test_bit(BIO_UPTODATE, &bio->bi_flags); 2299 test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1999,6 +2373,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1999 mirror_num, bio_flags, start); 2373 mirror_num, bio_flags, start);
2000 else 2374 else
2001 submit_bio(rw, bio); 2375 submit_bio(rw, bio);
2376
2002 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2377 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2003 ret = -EOPNOTSUPP; 2378 ret = -EOPNOTSUPP;
2004 bio_put(bio); 2379 bio_put(bio);
@@ -2264,16 +2639,16 @@ out:
2264} 2639}
2265 2640
2266int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2641int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2267 get_extent_t *get_extent) 2642 get_extent_t *get_extent, int mirror_num)
2268{ 2643{
2269 struct bio *bio = NULL; 2644 struct bio *bio = NULL;
2270 unsigned long bio_flags = 0; 2645 unsigned long bio_flags = 0;
2271 int ret; 2646 int ret;
2272 2647
2273 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2648 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2274 &bio_flags); 2649 &bio_flags);
2275 if (bio) 2650 if (bio)
2276 ret = submit_one_bio(READ, bio, 0, bio_flags); 2651 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2277 return ret; 2652 return ret;
2278} 2653}
2279 2654
@@ -3127,7 +3502,7 @@ out:
3127 return ret; 3502 return ret;
3128} 3503}
3129 3504
3130static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3505inline struct page *extent_buffer_page(struct extent_buffer *eb,
3131 unsigned long i) 3506 unsigned long i)
3132{ 3507{
3133 struct page *p; 3508 struct page *p;
@@ -3152,7 +3527,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
3152 return p; 3527 return p;
3153} 3528}
3154 3529
3155static inline unsigned long num_extent_pages(u64 start, u64 len) 3530inline unsigned long num_extent_pages(u64 start, u64 len)
3156{ 3531{
3157 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3532 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3158 (start >> PAGE_CACHE_SHIFT); 3533 (start >> PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 697570eed9e8..feb9be0e23bc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -18,6 +18,7 @@
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_NEED_WAIT (1 << 13) 20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14)
21#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
22#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
23 24
@@ -69,7 +70,7 @@ struct extent_io_ops {
69 unsigned long bio_flags); 70 unsigned long bio_flags);
70 int (*readpage_io_hook)(struct page *page, u64 start, u64 end); 71 int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
71 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, 72 int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
72 u64 start, u64 end, 73 u64 start, u64 end, u64 failed_mirror,
73 struct extent_state *state); 74 struct extent_state *state);
74 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, 75 int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
75 u64 start, u64 end, 76 u64 start, u64 end,
@@ -188,7 +189,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
188int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, 189int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
189 gfp_t mask); 190 gfp_t mask);
190int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 191int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
191 get_extent_t *get_extent); 192 get_extent_t *get_extent, int mirror_num);
192int __init extent_io_init(void); 193int __init extent_io_init(void);
193void extent_io_exit(void); 194void extent_io_exit(void);
194 195
@@ -259,6 +260,8 @@ void free_extent_buffer(struct extent_buffer *eb);
259int read_extent_buffer_pages(struct extent_io_tree *tree, 260int read_extent_buffer_pages(struct extent_io_tree *tree,
260 struct extent_buffer *eb, u64 start, int wait, 261 struct extent_buffer *eb, u64 start, int wait,
261 get_extent_t *get_extent, int mirror_num); 262 get_extent_t *get_extent, int mirror_num);
263unsigned long num_extent_pages(u64 start, u64 len);
264struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
262 265
263static inline void extent_buffer_get(struct extent_buffer *eb) 266static inline void extent_buffer_get(struct extent_buffer *eb)
264{ 267{
@@ -308,4 +311,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
308struct bio * 311struct bio *
309btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 312btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
310 gfp_t gfp_flags); 313 gfp_t gfp_flags);
314
315struct btrfs_mapping_tree;
316
317int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
318 u64 length, u64 logical, struct page *page,
319 int mirror_num);
311#endif 320#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b6b612e14ed7..9d0eaa57d4ee 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,10 +45,10 @@
45#include "btrfs_inode.h" 45#include "btrfs_inode.h"
46#include "ioctl.h" 46#include "ioctl.h"
47#include "print-tree.h" 47#include "print-tree.h"
48#include "volumes.h"
49#include "ordered-data.h" 48#include "ordered-data.h"
50#include "xattr.h" 49#include "xattr.h"
51#include "tree-log.h" 50#include "tree-log.h"
51#include "volumes.h"
52#include "compression.h" 52#include "compression.h"
53#include "locking.h" 53#include "locking.h"
54#include "free-space-cache.h" 54#include "free-space-cache.h"
@@ -1823,153 +1823,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1823} 1823}
1824 1824
1825/* 1825/*
1826 * When IO fails, either with EIO or csum verification fails, we
1827 * try other mirrors that might have a good copy of the data. This
1828 * io_failure_record is used to record state as we go through all the
1829 * mirrors. If another mirror has good data, the page is set up to date
1830 * and things continue. If a good mirror can't be found, the original
1831 * bio end_io callback is called to indicate things have failed.
1832 */
1833struct io_failure_record {
1834 struct page *page;
1835 u64 start;
1836 u64 len;
1837 u64 logical;
1838 unsigned long bio_flags;
1839 int last_mirror;
1840};
1841
1842static int btrfs_io_failed_hook(struct bio *failed_bio,
1843 struct page *page, u64 start, u64 end,
1844 struct extent_state *state)
1845{
1846 struct io_failure_record *failrec = NULL;
1847 u64 private;
1848 struct extent_map *em;
1849 struct inode *inode = page->mapping->host;
1850 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1851 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1852 struct bio *bio;
1853 int num_copies;
1854 int ret;
1855 int rw;
1856 u64 logical;
1857
1858 ret = get_state_private(failure_tree, start, &private);
1859 if (ret) {
1860 failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
1861 if (!failrec)
1862 return -ENOMEM;
1863 failrec->start = start;
1864 failrec->len = end - start + 1;
1865 failrec->last_mirror = 0;
1866 failrec->bio_flags = 0;
1867
1868 read_lock(&em_tree->lock);
1869 em = lookup_extent_mapping(em_tree, start, failrec->len);
1870 if (em->start > start || em->start + em->len < start) {
1871 free_extent_map(em);
1872 em = NULL;
1873 }
1874 read_unlock(&em_tree->lock);
1875
1876 if (IS_ERR_OR_NULL(em)) {
1877 kfree(failrec);
1878 return -EIO;
1879 }
1880 logical = start - em->start;
1881 logical = em->block_start + logical;
1882 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1883 logical = em->block_start;
1884 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1885 extent_set_compress_type(&failrec->bio_flags,
1886 em->compress_type);
1887 }
1888 failrec->logical = logical;
1889 free_extent_map(em);
1890 set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
1891 EXTENT_DIRTY, GFP_NOFS);
1892 set_state_private(failure_tree, start,
1893 (u64)(unsigned long)failrec);
1894 } else {
1895 failrec = (struct io_failure_record *)(unsigned long)private;
1896 }
1897 num_copies = btrfs_num_copies(
1898 &BTRFS_I(inode)->root->fs_info->mapping_tree,
1899 failrec->logical, failrec->len);
1900 failrec->last_mirror++;
1901 if (!state) {
1902 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1903 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1904 failrec->start,
1905 EXTENT_LOCKED);
1906 if (state && state->start != failrec->start)
1907 state = NULL;
1908 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1909 }
1910 if (!state || failrec->last_mirror > num_copies) {
1911 set_state_private(failure_tree, failrec->start, 0);
1912 clear_extent_bits(failure_tree, failrec->start,
1913 failrec->start + failrec->len - 1,
1914 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1915 kfree(failrec);
1916 return -EIO;
1917 }
1918 bio = bio_alloc(GFP_NOFS, 1);
1919 bio->bi_private = state;
1920 bio->bi_end_io = failed_bio->bi_end_io;
1921 bio->bi_sector = failrec->logical >> 9;
1922 bio->bi_bdev = failed_bio->bi_bdev;
1923 bio->bi_size = 0;
1924
1925 bio_add_page(bio, page, failrec->len, start - page_offset(page));
1926 if (failed_bio->bi_rw & REQ_WRITE)
1927 rw = WRITE;
1928 else
1929 rw = READ;
1930
1931 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1932 failrec->last_mirror,
1933 failrec->bio_flags, 0);
1934 return ret;
1935}
1936
1937/*
1938 * each time an IO finishes, we do a fast check in the IO failure tree
1939 * to see if we need to process or clean up an io_failure_record
1940 */
1941static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1942{
1943 u64 private;
1944 u64 private_failure;
1945 struct io_failure_record *failure;
1946 int ret;
1947
1948 private = 0;
1949 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1950 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1951 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1952 start, &private_failure);
1953 if (ret == 0) {
1954 failure = (struct io_failure_record *)(unsigned long)
1955 private_failure;
1956 set_state_private(&BTRFS_I(inode)->io_failure_tree,
1957 failure->start, 0);
1958 clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
1959 failure->start,
1960 failure->start + failure->len - 1,
1961 EXTENT_DIRTY | EXTENT_LOCKED,
1962 GFP_NOFS);
1963 kfree(failure);
1964 }
1965 }
1966 return 0;
1967}
1968
1969/*
1970 * when reads are done, we need to check csums to verify the data is correct 1826 * when reads are done, we need to check csums to verify the data is correct
1971 * if there's a match, we allow the bio to finish. If not, we go through 1827 * if there's a match, we allow the bio to finish. If not, the code in
1972 * the io_failure_record routines to find good copies 1828 * extent_io.c will try to find good copies for us.
1973 */ 1829 */
1974static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, 1830static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1975 struct extent_state *state) 1831 struct extent_state *state)
@@ -2015,10 +1871,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
2015 1871
2016 kunmap_atomic(kaddr, KM_USER0); 1872 kunmap_atomic(kaddr, KM_USER0);
2017good: 1873good:
2018 /* if the io failure tree for this inode is non-empty,
2019 * check to see if we've recovered from a failed IO
2020 */
2021 btrfs_clean_io_failures(inode, start);
2022 return 0; 1874 return 0;
2023 1875
2024zeroit: 1876zeroit:
@@ -6273,7 +6125,7 @@ int btrfs_readpage(struct file *file, struct page *page)
6273{ 6125{
6274 struct extent_io_tree *tree; 6126 struct extent_io_tree *tree;
6275 tree = &BTRFS_I(page->mapping->host)->io_tree; 6127 tree = &BTRFS_I(page->mapping->host)->io_tree;
6276 return extent_read_full_page(tree, page, btrfs_get_extent); 6128 return extent_read_full_page(tree, page, btrfs_get_extent, 0);
6277} 6129}
6278 6130
6279static int btrfs_writepage(struct page *page, struct writeback_control *wbc) 6131static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -7406,7 +7258,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
7406 .readpage_end_io_hook = btrfs_readpage_end_io_hook, 7258 .readpage_end_io_hook = btrfs_readpage_end_io_hook,
7407 .writepage_end_io_hook = btrfs_writepage_end_io_hook, 7259 .writepage_end_io_hook = btrfs_writepage_end_io_hook,
7408 .writepage_start_hook = btrfs_writepage_start_hook, 7260 .writepage_start_hook = btrfs_writepage_start_hook,
7409 .readpage_io_failed_hook = btrfs_io_failed_hook,
7410 .set_bit_hook = btrfs_set_bit_hook, 7261 .set_bit_hook = btrfs_set_bit_hook,
7411 .clear_bit_hook = btrfs_clear_bit_hook, 7262 .clear_bit_hook = btrfs_clear_bit_hook,
7412 .merge_extent_hook = btrfs_merge_extent_hook, 7263 .merge_extent_hook = btrfs_merge_extent_hook,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8f6e14279409..cc9893990341 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -51,6 +51,7 @@
51#include "volumes.h" 51#include "volumes.h"
52#include "locking.h" 52#include "locking.h"
53#include "inode-map.h" 53#include "inode-map.h"
54#include "backref.h"
54 55
55/* Mask out flags that are inappropriate for the given type of inode. */ 56/* Mask out flags that are inappropriate for the given type of inode. */
56static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 57static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -2890,6 +2891,144 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
2890 return ret; 2891 return ret;
2891} 2892}
2892 2893
2894static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2895{
2896 int ret = 0;
2897 int i;
2898 unsigned long rel_ptr;
2899 int size;
2900 struct btrfs_ioctl_ino_path_args *ipa = NULL;
2901 struct inode_fs_paths *ipath = NULL;
2902 struct btrfs_path *path;
2903
2904 if (!capable(CAP_SYS_ADMIN))
2905 return -EPERM;
2906
2907 path = btrfs_alloc_path();
2908 if (!path) {
2909 ret = -ENOMEM;
2910 goto out;
2911 }
2912
2913 ipa = memdup_user(arg, sizeof(*ipa));
2914 if (IS_ERR(ipa)) {
2915 ret = PTR_ERR(ipa);
2916 ipa = NULL;
2917 goto out;
2918 }
2919
2920 size = min_t(u32, ipa->size, 4096);
2921 ipath = init_ipath(size, root, path);
2922 if (IS_ERR(ipath)) {
2923 ret = PTR_ERR(ipath);
2924 ipath = NULL;
2925 goto out;
2926 }
2927
2928 ret = paths_from_inode(ipa->inum, ipath);
2929 if (ret < 0)
2930 goto out;
2931
2932 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
2933 rel_ptr = ipath->fspath->str[i] - (char *)ipath->fspath->str;
2934 ipath->fspath->str[i] = (void *)rel_ptr;
2935 }
2936
2937 ret = copy_to_user(ipa->fspath, ipath->fspath, size);
2938 if (ret) {
2939 ret = -EFAULT;
2940 goto out;
2941 }
2942
2943out:
2944 btrfs_free_path(path);
2945 free_ipath(ipath);
2946 kfree(ipa);
2947
2948 return ret;
2949}
2950
2951static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
2952{
2953 struct btrfs_data_container *inodes = ctx;
2954 const size_t c = 3 * sizeof(u64);
2955
2956 if (inodes->bytes_left >= c) {
2957 inodes->bytes_left -= c;
2958 inodes->val[inodes->elem_cnt] = inum;
2959 inodes->val[inodes->elem_cnt + 1] = offset;
2960 inodes->val[inodes->elem_cnt + 2] = root;
2961 inodes->elem_cnt += 3;
2962 } else {
2963 inodes->bytes_missing += c - inodes->bytes_left;
2964 inodes->bytes_left = 0;
2965 inodes->elem_missed += 3;
2966 }
2967
2968 return 0;
2969}
2970
2971static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
2972 void __user *arg)
2973{
2974 int ret = 0;
2975 int size;
2976 u64 extent_offset;
2977 struct btrfs_ioctl_logical_ino_args *loi;
2978 struct btrfs_data_container *inodes = NULL;
2979 struct btrfs_path *path = NULL;
2980 struct btrfs_key key;
2981
2982 if (!capable(CAP_SYS_ADMIN))
2983 return -EPERM;
2984
2985 loi = memdup_user(arg, sizeof(*loi));
2986 if (IS_ERR(loi)) {
2987 ret = PTR_ERR(loi);
2988 loi = NULL;
2989 goto out;
2990 }
2991
2992 path = btrfs_alloc_path();
2993 if (!path) {
2994 ret = -ENOMEM;
2995 goto out;
2996 }
2997
2998 size = min_t(u32, loi->size, 4096);
2999 inodes = init_data_container(size);
3000 if (IS_ERR(inodes)) {
3001 ret = PTR_ERR(inodes);
3002 inodes = NULL;
3003 goto out;
3004 }
3005
3006 ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
3007
3008 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
3009 ret = -ENOENT;
3010 if (ret < 0)
3011 goto out;
3012
3013 extent_offset = loi->logical - key.objectid;
3014 ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
3015 extent_offset, build_ino_list, inodes);
3016
3017 if (ret < 0)
3018 goto out;
3019
3020 ret = copy_to_user(loi->inodes, inodes, size);
3021 if (ret)
3022 ret = -EFAULT;
3023
3024out:
3025 btrfs_free_path(path);
3026 kfree(inodes);
3027 kfree(loi);
3028
3029 return ret;
3030}
3031
2893long btrfs_ioctl(struct file *file, unsigned int 3032long btrfs_ioctl(struct file *file, unsigned int
2894 cmd, unsigned long arg) 3033 cmd, unsigned long arg)
2895{ 3034{
@@ -2947,6 +3086,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2947 return btrfs_ioctl_tree_search(file, argp); 3086 return btrfs_ioctl_tree_search(file, argp);
2948 case BTRFS_IOC_INO_LOOKUP: 3087 case BTRFS_IOC_INO_LOOKUP:
2949 return btrfs_ioctl_ino_lookup(file, argp); 3088 return btrfs_ioctl_ino_lookup(file, argp);
3089 case BTRFS_IOC_INO_PATHS:
3090 return btrfs_ioctl_ino_to_path(root, argp);
3091 case BTRFS_IOC_LOGICAL_INO:
3092 return btrfs_ioctl_logical_to_ino(root, argp);
2950 case BTRFS_IOC_SPACE_INFO: 3093 case BTRFS_IOC_SPACE_INFO:
2951 return btrfs_ioctl_space_info(root, argp); 3094 return btrfs_ioctl_space_info(root, argp);
2952 case BTRFS_IOC_SYNC: 3095 case BTRFS_IOC_SYNC:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea789fcb4..2da30d4950e6 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,31 @@ struct btrfs_ioctl_space_args {
193 struct btrfs_ioctl_space_info spaces[0]; 193 struct btrfs_ioctl_space_info spaces[0];
194}; 194};
195 195
/*
 * Header of a variable-length result buffer handed back by the backref
 * ioctls. The four counters are followed directly by the payload, which is
 * addressed through the trailing union either as an array of string pointers
 * (str) or as an array of 64-bit values (val). All members are outputs.
 */
196struct btrfs_data_container {
197 __u32 bytes_left; /* out -- bytes not needed to deliver output */
198 __u32 bytes_missing; /* out -- additional bytes needed for result */
199 __u32 elem_cnt; /* out */
200 __u32 elem_missed; /* out */
/* zero-length arrays: both members alias the start of the payload area */
201 union {
202 char *str[0]; /* out */
203 __u64 val[0]; /* out */
204 };
205};
206
/*
 * Argument block named by BTRFS_IOC_INO_PATHS.
 * The caller supplies an inode number and a buffer size; results are
 * delivered through the btrfs_data_container that fspath points to.
 */
207struct btrfs_ioctl_ino_path_args {
208 __u64 inum; /* in */
209 __u32 size; /* in */
210 __u64 reserved[4];
211 struct btrfs_data_container *fspath; /* out */
212};
213
/*
 * Argument block read by btrfs_ioctl_logical_to_ino().
 * logical is the logical address to look up, size is the byte size of the
 * result buffer (the handler caps it at 4096) and inodes is the destination
 * the handler copies the resulting btrfs_data_container to with
 * copy_to_user().
 */
214struct btrfs_ioctl_logical_ino_args {
215 __u64 logical; /* in */
216 __u32 size; /* in */
217 __u64 reserved[4];
218 struct btrfs_data_container *inodes; /* out */
219};
220
196#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 221#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
197 struct btrfs_ioctl_vol_args) 222 struct btrfs_ioctl_vol_args)
198#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 223#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -248,4 +273,9 @@ struct btrfs_ioctl_space_args {
248 struct btrfs_ioctl_dev_info_args) 273 struct btrfs_ioctl_dev_info_args)
249#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ 274#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
250 struct btrfs_ioctl_fs_info_args) 275 struct btrfs_ioctl_fs_info_args)
/* Dispatched to btrfs_ioctl_ino_to_path(); takes an inode number. */
#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
					struct btrfs_ioctl_ino_path_args)
/*
 * Dispatched to btrfs_ioctl_logical_to_ino(); takes a logical address.
 * The argument type is struct btrfs_ioctl_logical_ino_args, which is what the
 * handler actually copies in. That struct has the same member types in the
 * same order as struct btrfs_ioctl_ino_path_args, so the size encoded into
 * the ioctl number is unchanged.
 */
#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
					struct btrfs_ioctl_logical_ino_args)
280
251#endif 281#endif
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 2b701d082227..cd857119ba8a 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -247,7 +247,7 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
247 247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, 248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical, 249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_multi_bio *multi) 250 struct btrfs_bio *multi)
251{ 251{
252 int ret; 252 int ret;
253 int looped = 0; 253 int looped = 0;
@@ -327,7 +327,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
327 struct reada_extent *re = NULL; 327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info; 328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_multi_bio *multi = NULL; 330 struct btrfs_bio *multi = NULL;
331 struct btrfs_device *dev; 331 struct btrfs_device *dev;
332 u32 blocksize; 332 u32 blocksize;
333 u64 length; 333 u64 length;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5bc4ec827b3d..94cd3a19e9c8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
17 */ 17 */
18 18
19#include <linux/blkdev.h> 19#include <linux/blkdev.h>
20#include <linux/ratelimit.h>
20#include "ctree.h" 21#include "ctree.h"
21#include "volumes.h" 22#include "volumes.h"
22#include "disk-io.h" 23#include "disk-io.h"
23#include "ordered-data.h" 24#include "ordered-data.h"
25#include "transaction.h"
26#include "backref.h"
27#include "extent_io.h"
24 28
25/* 29/*
26 * This is only the first step towards a full-features scrub. It reads all 30 * This is only the first step towards a full-features scrub. It reads all
@@ -60,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
60struct scrub_page { 64struct scrub_page {
61 u64 flags; /* extent flags */ 65 u64 flags; /* extent flags */
62 u64 generation; 66 u64 generation;
63 u64 mirror_num; 67 int mirror_num;
64 int have_csum; 68 int have_csum;
65 u8 csum[BTRFS_CSUM_SIZE]; 69 u8 csum[BTRFS_CSUM_SIZE];
66}; 70};
@@ -84,6 +88,7 @@ struct scrub_dev {
84 int first_free; 88 int first_free;
85 int curr; 89 int curr;
86 atomic_t in_flight; 90 atomic_t in_flight;
91 atomic_t fixup_cnt;
87 spinlock_t list_lock; 92 spinlock_t list_lock;
88 wait_queue_head_t list_wait; 93 wait_queue_head_t list_wait;
89 u16 csum_size; 94 u16 csum_size;
@@ -97,6 +102,27 @@ struct scrub_dev {
97 spinlock_t stat_lock; 102 spinlock_t stat_lock;
98}; 103};
99 104
/*
 * Work item describing one deferred repair of a data page that has no
 * checksum. scrub_fixup() allocates and fills it, then queues it on the
 * scrub worker pool; the worker function frees it when done.
 *
 * sdev:       scrub device whose statistics and fixup_cnt are updated
 * logical:    logical address of the page to repair
 * root:       root used to join a transaction (set to the extent root)
 * work:       embedded work item, used to recover this struct in the worker
 * mirror_num: mirror number of the copy that failed
 */
105struct scrub_fixup_nodatasum {
106 struct scrub_dev *sdev;
107 u64 logical;
108 struct btrfs_root *root;
109 struct btrfs_work work;
110 int mirror_num;
111};
112
/*
 * Context passed from scrub_print_warning() to its per-inode callback.
 *
 * path:             search path shared with the callback
 * extent_item_size: offset field of the extent item key that was found
 * scratch_buf,
 * msg_buf:          buffers allocated and freed by scrub_print_warning()
 * errstr:           short description of the error, printed first
 * sector:           device sector of the failing page
 * logical:          logical address of the failing page
 * dev:              device the error was found on
 * msg_bufsize,
 * scratch_bufsize:  sizes of the two buffers above
 */
113struct scrub_warning {
114 struct btrfs_path *path;
115 u64 extent_item_size;
116 char *scratch_buf;
117 char *msg_buf;
118 const char *errstr;
119 sector_t sector;
120 u64 logical;
121 struct btrfs_device *dev;
122 int msg_bufsize;
123 int scratch_bufsize;
124};
125
100static void scrub_free_csums(struct scrub_dev *sdev) 126static void scrub_free_csums(struct scrub_dev *sdev)
101{ 127{
102 while (!list_empty(&sdev->csum_list)) { 128 while (!list_empty(&sdev->csum_list)) {
@@ -172,12 +198,13 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
172 198
173 if (i != SCRUB_BIOS_PER_DEV-1) 199 if (i != SCRUB_BIOS_PER_DEV-1)
174 sdev->bios[i]->next_free = i + 1; 200 sdev->bios[i]->next_free = i + 1;
175 else 201 else
176 sdev->bios[i]->next_free = -1; 202 sdev->bios[i]->next_free = -1;
177 } 203 }
178 sdev->first_free = 0; 204 sdev->first_free = 0;
179 sdev->curr = -1; 205 sdev->curr = -1;
180 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
207 atomic_set(&sdev->fixup_cnt, 0);
181 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
182 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
183 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
@@ -192,24 +219,361 @@ nomem:
192 return ERR_PTR(-ENOMEM); 219 return ERR_PTR(-ENOMEM);
193} 220}
194 221
222static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
223{
224 u64 isize;
225 u32 nlink;
226 int ret;
227 int i;
228 struct extent_buffer *eb;
229 struct btrfs_inode_item *inode_item;
230 struct scrub_warning *swarn = ctx;
231 struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
232 struct inode_fs_paths *ipath = NULL;
233 struct btrfs_root *local_root;
234 struct btrfs_key root_key;
235
236 root_key.objectid = root;
237 root_key.type = BTRFS_ROOT_ITEM_KEY;
238 root_key.offset = (u64)-1;
239 local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
240 if (IS_ERR(local_root)) {
241 ret = PTR_ERR(local_root);
242 goto err;
243 }
244
245 ret = inode_item_info(inum, 0, local_root, swarn->path);
246 if (ret) {
247 btrfs_release_path(swarn->path);
248 goto err;
249 }
250
251 eb = swarn->path->nodes[0];
252 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
253 struct btrfs_inode_item);
254 isize = btrfs_inode_size(eb, inode_item);
255 nlink = btrfs_inode_nlink(eb, inode_item);
256 btrfs_release_path(swarn->path);
257
258 ipath = init_ipath(4096, local_root, swarn->path);
259 ret = paths_from_inode(inum, ipath);
260
261 if (ret < 0)
262 goto err;
263
264 /*
265 * we deliberately ignore the bit ipath might have been too small to
266 * hold all of the paths here
267 */
268 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
269 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
270 "%s, sector %llu, root %llu, inode %llu, offset %llu, "
271 "length %llu, links %u (path: %s)\n", swarn->errstr,
272 swarn->logical, swarn->dev->name,
273 (unsigned long long)swarn->sector, root, inum, offset,
274 min(isize - offset, (u64)PAGE_SIZE), nlink,
275 ipath->fspath->str[i]);
276
277 free_ipath(ipath);
278 return 0;
279
280err:
281 printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
282 "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
283 "resolving failed with ret=%d\n", swarn->errstr,
284 swarn->logical, swarn->dev->name,
285 (unsigned long long)swarn->sector, root, inum, offset, ret);
286
287 free_ipath(ipath);
288 return 0;
289}
290
291static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
292 int ix)
293{
294 struct btrfs_device *dev = sbio->sdev->dev;
295 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
296 struct btrfs_path *path;
297 struct btrfs_key found_key;
298 struct extent_buffer *eb;
299 struct btrfs_extent_item *ei;
300 struct scrub_warning swarn;
301 u32 item_size;
302 int ret;
303 u64 ref_root;
304 u8 ref_level;
305 unsigned long ptr = 0;
306 const int bufsize = 4096;
307 u64 extent_offset;
308
309 path = btrfs_alloc_path();
310
311 swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
312 swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
313 swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
314 swarn.logical = sbio->logical + ix * PAGE_SIZE;
315 swarn.errstr = errstr;
316 swarn.dev = dev;
317 swarn.msg_bufsize = bufsize;
318 swarn.scratch_bufsize = bufsize;
319
320 if (!path || !swarn.scratch_buf || !swarn.msg_buf)
321 goto out;
322
323 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
324 if (ret < 0)
325 goto out;
326
327 extent_offset = swarn.logical - found_key.objectid;
328 swarn.extent_item_size = found_key.offset;
329
330 eb = path->nodes[0];
331 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
332 item_size = btrfs_item_size_nr(eb, path->slots[0]);
333
334 if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
335 do {
336 ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
337 &ref_root, &ref_level);
338 printk(KERN_WARNING "%s at logical %llu on dev %s, "
339 "sector %llu: metadata %s (level %d) in tree "
340 "%llu\n", errstr, swarn.logical, dev->name,
341 (unsigned long long)swarn.sector,
342 ref_level ? "node" : "leaf",
343 ret < 0 ? -1 : ref_level,
344 ret < 0 ? -1 : ref_root);
345 } while (ret != 1);
346 } else {
347 swarn.path = path;
348 iterate_extent_inodes(fs_info, path, found_key.objectid,
349 extent_offset,
350 scrub_print_warning_inode, &swarn);
351 }
352
353out:
354 btrfs_free_path(path);
355 kfree(swarn.scratch_buf);
356 kfree(swarn.msg_buf);
357}
358
359static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
360{
361 struct page *page = NULL;
362 unsigned long index;
363 struct scrub_fixup_nodatasum *fixup = ctx;
364 int ret;
365 int corrected = 0;
366 struct btrfs_key key;
367 struct inode *inode = NULL;
368 u64 end = offset + PAGE_SIZE - 1;
369 struct btrfs_root *local_root;
370
371 key.objectid = root;
372 key.type = BTRFS_ROOT_ITEM_KEY;
373 key.offset = (u64)-1;
374 local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
375 if (IS_ERR(local_root))
376 return PTR_ERR(local_root);
377
378 key.type = BTRFS_INODE_ITEM_KEY;
379 key.objectid = inum;
380 key.offset = 0;
381 inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
382 if (IS_ERR(inode))
383 return PTR_ERR(inode);
384
385 index = offset >> PAGE_CACHE_SHIFT;
386
387 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
388 if (!page) {
389 ret = -ENOMEM;
390 goto out;
391 }
392
393 if (PageUptodate(page)) {
394 struct btrfs_mapping_tree *map_tree;
395 if (PageDirty(page)) {
396 /*
397 * we need to write the data to the defect sector. the
398 * data that was in that sector is not in memory,
399 * because the page was modified. we must not write the
400 * modified page to that sector.
401 *
402 * TODO: what could be done here: wait for the delalloc
403 * runner to write out that page (might involve
404 * COW) and see whether the sector is still
405 * referenced afterwards.
406 *
407 * For the meantime, we'll treat this error
408 * incorrectable, although there is a chance that a
409 * later scrub will find the bad sector again and that
410 * there's no dirty page in memory, then.
411 */
412 ret = -EIO;
413 goto out;
414 }
415 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
416 ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
417 fixup->logical, page,
418 fixup->mirror_num);
419 unlock_page(page);
420 corrected = !ret;
421 } else {
422 /*
423 * we need to get good data first. the general readpage path
424 * will call repair_io_failure for us, we just have to make
425 * sure we read the bad mirror.
426 */
427 ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
428 EXTENT_DAMAGED, GFP_NOFS);
429 if (ret) {
430 /* set_extent_bits should give proper error */
431 WARN_ON(ret > 0);
432 if (ret > 0)
433 ret = -EFAULT;
434 goto out;
435 }
436
437 ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
438 btrfs_get_extent,
439 fixup->mirror_num);
440 wait_on_page_locked(page);
441
442 corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
443 end, EXTENT_DAMAGED, 0, NULL);
444 if (!corrected)
445 clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
446 EXTENT_DAMAGED, GFP_NOFS);
447 }
448
449out:
450 if (page)
451 put_page(page);
452 if (inode)
453 iput(inode);
454
455 if (ret < 0)
456 return ret;
457
458 if (ret == 0 && corrected) {
459 /*
460 * we only need to call readpage for one of the inodes belonging
461 * to this extent. so make iterate_extent_inodes stop
462 */
463 return 1;
464 }
465
466 return -EIO;
467}
468
469static void scrub_fixup_nodatasum(struct btrfs_work *work)
470{
471 int ret;
472 struct scrub_fixup_nodatasum *fixup;
473 struct scrub_dev *sdev;
474 struct btrfs_trans_handle *trans = NULL;
475 struct btrfs_fs_info *fs_info;
476 struct btrfs_path *path;
477 int uncorrectable = 0;
478
479 fixup = container_of(work, struct scrub_fixup_nodatasum, work);
480 sdev = fixup->sdev;
481 fs_info = fixup->root->fs_info;
482
483 path = btrfs_alloc_path();
484 if (!path) {
485 spin_lock(&sdev->stat_lock);
486 ++sdev->stat.malloc_errors;
487 spin_unlock(&sdev->stat_lock);
488 uncorrectable = 1;
489 goto out;
490 }
491
492 trans = btrfs_join_transaction(fixup->root);
493 if (IS_ERR(trans)) {
494 uncorrectable = 1;
495 goto out;
496 }
497
498 /*
499 * the idea is to trigger a regular read through the standard path. we
500 * read a page from the (failed) logical address by specifying the
501 * corresponding copynum of the failed sector. thus, that readpage is
502 * expected to fail.
503 * that is the point where on-the-fly error correction will kick in
504 * (once it's finished) and rewrite the failed sector if a good copy
505 * can be found.
506 */
507 ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
508 path, scrub_fixup_readpage,
509 fixup);
510 if (ret < 0) {
511 uncorrectable = 1;
512 goto out;
513 }
514 WARN_ON(ret != 1);
515
516 spin_lock(&sdev->stat_lock);
517 ++sdev->stat.corrected_errors;
518 spin_unlock(&sdev->stat_lock);
519
520out:
521 if (trans && !IS_ERR(trans))
522 btrfs_end_transaction(trans, fixup->root);
523 if (uncorrectable) {
524 spin_lock(&sdev->stat_lock);
525 ++sdev->stat.uncorrectable_errors;
526 spin_unlock(&sdev->stat_lock);
527 printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
528 "(nodatasum) error at logical %llu\n",
529 fixup->logical);
530 }
531
532 btrfs_free_path(path);
533 kfree(fixup);
534
535 /* see caller why we're pretending to be paused in the scrub counters */
536 mutex_lock(&fs_info->scrub_lock);
537 atomic_dec(&fs_info->scrubs_running);
538 atomic_dec(&fs_info->scrubs_paused);
539 mutex_unlock(&fs_info->scrub_lock);
540 atomic_dec(&sdev->fixup_cnt);
541 wake_up(&fs_info->scrub_pause_wait);
542 wake_up(&sdev->list_wait);
543}
544
195/* 545/*
196 * scrub_recheck_error gets called when either verification of the page 546 * scrub_recheck_error gets called when either verification of the page
197 * failed or the bio failed to read, e.g. with EIO. In the latter case, 547 * failed or the bio failed to read, e.g. with EIO. In the latter case,
198 * recheck_error gets called for every page in the bio, even though only 548 * recheck_error gets called for every page in the bio, even though only
199 * one may be bad 549 * one may be bad
200 */ 550 */
201static void scrub_recheck_error(struct scrub_bio *sbio, int ix) 551static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
202{ 552{
553 struct scrub_dev *sdev = sbio->sdev;
554 u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
555 static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
556 DEFAULT_RATELIMIT_BURST);
557
203 if (sbio->err) { 558 if (sbio->err) {
204 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, 559 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
205 (sbio->physical + ix * PAGE_SIZE) >> 9,
206 sbio->bio->bi_io_vec[ix].bv_page) == 0) { 560 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
207 if (scrub_fixup_check(sbio, ix) == 0) 561 if (scrub_fixup_check(sbio, ix) == 0)
208 return; 562 return 0;
209 } 563 }
564 if (__ratelimit(&_rs))
565 scrub_print_warning("i/o error", sbio, ix);
566 } else {
567 if (__ratelimit(&_rs))
568 scrub_print_warning("checksum error", sbio, ix);
210 } 569 }
211 570
571 spin_lock(&sdev->stat_lock);
572 ++sdev->stat.read_errors;
573 spin_unlock(&sdev->stat_lock);
574
212 scrub_fixup(sbio, ix); 575 scrub_fixup(sbio, ix);
576 return 1;
213} 577}
214 578
215static int scrub_fixup_check(struct scrub_bio *sbio, int ix) 579static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -247,7 +611,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
247 struct scrub_dev *sdev = sbio->sdev; 611 struct scrub_dev *sdev = sbio->sdev;
248 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; 612 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
249 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; 613 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
250 struct btrfs_multi_bio *multi = NULL; 614 struct btrfs_bio *bbio = NULL;
615 struct scrub_fixup_nodatasum *fixup;
251 u64 logical = sbio->logical + ix * PAGE_SIZE; 616 u64 logical = sbio->logical + ix * PAGE_SIZE;
252 u64 length; 617 u64 length;
253 int i; 618 int i;
@@ -256,18 +621,36 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
256 621
257 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && 622 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
258 (sbio->spag[ix].have_csum == 0)) { 623 (sbio->spag[ix].have_csum == 0)) {
624 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
625 if (!fixup)
626 goto uncorrectable;
627 fixup->sdev = sdev;
628 fixup->logical = logical;
629 fixup->root = fs_info->extent_root;
630 fixup->mirror_num = sbio->spag[ix].mirror_num;
259 /* 631 /*
260 * nodatasum, don't try to fix anything 632 * increment scrubs_running to prevent cancel requests from
261 * FIXME: we can do better, open the inode and trigger a 633 * completing as long as a fixup worker is running. we must also
262 * writeback 634 * increment scrubs_paused to prevent deadlocking on pause
635 * requests used for transactions commits (as the worker uses a
636 * transaction context). it is safe to regard the fixup worker
637 * as paused for all matters practical. effectively, we only
638 * avoid cancellation requests from completing.
263 */ 639 */
264 goto uncorrectable; 640 mutex_lock(&fs_info->scrub_lock);
641 atomic_inc(&fs_info->scrubs_running);
642 atomic_inc(&fs_info->scrubs_paused);
643 mutex_unlock(&fs_info->scrub_lock);
644 atomic_inc(&sdev->fixup_cnt);
645 fixup->work.func = scrub_fixup_nodatasum;
646 btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
647 return;
265 } 648 }
266 649
267 length = PAGE_SIZE; 650 length = PAGE_SIZE;
268 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, 651 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
269 &multi, 0); 652 &bbio, 0);
270 if (ret || !multi || length < PAGE_SIZE) { 653 if (ret || !bbio || length < PAGE_SIZE) {
271 printk(KERN_ERR 654 printk(KERN_ERR
272 "scrub_fixup: btrfs_map_block failed us for %llu\n", 655 "scrub_fixup: btrfs_map_block failed us for %llu\n",
273 (unsigned long long)logical); 656 (unsigned long long)logical);
@@ -275,19 +658,19 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
275 return; 658 return;
276 } 659 }
277 660
278 if (multi->num_stripes == 1) 661 if (bbio->num_stripes == 1)
279 /* there aren't any replicas */ 662 /* there aren't any replicas */
280 goto uncorrectable; 663 goto uncorrectable;
281 664
282 /* 665 /*
283 * first find a good copy 666 * first find a good copy
284 */ 667 */
285 for (i = 0; i < multi->num_stripes; ++i) { 668 for (i = 0; i < bbio->num_stripes; ++i) {
286 if (i == sbio->spag[ix].mirror_num) 669 if (i + 1 == sbio->spag[ix].mirror_num)
287 continue; 670 continue;
288 671
289 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, 672 if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
290 multi->stripes[i].physical >> 9, 673 bbio->stripes[i].physical >> 9,
291 sbio->bio->bi_io_vec[ix].bv_page)) { 674 sbio->bio->bi_io_vec[ix].bv_page)) {
292 /* I/O-error, this is not a good copy */ 675 /* I/O-error, this is not a good copy */
293 continue; 676 continue;
@@ -296,7 +679,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
296 if (scrub_fixup_check(sbio, ix) == 0) 679 if (scrub_fixup_check(sbio, ix) == 0)
297 break; 680 break;
298 } 681 }
299 if (i == multi->num_stripes) 682 if (i == bbio->num_stripes)
300 goto uncorrectable; 683 goto uncorrectable;
301 684
302 if (!sdev->readonly) { 685 if (!sdev->readonly) {
@@ -311,25 +694,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
311 } 694 }
312 } 695 }
313 696
314 kfree(multi); 697 kfree(bbio);
315 spin_lock(&sdev->stat_lock); 698 spin_lock(&sdev->stat_lock);
316 ++sdev->stat.corrected_errors; 699 ++sdev->stat.corrected_errors;
317 spin_unlock(&sdev->stat_lock); 700 spin_unlock(&sdev->stat_lock);
318 701
319 if (printk_ratelimit()) 702 printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
320 printk(KERN_ERR "btrfs: fixed up at %llu\n", 703 (unsigned long long)logical);
321 (unsigned long long)logical);
322 return; 704 return;
323 705
324uncorrectable: 706uncorrectable:
325 kfree(multi); 707 kfree(bbio);
326 spin_lock(&sdev->stat_lock); 708 spin_lock(&sdev->stat_lock);
327 ++sdev->stat.uncorrectable_errors; 709 ++sdev->stat.uncorrectable_errors;
328 spin_unlock(&sdev->stat_lock); 710 spin_unlock(&sdev->stat_lock);
329 711
330 if (printk_ratelimit()) 712 printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
331 printk(KERN_ERR "btrfs: unable to fixup at %llu\n", 713 "logical %llu\n", (unsigned long long)logical);
332 (unsigned long long)logical);
333} 714}
334 715
335static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, 716static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
@@ -379,8 +760,14 @@ static void scrub_checksum(struct btrfs_work *work)
379 int ret; 760 int ret;
380 761
381 if (sbio->err) { 762 if (sbio->err) {
763 ret = 0;
382 for (i = 0; i < sbio->count; ++i) 764 for (i = 0; i < sbio->count; ++i)
383 scrub_recheck_error(sbio, i); 765 ret |= scrub_recheck_error(sbio, i);
766 if (!ret) {
767 spin_lock(&sdev->stat_lock);
768 ++sdev->stat.unverified_errors;
769 spin_unlock(&sdev->stat_lock);
770 }
384 771
385 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); 772 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
386 sbio->bio->bi_flags |= 1 << BIO_UPTODATE; 773 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -393,10 +780,6 @@ static void scrub_checksum(struct btrfs_work *work)
393 bi->bv_offset = 0; 780 bi->bv_offset = 0;
394 bi->bv_len = PAGE_SIZE; 781 bi->bv_len = PAGE_SIZE;
395 } 782 }
396
397 spin_lock(&sdev->stat_lock);
398 ++sdev->stat.read_errors;
399 spin_unlock(&sdev->stat_lock);
400 goto out; 783 goto out;
401 } 784 }
402 for (i = 0; i < sbio->count; ++i) { 785 for (i = 0; i < sbio->count; ++i) {
@@ -417,8 +800,14 @@ static void scrub_checksum(struct btrfs_work *work)
417 WARN_ON(1); 800 WARN_ON(1);
418 } 801 }
419 kunmap_atomic(buffer, KM_USER0); 802 kunmap_atomic(buffer, KM_USER0);
420 if (ret) 803 if (ret) {
421 scrub_recheck_error(sbio, i); 804 ret = scrub_recheck_error(sbio, i);
805 if (!ret) {
806 spin_lock(&sdev->stat_lock);
807 ++sdev->stat.unverified_errors;
808 spin_unlock(&sdev->stat_lock);
809 }
810 }
422 } 811 }
423 812
424out: 813out:
@@ -601,7 +990,7 @@ nomem:
601} 990}
602 991
603static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, 992static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
604 u64 physical, u64 flags, u64 gen, u64 mirror_num, 993 u64 physical, u64 flags, u64 gen, int mirror_num,
605 u8 *csum, int force) 994 u8 *csum, int force)
606{ 995{
607 struct scrub_bio *sbio; 996 struct scrub_bio *sbio;
@@ -698,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
698 1087
699/* scrub extent tries to collect up to 64 kB for each bio */ 1088/* scrub extent tries to collect up to 64 kB for each bio */
700static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, 1089static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
701 u64 physical, u64 flags, u64 gen, u64 mirror_num) 1090 u64 physical, u64 flags, u64 gen, int mirror_num)
702{ 1091{
703 int ret; 1092 int ret;
704 u8 csum[BTRFS_CSUM_SIZE]; 1093 u8 csum[BTRFS_CSUM_SIZE];
@@ -743,7 +1132,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
743 u64 physical; 1132 u64 physical;
744 u64 logical; 1133 u64 logical;
745 u64 generation; 1134 u64 generation;
746 u64 mirror_num; 1135 int mirror_num;
747 struct reada_control *reada1; 1136 struct reada_control *reada1;
748 struct reada_control *reada2; 1137 struct reada_control *reada2;
749 struct btrfs_key key_start; 1138 struct btrfs_key key_start;
@@ -758,21 +1147,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 1147 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num; 1148 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes; 1149 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0; 1150 mirror_num = 1;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 1151 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes; 1152 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes); 1153 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor; 1154 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes; 1155 mirror_num = num % map->sub_stripes + 1;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 1156 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len; 1157 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes; 1158 mirror_num = num % map->num_stripes + 1;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 1159 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len; 1160 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes; 1161 mirror_num = num % map->num_stripes + 1;
773 } else { 1162 } else {
774 increment = map->stripe_len; 1163 increment = map->stripe_len;
775 mirror_num = 0; 1164 mirror_num = 1;
776 } 1165 }
777 1166
778 path = btrfs_alloc_path(); 1167 path = btrfs_alloc_path();
@@ -1241,10 +1630,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1241 ret = scrub_enumerate_chunks(sdev, start, end); 1630 ret = scrub_enumerate_chunks(sdev, start, end);
1242 1631
1243 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); 1632 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1244
1245 atomic_dec(&fs_info->scrubs_running); 1633 atomic_dec(&fs_info->scrubs_running);
1246 wake_up(&fs_info->scrub_pause_wait); 1634 wake_up(&fs_info->scrub_pause_wait);
1247 1635
1636 wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
1637
1248 if (progress) 1638 if (progress)
1249 memcpy(progress, &sdev->stat, sizeof(*progress)); 1639 memcpy(progress, &sdev->stat, sizeof(*progress));
1250 1640
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f1685a2b45c8..f8e2943101a1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2880,7 +2880,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2880 2880
2881static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2881static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2882 u64 logical, u64 *length, 2882 u64 logical, u64 *length,
2883 struct btrfs_multi_bio **multi_ret, 2883 struct btrfs_bio **bbio_ret,
2884 int mirror_num) 2884 int mirror_num)
2885{ 2885{
2886 struct extent_map *em; 2886 struct extent_map *em;
@@ -2898,18 +2898,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2898 int i; 2898 int i;
2899 int num_stripes; 2899 int num_stripes;
2900 int max_errors = 0; 2900 int max_errors = 0;
2901 struct btrfs_multi_bio *multi = NULL; 2901 struct btrfs_bio *bbio = NULL;
2902 2902
2903 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD))) 2903 if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2904 stripes_allocated = 1; 2904 stripes_allocated = 1;
2905again: 2905again:
2906 if (multi_ret) { 2906 if (bbio_ret) {
2907 multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), 2907 bbio = kzalloc(btrfs_bio_size(stripes_allocated),
2908 GFP_NOFS); 2908 GFP_NOFS);
2909 if (!multi) 2909 if (!bbio)
2910 return -ENOMEM; 2910 return -ENOMEM;
2911 2911
2912 atomic_set(&multi->error, 0); 2912 atomic_set(&bbio->error, 0);
2913 } 2913 }
2914 2914
2915 read_lock(&em_tree->lock); 2915 read_lock(&em_tree->lock);
@@ -2930,7 +2930,7 @@ again:
2930 if (mirror_num > map->num_stripes) 2930 if (mirror_num > map->num_stripes)
2931 mirror_num = 0; 2931 mirror_num = 0;
2932 2932
2933 /* if our multi bio struct is too small, back off and try again */ 2933 /* if our btrfs_bio struct is too small, back off and try again */
2934 if (rw & REQ_WRITE) { 2934 if (rw & REQ_WRITE) {
2935 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | 2935 if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
2936 BTRFS_BLOCK_GROUP_DUP)) { 2936 BTRFS_BLOCK_GROUP_DUP)) {
@@ -2949,11 +2949,11 @@ again:
2949 stripes_required = map->num_stripes; 2949 stripes_required = map->num_stripes;
2950 } 2950 }
2951 } 2951 }
2952 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) && 2952 if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2953 stripes_allocated < stripes_required) { 2953 stripes_allocated < stripes_required) {
2954 stripes_allocated = map->num_stripes; 2954 stripes_allocated = map->num_stripes;
2955 free_extent_map(em); 2955 free_extent_map(em);
2956 kfree(multi); 2956 kfree(bbio);
2957 goto again; 2957 goto again;
2958 } 2958 }
2959 stripe_nr = offset; 2959 stripe_nr = offset;
@@ -2982,7 +2982,7 @@ again:
2982 *length = em->len - offset; 2982 *length = em->len - offset;
2983 } 2983 }
2984 2984
2985 if (!multi_ret) 2985 if (!bbio_ret)
2986 goto out; 2986 goto out;
2987 2987
2988 num_stripes = 1; 2988 num_stripes = 1;
@@ -3007,13 +3007,17 @@ again:
3007 stripe_index = find_live_mirror(map, 0, 3007 stripe_index = find_live_mirror(map, 0,
3008 map->num_stripes, 3008 map->num_stripes,
3009 current->pid % map->num_stripes); 3009 current->pid % map->num_stripes);
3010 mirror_num = stripe_index + 1;
3010 } 3011 }
3011 3012
3012 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3013 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3013 if (rw & (REQ_WRITE | REQ_DISCARD)) 3014 if (rw & (REQ_WRITE | REQ_DISCARD)) {
3014 num_stripes = map->num_stripes; 3015 num_stripes = map->num_stripes;
3015 else if (mirror_num) 3016 } else if (mirror_num) {
3016 stripe_index = mirror_num - 1; 3017 stripe_index = mirror_num - 1;
3018 } else {
3019 mirror_num = 1;
3020 }
3017 3021
3018 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3022 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3019 int factor = map->num_stripes / map->sub_stripes; 3023 int factor = map->num_stripes / map->sub_stripes;
@@ -3033,6 +3037,7 @@ again:
3033 stripe_index = find_live_mirror(map, stripe_index, 3037 stripe_index = find_live_mirror(map, stripe_index,
3034 map->sub_stripes, stripe_index + 3038 map->sub_stripes, stripe_index +
3035 current->pid % map->sub_stripes); 3039 current->pid % map->sub_stripes);
3040 mirror_num = stripe_index + 1;
3036 } 3041 }
3037 } else { 3042 } else {
3038 /* 3043 /*
@@ -3041,15 +3046,16 @@ again:
3041 * stripe_index is the number of our device in the stripe array 3046 * stripe_index is the number of our device in the stripe array
3042 */ 3047 */
3043 stripe_index = do_div(stripe_nr, map->num_stripes); 3048 stripe_index = do_div(stripe_nr, map->num_stripes);
3049 mirror_num = stripe_index + 1;
3044 } 3050 }
3045 BUG_ON(stripe_index >= map->num_stripes); 3051 BUG_ON(stripe_index >= map->num_stripes);
3046 3052
3047 if (rw & REQ_DISCARD) { 3053 if (rw & REQ_DISCARD) {
3048 for (i = 0; i < num_stripes; i++) { 3054 for (i = 0; i < num_stripes; i++) {
3049 multi->stripes[i].physical = 3055 bbio->stripes[i].physical =
3050 map->stripes[stripe_index].physical + 3056 map->stripes[stripe_index].physical +
3051 stripe_offset + stripe_nr * map->stripe_len; 3057 stripe_offset + stripe_nr * map->stripe_len;
3052 multi->stripes[i].dev = map->stripes[stripe_index].dev; 3058 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
3053 3059
3054 if (map->type & BTRFS_BLOCK_GROUP_RAID0) { 3060 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3055 u64 stripes; 3061 u64 stripes;
@@ -3070,16 +3076,16 @@ again:
3070 } 3076 }
3071 stripes = stripe_nr_end - 1 - j; 3077 stripes = stripe_nr_end - 1 - j;
3072 do_div(stripes, map->num_stripes); 3078 do_div(stripes, map->num_stripes);
3073 multi->stripes[i].length = map->stripe_len * 3079 bbio->stripes[i].length = map->stripe_len *
3074 (stripes - stripe_nr + 1); 3080 (stripes - stripe_nr + 1);
3075 3081
3076 if (i == 0) { 3082 if (i == 0) {
3077 multi->stripes[i].length -= 3083 bbio->stripes[i].length -=
3078 stripe_offset; 3084 stripe_offset;
3079 stripe_offset = 0; 3085 stripe_offset = 0;
3080 } 3086 }
3081 if (stripe_index == last_stripe) 3087 if (stripe_index == last_stripe)
3082 multi->stripes[i].length -= 3088 bbio->stripes[i].length -=
3083 stripe_end_offset; 3089 stripe_end_offset;
3084 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { 3090 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3085 u64 stripes; 3091 u64 stripes;
@@ -3104,11 +3110,11 @@ again:
3104 } 3110 }
3105 stripes = stripe_nr_end - 1 - j; 3111 stripes = stripe_nr_end - 1 - j;
3106 do_div(stripes, factor); 3112 do_div(stripes, factor);
3107 multi->stripes[i].length = map->stripe_len * 3113 bbio->stripes[i].length = map->stripe_len *
3108 (stripes - stripe_nr + 1); 3114 (stripes - stripe_nr + 1);
3109 3115
3110 if (i < map->sub_stripes) { 3116 if (i < map->sub_stripes) {
3111 multi->stripes[i].length -= 3117 bbio->stripes[i].length -=
3112 stripe_offset; 3118 stripe_offset;
3113 if (i == map->sub_stripes - 1) 3119 if (i == map->sub_stripes - 1)
3114 stripe_offset = 0; 3120 stripe_offset = 0;
@@ -3116,11 +3122,11 @@ again:
3116 if (stripe_index >= last_stripe && 3122 if (stripe_index >= last_stripe &&
3117 stripe_index <= (last_stripe + 3123 stripe_index <= (last_stripe +
3118 map->sub_stripes - 1)) { 3124 map->sub_stripes - 1)) {
3119 multi->stripes[i].length -= 3125 bbio->stripes[i].length -=
3120 stripe_end_offset; 3126 stripe_end_offset;
3121 } 3127 }
3122 } else 3128 } else
3123 multi->stripes[i].length = *length; 3129 bbio->stripes[i].length = *length;
3124 3130
3125 stripe_index++; 3131 stripe_index++;
3126 if (stripe_index == map->num_stripes) { 3132 if (stripe_index == map->num_stripes) {
@@ -3131,19 +3137,20 @@ again:
3131 } 3137 }
3132 } else { 3138 } else {
3133 for (i = 0; i < num_stripes; i++) { 3139 for (i = 0; i < num_stripes; i++) {
3134 multi->stripes[i].physical = 3140 bbio->stripes[i].physical =
3135 map->stripes[stripe_index].physical + 3141 map->stripes[stripe_index].physical +
3136 stripe_offset + 3142 stripe_offset +
3137 stripe_nr * map->stripe_len; 3143 stripe_nr * map->stripe_len;
3138 multi->stripes[i].dev = 3144 bbio->stripes[i].dev =
3139 map->stripes[stripe_index].dev; 3145 map->stripes[stripe_index].dev;
3140 stripe_index++; 3146 stripe_index++;
3141 } 3147 }
3142 } 3148 }
3143 if (multi_ret) { 3149 if (bbio_ret) {
3144 *multi_ret = multi; 3150 *bbio_ret = bbio;
3145 multi->num_stripes = num_stripes; 3151 bbio->num_stripes = num_stripes;
3146 multi->max_errors = max_errors; 3152 bbio->max_errors = max_errors;
3153 bbio->mirror_num = mirror_num;
3147 } 3154 }
3148out: 3155out:
3149 free_extent_map(em); 3156 free_extent_map(em);
@@ -3152,9 +3159,9 @@ out:
3152 3159
3153int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 3160int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
3154 u64 logical, u64 *length, 3161 u64 logical, u64 *length,
3155 struct btrfs_multi_bio **multi_ret, int mirror_num) 3162 struct btrfs_bio **bbio_ret, int mirror_num)
3156{ 3163{
3157 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3164 return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
3158 mirror_num); 3165 mirror_num);
3159} 3166}
3160 3167
@@ -3223,28 +3230,30 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
3223 return 0; 3230 return 0;
3224} 3231}
3225 3232
3226static void end_bio_multi_stripe(struct bio *bio, int err) 3233static void btrfs_end_bio(struct bio *bio, int err)
3227{ 3234{
3228 struct btrfs_multi_bio *multi = bio->bi_private; 3235 struct btrfs_bio *bbio = bio->bi_private;
3229 int is_orig_bio = 0; 3236 int is_orig_bio = 0;
3230 3237
3231 if (err) 3238 if (err)
3232 atomic_inc(&multi->error); 3239 atomic_inc(&bbio->error);
3233 3240
3234 if (bio == multi->orig_bio) 3241 if (bio == bbio->orig_bio)
3235 is_orig_bio = 1; 3242 is_orig_bio = 1;
3236 3243
3237 if (atomic_dec_and_test(&multi->stripes_pending)) { 3244 if (atomic_dec_and_test(&bbio->stripes_pending)) {
3238 if (!is_orig_bio) { 3245 if (!is_orig_bio) {
3239 bio_put(bio); 3246 bio_put(bio);
3240 bio = multi->orig_bio; 3247 bio = bbio->orig_bio;
3241 } 3248 }
3242 bio->bi_private = multi->private; 3249 bio->bi_private = bbio->private;
3243 bio->bi_end_io = multi->end_io; 3250 bio->bi_end_io = bbio->end_io;
3251 bio->bi_bdev = (struct block_device *)
3252 (unsigned long)bbio->mirror_num;
3244 /* only send an error to the higher layers if it is 3253 /* only send an error to the higher layers if it is
3245 * beyond the tolerance of the multi-bio 3254 * beyond the tolerance of the multi-bio
3246 */ 3255 */
3247 if (atomic_read(&multi->error) > multi->max_errors) { 3256 if (atomic_read(&bbio->error) > bbio->max_errors) {
3248 err = -EIO; 3257 err = -EIO;
3249 } else if (err) { 3258 } else if (err) {
3250 /* 3259 /*
@@ -3254,7 +3263,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
3254 set_bit(BIO_UPTODATE, &bio->bi_flags); 3263 set_bit(BIO_UPTODATE, &bio->bi_flags);
3255 err = 0; 3264 err = 0;
3256 } 3265 }
3257 kfree(multi); 3266 kfree(bbio);
3258 3267
3259 bio_endio(bio, err); 3268 bio_endio(bio, err);
3260 } else if (!is_orig_bio) { 3269 } else if (!is_orig_bio) {
@@ -3334,20 +3343,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3334 u64 logical = (u64)bio->bi_sector << 9; 3343 u64 logical = (u64)bio->bi_sector << 9;
3335 u64 length = 0; 3344 u64 length = 0;
3336 u64 map_length; 3345 u64 map_length;
3337 struct btrfs_multi_bio *multi = NULL;
3338 int ret; 3346 int ret;
3339 int dev_nr = 0; 3347 int dev_nr = 0;
3340 int total_devs = 1; 3348 int total_devs = 1;
3349 struct btrfs_bio *bbio = NULL;
3341 3350
3342 length = bio->bi_size; 3351 length = bio->bi_size;
3343 map_tree = &root->fs_info->mapping_tree; 3352 map_tree = &root->fs_info->mapping_tree;
3344 map_length = length; 3353 map_length = length;
3345 3354
3346 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, 3355 ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
3347 mirror_num); 3356 mirror_num);
3348 BUG_ON(ret); 3357 BUG_ON(ret);
3349 3358
3350 total_devs = multi->num_stripes; 3359 total_devs = bbio->num_stripes;
3351 if (map_length < length) { 3360 if (map_length < length) {
3352 printk(KERN_CRIT "mapping failed logical %llu bio len %llu " 3361 printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
3353 "len %llu\n", (unsigned long long)logical, 3362 "len %llu\n", (unsigned long long)logical,
@@ -3355,25 +3364,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3355 (unsigned long long)map_length); 3364 (unsigned long long)map_length);
3356 BUG(); 3365 BUG();
3357 } 3366 }
3358 multi->end_io = first_bio->bi_end_io; 3367
3359 multi->private = first_bio->bi_private; 3368 bbio->orig_bio = first_bio;
3360 multi->orig_bio = first_bio; 3369 bbio->private = first_bio->bi_private;
3361 atomic_set(&multi->stripes_pending, multi->num_stripes); 3370 bbio->end_io = first_bio->bi_end_io;
3371 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
3362 3372
3363 while (dev_nr < total_devs) { 3373 while (dev_nr < total_devs) {
3364 if (total_devs > 1) { 3374 if (dev_nr < total_devs - 1) {
3365 if (dev_nr < total_devs - 1) { 3375 bio = bio_clone(first_bio, GFP_NOFS);
3366 bio = bio_clone(first_bio, GFP_NOFS); 3376 BUG_ON(!bio);
3367 BUG_ON(!bio); 3377 } else {
3368 } else { 3378 bio = first_bio;
3369 bio = first_bio;
3370 }
3371 bio->bi_private = multi;
3372 bio->bi_end_io = end_bio_multi_stripe;
3373 } 3379 }
3374 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3380 bio->bi_private = bbio;
3375 dev = multi->stripes[dev_nr].dev; 3381 bio->bi_end_io = btrfs_end_bio;
3382 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
3383 dev = bbio->stripes[dev_nr].dev;
3376 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) { 3384 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3385 pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
3386 "(%s id %llu), size=%u\n", rw,
3387 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
3388 dev->name, dev->devid, bio->bi_size);
3377 bio->bi_bdev = dev->bdev; 3389 bio->bi_bdev = dev->bdev;
3378 if (async_submit) 3390 if (async_submit)
3379 schedule_bio(root, dev, rw, bio); 3391 schedule_bio(root, dev, rw, bio);
@@ -3386,8 +3398,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3386 } 3398 }
3387 dev_nr++; 3399 dev_nr++;
3388 } 3400 }
3389 if (total_devs == 1)
3390 kfree(multi);
3391 return 0; 3401 return 0;
3392} 3402}
3393 3403
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 2a751246188a..ab5b1c49f352 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -144,7 +144,10 @@ struct btrfs_bio_stripe {
144 u64 length; /* only used for discard mappings */ 144 u64 length; /* only used for discard mappings */
145}; 145};
146 146
147struct btrfs_multi_bio { 147struct btrfs_bio;
148typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
149
150struct btrfs_bio {
148 atomic_t stripes_pending; 151 atomic_t stripes_pending;
149 bio_end_io_t *end_io; 152 bio_end_io_t *end_io;
150 struct bio *orig_bio; 153 struct bio *orig_bio;
@@ -152,6 +155,7 @@ struct btrfs_multi_bio {
152 atomic_t error; 155 atomic_t error;
153 int max_errors; 156 int max_errors;
154 int num_stripes; 157 int num_stripes;
158 int mirror_num;
155 struct btrfs_bio_stripe stripes[]; 159 struct btrfs_bio_stripe stripes[];
156}; 160};
157 161
@@ -179,7 +183,7 @@ struct map_lookup {
179int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 183int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
180 u64 end, u64 *length); 184 u64 end, u64 *length);
181 185
182#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 186#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
183 (sizeof(struct btrfs_bio_stripe) * (n))) 187 (sizeof(struct btrfs_bio_stripe) * (n)))
184 188
185int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, 189int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -188,7 +192,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
188 u64 chunk_offset, u64 start, u64 num_bytes); 192 u64 chunk_offset, u64 start, u64 num_bytes);
189int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 193int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
190 u64 logical, u64 *length, 194 u64 logical, u64 *length,
191 struct btrfs_multi_bio **multi_ret, int mirror_num); 195 struct btrfs_bio **bbio_ret, int mirror_num);
192int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 196int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
193 u64 chunk_start, u64 physical, u64 devid, 197 u64 chunk_start, u64 physical, u64 devid,
194 u64 **logical, int *naddrs, int *stripe_len); 198 u64 **logical, int *naddrs, int *stripe_len);