diff options
Diffstat (limited to 'fs/btrfs/scrub.c')
-rw-r--r-- | fs/btrfs/scrub.c | 591 |
1 files changed, 485 insertions, 106 deletions
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a8d03d5efb5..ed11d3866af 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
@@ -17,10 +17,14 @@ | |||
17 | */ | 17 | */ |
18 | 18 | ||
19 | #include <linux/blkdev.h> | 19 | #include <linux/blkdev.h> |
20 | #include <linux/ratelimit.h> | ||
20 | #include "ctree.h" | 21 | #include "ctree.h" |
21 | #include "volumes.h" | 22 | #include "volumes.h" |
22 | #include "disk-io.h" | 23 | #include "disk-io.h" |
23 | #include "ordered-data.h" | 24 | #include "ordered-data.h" |
25 | #include "transaction.h" | ||
26 | #include "backref.h" | ||
27 | #include "extent_io.h" | ||
24 | 28 | ||
25 | /* | 29 | /* |
26 | * This is only the first step towards a full-featured scrub. It reads all | 30 | * This is only the first step towards a full-featured scrub. It reads all |
@@ -29,15 +33,12 @@ | |||
29 | * any can be found. | 33 | * any can be found. |
30 | * | 34 | * |
31 | * Future enhancements: | 35 | * Future enhancements: |
32 | * - To enhance the performance, better read-ahead strategies for the | ||
33 | * extent-tree can be employed. | ||
34 | * - In case an unrepairable extent is encountered, track which files are | 36 | * - In case an unrepairable extent is encountered, track which files are |
35 | * affected and report them | 37 | * affected and report them |
36 | * - In case of a read error on files with nodatasum, map the file and read | 38 | * - In case of a read error on files with nodatasum, map the file and read |
37 | * the extent to trigger a writeback of the good copy | 39 | * the extent to trigger a writeback of the good copy |
38 | * - track and record media errors, throw out bad devices | 40 | * - track and record media errors, throw out bad devices |
39 | * - add a mode to also read unallocated space | 41 | * - add a mode to also read unallocated space |
40 | * - make the prefetch cancellable | ||
41 | */ | 42 | */ |
42 | 43 | ||
43 | struct scrub_bio; | 44 | struct scrub_bio; |
@@ -63,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix); | |||
63 | struct scrub_page { | 64 | struct scrub_page { |
64 | u64 flags; /* extent flags */ | 65 | u64 flags; /* extent flags */ |
65 | u64 generation; | 66 | u64 generation; |
66 | u64 mirror_num; | 67 | int mirror_num; |
67 | int have_csum; | 68 | int have_csum; |
68 | u8 csum[BTRFS_CSUM_SIZE]; | 69 | u8 csum[BTRFS_CSUM_SIZE]; |
69 | }; | 70 | }; |
@@ -87,6 +88,7 @@ struct scrub_dev { | |||
87 | int first_free; | 88 | int first_free; |
88 | int curr; | 89 | int curr; |
89 | atomic_t in_flight; | 90 | atomic_t in_flight; |
91 | atomic_t fixup_cnt; | ||
90 | spinlock_t list_lock; | 92 | spinlock_t list_lock; |
91 | wait_queue_head_t list_wait; | 93 | wait_queue_head_t list_wait; |
92 | u16 csum_size; | 94 | u16 csum_size; |
@@ -100,6 +102,27 @@ struct scrub_dev { | |||
100 | spinlock_t stat_lock; | 102 | spinlock_t stat_lock; |
101 | }; | 103 | }; |
102 | 104 | ||
105 | struct scrub_fixup_nodatasum { | ||
106 | struct scrub_dev *sdev; | ||
107 | u64 logical; | ||
108 | struct btrfs_root *root; | ||
109 | struct btrfs_work work; | ||
110 | int mirror_num; | ||
111 | }; | ||
112 | |||
113 | struct scrub_warning { | ||
114 | struct btrfs_path *path; | ||
115 | u64 extent_item_size; | ||
116 | char *scratch_buf; | ||
117 | char *msg_buf; | ||
118 | const char *errstr; | ||
119 | sector_t sector; | ||
120 | u64 logical; | ||
121 | struct btrfs_device *dev; | ||
122 | int msg_bufsize; | ||
123 | int scratch_bufsize; | ||
124 | }; | ||
125 | |||
103 | static void scrub_free_csums(struct scrub_dev *sdev) | 126 | static void scrub_free_csums(struct scrub_dev *sdev) |
104 | { | 127 | { |
105 | while (!list_empty(&sdev->csum_list)) { | 128 | while (!list_empty(&sdev->csum_list)) { |
@@ -175,14 +198,15 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev) | |||
175 | 198 | ||
176 | if (i != SCRUB_BIOS_PER_DEV-1) | 199 | if (i != SCRUB_BIOS_PER_DEV-1) |
177 | sdev->bios[i]->next_free = i + 1; | 200 | sdev->bios[i]->next_free = i + 1; |
178 | else | 201 | else |
179 | sdev->bios[i]->next_free = -1; | 202 | sdev->bios[i]->next_free = -1; |
180 | } | 203 | } |
181 | sdev->first_free = 0; | 204 | sdev->first_free = 0; |
182 | sdev->curr = -1; | 205 | sdev->curr = -1; |
183 | atomic_set(&sdev->in_flight, 0); | 206 | atomic_set(&sdev->in_flight, 0); |
207 | atomic_set(&sdev->fixup_cnt, 0); | ||
184 | atomic_set(&sdev->cancel_req, 0); | 208 | atomic_set(&sdev->cancel_req, 0); |
185 | sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); | 209 | sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy); |
186 | INIT_LIST_HEAD(&sdev->csum_list); | 210 | INIT_LIST_HEAD(&sdev->csum_list); |
187 | 211 | ||
188 | spin_lock_init(&sdev->list_lock); | 212 | spin_lock_init(&sdev->list_lock); |
@@ -195,24 +219,361 @@ nomem: | |||
195 | return ERR_PTR(-ENOMEM); | 219 | return ERR_PTR(-ENOMEM); |
196 | } | 220 | } |
197 | 221 | ||
222 | static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx) | ||
223 | { | ||
224 | u64 isize; | ||
225 | u32 nlink; | ||
226 | int ret; | ||
227 | int i; | ||
228 | struct extent_buffer *eb; | ||
229 | struct btrfs_inode_item *inode_item; | ||
230 | struct scrub_warning *swarn = ctx; | ||
231 | struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info; | ||
232 | struct inode_fs_paths *ipath = NULL; | ||
233 | struct btrfs_root *local_root; | ||
234 | struct btrfs_key root_key; | ||
235 | |||
236 | root_key.objectid = root; | ||
237 | root_key.type = BTRFS_ROOT_ITEM_KEY; | ||
238 | root_key.offset = (u64)-1; | ||
239 | local_root = btrfs_read_fs_root_no_name(fs_info, &root_key); | ||
240 | if (IS_ERR(local_root)) { | ||
241 | ret = PTR_ERR(local_root); | ||
242 | goto err; | ||
243 | } | ||
244 | |||
245 | ret = inode_item_info(inum, 0, local_root, swarn->path); | ||
246 | if (ret) { | ||
247 | btrfs_release_path(swarn->path); | ||
248 | goto err; | ||
249 | } | ||
250 | |||
251 | eb = swarn->path->nodes[0]; | ||
252 | inode_item = btrfs_item_ptr(eb, swarn->path->slots[0], | ||
253 | struct btrfs_inode_item); | ||
254 | isize = btrfs_inode_size(eb, inode_item); | ||
255 | nlink = btrfs_inode_nlink(eb, inode_item); | ||
256 | btrfs_release_path(swarn->path); | ||
257 | |||
258 | ipath = init_ipath(4096, local_root, swarn->path); | ||
259 | ret = paths_from_inode(inum, ipath); | ||
260 | |||
261 | if (ret < 0) | ||
262 | goto err; | ||
263 | |||
264 | /* | ||
265 | * we deliberately ignore the fact that ipath might have been too | ||
266 | * small to hold all of the paths here | ||
267 | */ | ||
268 | for (i = 0; i < ipath->fspath->elem_cnt; ++i) | ||
269 | printk(KERN_WARNING "btrfs: %s at logical %llu on dev " | ||
270 | "%s, sector %llu, root %llu, inode %llu, offset %llu, " | ||
271 | "length %llu, links %u (path: %s)\n", swarn->errstr, | ||
272 | swarn->logical, swarn->dev->name, | ||
273 | (unsigned long long)swarn->sector, root, inum, offset, | ||
274 | min(isize - offset, (u64)PAGE_SIZE), nlink, | ||
275 | (char *)ipath->fspath->val[i]); | ||
276 | |||
277 | free_ipath(ipath); | ||
278 | return 0; | ||
279 | |||
280 | err: | ||
281 | printk(KERN_WARNING "btrfs: %s at logical %llu on dev " | ||
282 | "%s, sector %llu, root %llu, inode %llu, offset %llu: path " | ||
283 | "resolving failed with ret=%d\n", swarn->errstr, | ||
284 | swarn->logical, swarn->dev->name, | ||
285 | (unsigned long long)swarn->sector, root, inum, offset, ret); | ||
286 | |||
287 | free_ipath(ipath); | ||
288 | return 0; | ||
289 | } | ||
290 | |||
291 | static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio, | ||
292 | int ix) | ||
293 | { | ||
294 | struct btrfs_device *dev = sbio->sdev->dev; | ||
295 | struct btrfs_fs_info *fs_info = dev->dev_root->fs_info; | ||
296 | struct btrfs_path *path; | ||
297 | struct btrfs_key found_key; | ||
298 | struct extent_buffer *eb; | ||
299 | struct btrfs_extent_item *ei; | ||
300 | struct scrub_warning swarn; | ||
301 | u32 item_size; | ||
302 | int ret; | ||
303 | u64 ref_root; | ||
304 | u8 ref_level; | ||
305 | unsigned long ptr = 0; | ||
306 | const int bufsize = 4096; | ||
307 | u64 extent_offset; | ||
308 | |||
309 | path = btrfs_alloc_path(); | ||
310 | |||
311 | swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); | ||
312 | swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); | ||
313 | swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9; | ||
314 | swarn.logical = sbio->logical + ix * PAGE_SIZE; | ||
315 | swarn.errstr = errstr; | ||
316 | swarn.dev = dev; | ||
317 | swarn.msg_bufsize = bufsize; | ||
318 | swarn.scratch_bufsize = bufsize; | ||
319 | |||
320 | if (!path || !swarn.scratch_buf || !swarn.msg_buf) | ||
321 | goto out; | ||
322 | |||
323 | ret = extent_from_logical(fs_info, swarn.logical, path, &found_key); | ||
324 | if (ret < 0) | ||
325 | goto out; | ||
326 | |||
327 | extent_offset = swarn.logical - found_key.objectid; | ||
328 | swarn.extent_item_size = found_key.offset; | ||
329 | |||
330 | eb = path->nodes[0]; | ||
331 | ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item); | ||
332 | item_size = btrfs_item_size_nr(eb, path->slots[0]); | ||
333 | |||
334 | if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) { | ||
335 | do { | ||
336 | ret = tree_backref_for_extent(&ptr, eb, ei, item_size, | ||
337 | &ref_root, &ref_level); | ||
338 | printk(KERN_WARNING "%s at logical %llu on dev %s, " | ||
339 | "sector %llu: metadata %s (level %d) in tree " | ||
340 | "%llu\n", errstr, swarn.logical, dev->name, | ||
341 | (unsigned long long)swarn.sector, | ||
342 | ref_level ? "node" : "leaf", | ||
343 | ret < 0 ? -1 : ref_level, | ||
344 | ret < 0 ? -1 : ref_root); | ||
345 | } while (ret != 1); | ||
346 | } else { | ||
347 | swarn.path = path; | ||
348 | iterate_extent_inodes(fs_info, path, found_key.objectid, | ||
349 | extent_offset, | ||
350 | scrub_print_warning_inode, &swarn); | ||
351 | } | ||
352 | |||
353 | out: | ||
354 | btrfs_free_path(path); | ||
355 | kfree(swarn.scratch_buf); | ||
356 | kfree(swarn.msg_buf); | ||
357 | } | ||
358 | |||
359 | static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx) | ||
360 | { | ||
361 | struct page *page = NULL; | ||
362 | unsigned long index; | ||
363 | struct scrub_fixup_nodatasum *fixup = ctx; | ||
364 | int ret; | ||
365 | int corrected = 0; | ||
366 | struct btrfs_key key; | ||
367 | struct inode *inode = NULL; | ||
368 | u64 end = offset + PAGE_SIZE - 1; | ||
369 | struct btrfs_root *local_root; | ||
370 | |||
371 | key.objectid = root; | ||
372 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
373 | key.offset = (u64)-1; | ||
374 | local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); | ||
375 | if (IS_ERR(local_root)) | ||
376 | return PTR_ERR(local_root); | ||
377 | |||
378 | key.type = BTRFS_INODE_ITEM_KEY; | ||
379 | key.objectid = inum; | ||
380 | key.offset = 0; | ||
381 | inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); | ||
382 | if (IS_ERR(inode)) | ||
383 | return PTR_ERR(inode); | ||
384 | |||
385 | index = offset >> PAGE_CACHE_SHIFT; | ||
386 | |||
387 | page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); | ||
388 | if (!page) { | ||
389 | ret = -ENOMEM; | ||
390 | goto out; | ||
391 | } | ||
392 | |||
393 | if (PageUptodate(page)) { | ||
394 | struct btrfs_mapping_tree *map_tree; | ||
395 | if (PageDirty(page)) { | ||
396 | /* | ||
397 | * we need to write the data to the defective sector. the | ||
398 | * data that was in that sector is not in memory, | ||
399 | * because the page was modified. we must not write the | ||
400 | * modified page to that sector. | ||
401 | * | ||
402 | * TODO: what could be done here: wait for the delalloc | ||
403 | * runner to write out that page (might involve | ||
404 | * COW) and see whether the sector is still | ||
405 | * referenced afterwards. | ||
406 | * | ||
407 | * For the time being, we'll treat this error as | ||
408 | * uncorrectable, although there is a chance that a | ||
409 | * later scrub will find the bad sector again and that | ||
410 | * there's no dirty page in memory, then. | ||
411 | */ | ||
412 | ret = -EIO; | ||
413 | goto out; | ||
414 | } | ||
415 | map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree; | ||
416 | ret = repair_io_failure(map_tree, offset, PAGE_SIZE, | ||
417 | fixup->logical, page, | ||
418 | fixup->mirror_num); | ||
419 | unlock_page(page); | ||
420 | corrected = !ret; | ||
421 | } else { | ||
422 | /* | ||
423 | * we need to get good data first. the general readpage path | ||
424 | * will call repair_io_failure for us, we just have to make | ||
425 | * sure we read the bad mirror. | ||
426 | */ | ||
427 | ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, | ||
428 | EXTENT_DAMAGED, GFP_NOFS); | ||
429 | if (ret) { | ||
430 | /* set_extent_bits should give proper error */ | ||
431 | WARN_ON(ret > 0); | ||
432 | if (ret > 0) | ||
433 | ret = -EFAULT; | ||
434 | goto out; | ||
435 | } | ||
436 | |||
437 | ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page, | ||
438 | btrfs_get_extent, | ||
439 | fixup->mirror_num); | ||
440 | wait_on_page_locked(page); | ||
441 | |||
442 | corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, | ||
443 | end, EXTENT_DAMAGED, 0, NULL); | ||
444 | if (!corrected) | ||
445 | clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end, | ||
446 | EXTENT_DAMAGED, GFP_NOFS); | ||
447 | } | ||
448 | |||
449 | out: | ||
450 | if (page) | ||
451 | put_page(page); | ||
452 | if (inode) | ||
453 | iput(inode); | ||
454 | |||
455 | if (ret < 0) | ||
456 | return ret; | ||
457 | |||
458 | if (ret == 0 && corrected) { | ||
459 | /* | ||
460 | * we only need to call readpage for one of the inodes belonging | ||
461 | * to this extent. so make iterate_extent_inodes stop | ||
462 | */ | ||
463 | return 1; | ||
464 | } | ||
465 | |||
466 | return -EIO; | ||
467 | } | ||
468 | |||
469 | static void scrub_fixup_nodatasum(struct btrfs_work *work) | ||
470 | { | ||
471 | int ret; | ||
472 | struct scrub_fixup_nodatasum *fixup; | ||
473 | struct scrub_dev *sdev; | ||
474 | struct btrfs_trans_handle *trans = NULL; | ||
475 | struct btrfs_fs_info *fs_info; | ||
476 | struct btrfs_path *path; | ||
477 | int uncorrectable = 0; | ||
478 | |||
479 | fixup = container_of(work, struct scrub_fixup_nodatasum, work); | ||
480 | sdev = fixup->sdev; | ||
481 | fs_info = fixup->root->fs_info; | ||
482 | |||
483 | path = btrfs_alloc_path(); | ||
484 | if (!path) { | ||
485 | spin_lock(&sdev->stat_lock); | ||
486 | ++sdev->stat.malloc_errors; | ||
487 | spin_unlock(&sdev->stat_lock); | ||
488 | uncorrectable = 1; | ||
489 | goto out; | ||
490 | } | ||
491 | |||
492 | trans = btrfs_join_transaction(fixup->root); | ||
493 | if (IS_ERR(trans)) { | ||
494 | uncorrectable = 1; | ||
495 | goto out; | ||
496 | } | ||
497 | |||
498 | /* | ||
499 | * the idea is to trigger a regular read through the standard path. we | ||
500 | * read a page from the (failed) logical address by specifying the | ||
501 | * corresponding copynum of the failed sector. thus, that readpage is | ||
502 | * expected to fail. | ||
503 | * that is the point where on-the-fly error correction will kick in | ||
504 | * (once it's finished) and rewrite the failed sector if a good copy | ||
505 | * can be found. | ||
506 | */ | ||
507 | ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info, | ||
508 | path, scrub_fixup_readpage, | ||
509 | fixup); | ||
510 | if (ret < 0) { | ||
511 | uncorrectable = 1; | ||
512 | goto out; | ||
513 | } | ||
514 | WARN_ON(ret != 1); | ||
515 | |||
516 | spin_lock(&sdev->stat_lock); | ||
517 | ++sdev->stat.corrected_errors; | ||
518 | spin_unlock(&sdev->stat_lock); | ||
519 | |||
520 | out: | ||
521 | if (trans && !IS_ERR(trans)) | ||
522 | btrfs_end_transaction(trans, fixup->root); | ||
523 | if (uncorrectable) { | ||
524 | spin_lock(&sdev->stat_lock); | ||
525 | ++sdev->stat.uncorrectable_errors; | ||
526 | spin_unlock(&sdev->stat_lock); | ||
527 | printk_ratelimited(KERN_ERR "btrfs: unable to fixup " | ||
528 | "(nodatasum) error at logical %llu\n", | ||
529 | fixup->logical); | ||
530 | } | ||
531 | |||
532 | btrfs_free_path(path); | ||
533 | kfree(fixup); | ||
534 | |||
535 | /* see caller for why we pretend to be paused in the scrub counters */ | ||
536 | mutex_lock(&fs_info->scrub_lock); | ||
537 | atomic_dec(&fs_info->scrubs_running); | ||
538 | atomic_dec(&fs_info->scrubs_paused); | ||
539 | mutex_unlock(&fs_info->scrub_lock); | ||
540 | atomic_dec(&sdev->fixup_cnt); | ||
541 | wake_up(&fs_info->scrub_pause_wait); | ||
542 | wake_up(&sdev->list_wait); | ||
543 | } | ||
544 | |||
198 | /* | 545 | /* |
199 | * scrub_recheck_error gets called when either verification of the page | 546 | * scrub_recheck_error gets called when either verification of the page |
200 | * failed or the bio failed to read, e.g. with EIO. In the latter case, | 547 | * failed or the bio failed to read, e.g. with EIO. In the latter case, |
201 | * recheck_error gets called for every page in the bio, even though only | 548 | * recheck_error gets called for every page in the bio, even though only |
202 | * one may be bad | 549 | * one may be bad |
203 | */ | 550 | */ |
204 | static void scrub_recheck_error(struct scrub_bio *sbio, int ix) | 551 | static int scrub_recheck_error(struct scrub_bio *sbio, int ix) |
205 | { | 552 | { |
553 | struct scrub_dev *sdev = sbio->sdev; | ||
554 | u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; | ||
555 | static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, | ||
556 | DEFAULT_RATELIMIT_BURST); | ||
557 | |||
206 | if (sbio->err) { | 558 | if (sbio->err) { |
207 | if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, | 559 | if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, |
208 | (sbio->physical + ix * PAGE_SIZE) >> 9, | ||
209 | sbio->bio->bi_io_vec[ix].bv_page) == 0) { | 560 | sbio->bio->bi_io_vec[ix].bv_page) == 0) { |
210 | if (scrub_fixup_check(sbio, ix) == 0) | 561 | if (scrub_fixup_check(sbio, ix) == 0) |
211 | return; | 562 | return 0; |
212 | } | 563 | } |
564 | if (__ratelimit(&_rs)) | ||
565 | scrub_print_warning("i/o error", sbio, ix); | ||
566 | } else { | ||
567 | if (__ratelimit(&_rs)) | ||
568 | scrub_print_warning("checksum error", sbio, ix); | ||
213 | } | 569 | } |
214 | 570 | ||
571 | spin_lock(&sdev->stat_lock); | ||
572 | ++sdev->stat.read_errors; | ||
573 | spin_unlock(&sdev->stat_lock); | ||
574 | |||
215 | scrub_fixup(sbio, ix); | 575 | scrub_fixup(sbio, ix); |
576 | return 1; | ||
216 | } | 577 | } |
217 | 578 | ||
218 | static int scrub_fixup_check(struct scrub_bio *sbio, int ix) | 579 | static int scrub_fixup_check(struct scrub_bio *sbio, int ix) |
@@ -250,7 +611,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) | |||
250 | struct scrub_dev *sdev = sbio->sdev; | 611 | struct scrub_dev *sdev = sbio->sdev; |
251 | struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; | 612 | struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info; |
252 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | 613 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; |
253 | struct btrfs_multi_bio *multi = NULL; | 614 | struct btrfs_bio *bbio = NULL; |
615 | struct scrub_fixup_nodatasum *fixup; | ||
254 | u64 logical = sbio->logical + ix * PAGE_SIZE; | 616 | u64 logical = sbio->logical + ix * PAGE_SIZE; |
255 | u64 length; | 617 | u64 length; |
256 | int i; | 618 | int i; |
@@ -259,38 +621,57 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) | |||
259 | 621 | ||
260 | if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && | 622 | if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) && |
261 | (sbio->spag[ix].have_csum == 0)) { | 623 | (sbio->spag[ix].have_csum == 0)) { |
624 | fixup = kzalloc(sizeof(*fixup), GFP_NOFS); | ||
625 | if (!fixup) | ||
626 | goto uncorrectable; | ||
627 | fixup->sdev = sdev; | ||
628 | fixup->logical = logical; | ||
629 | fixup->root = fs_info->extent_root; | ||
630 | fixup->mirror_num = sbio->spag[ix].mirror_num; | ||
262 | /* | 631 | /* |
263 | * nodatasum, don't try to fix anything | 632 | * increment scrubs_running to prevent cancel requests from |
264 | * FIXME: we can do better, open the inode and trigger a | 633 | * completing as long as a fixup worker is running. we must also |
265 | * writeback | 634 | * increment scrubs_paused to prevent deadlocking on pause |
635 | * requests used for transactions commits (as the worker uses a | ||
636 | * transaction context). it is safe to regard the fixup worker | ||
637 | * as paused for all matters practical. effectively, we only | ||
638 | * avoid cancellation requests from completing. | ||
266 | */ | 639 | */ |
267 | goto uncorrectable; | 640 | mutex_lock(&fs_info->scrub_lock); |
641 | atomic_inc(&fs_info->scrubs_running); | ||
642 | atomic_inc(&fs_info->scrubs_paused); | ||
643 | mutex_unlock(&fs_info->scrub_lock); | ||
644 | atomic_inc(&sdev->fixup_cnt); | ||
645 | fixup->work.func = scrub_fixup_nodatasum; | ||
646 | btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work); | ||
647 | return; | ||
268 | } | 648 | } |
269 | 649 | ||
270 | length = PAGE_SIZE; | 650 | length = PAGE_SIZE; |
271 | ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, | 651 | ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, |
272 | &multi, 0); | 652 | &bbio, 0); |
273 | if (ret || !multi || length < PAGE_SIZE) { | 653 | if (ret || !bbio || length < PAGE_SIZE) { |
274 | printk(KERN_ERR | 654 | printk(KERN_ERR |
275 | "scrub_fixup: btrfs_map_block failed us for %llu\n", | 655 | "scrub_fixup: btrfs_map_block failed us for %llu\n", |
276 | (unsigned long long)logical); | 656 | (unsigned long long)logical); |
277 | WARN_ON(1); | 657 | WARN_ON(1); |
658 | kfree(bbio); | ||
278 | return; | 659 | return; |
279 | } | 660 | } |
280 | 661 | ||
281 | if (multi->num_stripes == 1) | 662 | if (bbio->num_stripes == 1) |
282 | /* there aren't any replicas */ | 663 | /* there aren't any replicas */ |
283 | goto uncorrectable; | 664 | goto uncorrectable; |
284 | 665 | ||
285 | /* | 666 | /* |
286 | * first find a good copy | 667 | * first find a good copy |
287 | */ | 668 | */ |
288 | for (i = 0; i < multi->num_stripes; ++i) { | 669 | for (i = 0; i < bbio->num_stripes; ++i) { |
289 | if (i == sbio->spag[ix].mirror_num) | 670 | if (i + 1 == sbio->spag[ix].mirror_num) |
290 | continue; | 671 | continue; |
291 | 672 | ||
292 | if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev, | 673 | if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev, |
293 | multi->stripes[i].physical >> 9, | 674 | bbio->stripes[i].physical >> 9, |
294 | sbio->bio->bi_io_vec[ix].bv_page)) { | 675 | sbio->bio->bi_io_vec[ix].bv_page)) { |
295 | /* I/O-error, this is not a good copy */ | 676 | /* I/O-error, this is not a good copy */ |
296 | continue; | 677 | continue; |
@@ -299,7 +680,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) | |||
299 | if (scrub_fixup_check(sbio, ix) == 0) | 680 | if (scrub_fixup_check(sbio, ix) == 0) |
300 | break; | 681 | break; |
301 | } | 682 | } |
302 | if (i == multi->num_stripes) | 683 | if (i == bbio->num_stripes) |
303 | goto uncorrectable; | 684 | goto uncorrectable; |
304 | 685 | ||
305 | if (!sdev->readonly) { | 686 | if (!sdev->readonly) { |
@@ -314,25 +695,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix) | |||
314 | } | 695 | } |
315 | } | 696 | } |
316 | 697 | ||
317 | kfree(multi); | 698 | kfree(bbio); |
318 | spin_lock(&sdev->stat_lock); | 699 | spin_lock(&sdev->stat_lock); |
319 | ++sdev->stat.corrected_errors; | 700 | ++sdev->stat.corrected_errors; |
320 | spin_unlock(&sdev->stat_lock); | 701 | spin_unlock(&sdev->stat_lock); |
321 | 702 | ||
322 | if (printk_ratelimit()) | 703 | printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n", |
323 | printk(KERN_ERR "btrfs: fixed up at %llu\n", | 704 | (unsigned long long)logical); |
324 | (unsigned long long)logical); | ||
325 | return; | 705 | return; |
326 | 706 | ||
327 | uncorrectable: | 707 | uncorrectable: |
328 | kfree(multi); | 708 | kfree(bbio); |
329 | spin_lock(&sdev->stat_lock); | 709 | spin_lock(&sdev->stat_lock); |
330 | ++sdev->stat.uncorrectable_errors; | 710 | ++sdev->stat.uncorrectable_errors; |
331 | spin_unlock(&sdev->stat_lock); | 711 | spin_unlock(&sdev->stat_lock); |
332 | 712 | ||
333 | if (printk_ratelimit()) | 713 | printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at " |
334 | printk(KERN_ERR "btrfs: unable to fixup at %llu\n", | 714 | "logical %llu\n", (unsigned long long)logical); |
335 | (unsigned long long)logical); | ||
336 | } | 715 | } |
337 | 716 | ||
338 | static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, | 717 | static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector, |
@@ -382,8 +761,14 @@ static void scrub_checksum(struct btrfs_work *work) | |||
382 | int ret; | 761 | int ret; |
383 | 762 | ||
384 | if (sbio->err) { | 763 | if (sbio->err) { |
764 | ret = 0; | ||
385 | for (i = 0; i < sbio->count; ++i) | 765 | for (i = 0; i < sbio->count; ++i) |
386 | scrub_recheck_error(sbio, i); | 766 | ret |= scrub_recheck_error(sbio, i); |
767 | if (!ret) { | ||
768 | spin_lock(&sdev->stat_lock); | ||
769 | ++sdev->stat.unverified_errors; | ||
770 | spin_unlock(&sdev->stat_lock); | ||
771 | } | ||
387 | 772 | ||
388 | sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); | 773 | sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1); |
389 | sbio->bio->bi_flags |= 1 << BIO_UPTODATE; | 774 | sbio->bio->bi_flags |= 1 << BIO_UPTODATE; |
@@ -396,10 +781,6 @@ static void scrub_checksum(struct btrfs_work *work) | |||
396 | bi->bv_offset = 0; | 781 | bi->bv_offset = 0; |
397 | bi->bv_len = PAGE_SIZE; | 782 | bi->bv_len = PAGE_SIZE; |
398 | } | 783 | } |
399 | |||
400 | spin_lock(&sdev->stat_lock); | ||
401 | ++sdev->stat.read_errors; | ||
402 | spin_unlock(&sdev->stat_lock); | ||
403 | goto out; | 784 | goto out; |
404 | } | 785 | } |
405 | for (i = 0; i < sbio->count; ++i) { | 786 | for (i = 0; i < sbio->count; ++i) { |
@@ -420,8 +801,14 @@ static void scrub_checksum(struct btrfs_work *work) | |||
420 | WARN_ON(1); | 801 | WARN_ON(1); |
421 | } | 802 | } |
422 | kunmap_atomic(buffer, KM_USER0); | 803 | kunmap_atomic(buffer, KM_USER0); |
423 | if (ret) | 804 | if (ret) { |
424 | scrub_recheck_error(sbio, i); | 805 | ret = scrub_recheck_error(sbio, i); |
806 | if (!ret) { | ||
807 | spin_lock(&sdev->stat_lock); | ||
808 | ++sdev->stat.unverified_errors; | ||
809 | spin_unlock(&sdev->stat_lock); | ||
810 | } | ||
811 | } | ||
425 | } | 812 | } |
426 | 813 | ||
427 | out: | 814 | out: |
@@ -604,7 +991,7 @@ nomem: | |||
604 | } | 991 | } |
605 | 992 | ||
606 | static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, | 993 | static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len, |
607 | u64 physical, u64 flags, u64 gen, u64 mirror_num, | 994 | u64 physical, u64 flags, u64 gen, int mirror_num, |
608 | u8 *csum, int force) | 995 | u8 *csum, int force) |
609 | { | 996 | { |
610 | struct scrub_bio *sbio; | 997 | struct scrub_bio *sbio; |
@@ -701,7 +1088,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len, | |||
701 | 1088 | ||
702 | /* scrub extent tries to collect up to 64 kB for each bio */ | 1089 | /* scrub extent tries to collect up to 64 kB for each bio */ |
703 | static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, | 1090 | static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len, |
704 | u64 physical, u64 flags, u64 gen, u64 mirror_num) | 1091 | u64 physical, u64 flags, u64 gen, int mirror_num) |
705 | { | 1092 | { |
706 | int ret; | 1093 | int ret; |
707 | u8 csum[BTRFS_CSUM_SIZE]; | 1094 | u8 csum[BTRFS_CSUM_SIZE]; |
@@ -741,13 +1128,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
741 | int slot; | 1128 | int slot; |
742 | int i; | 1129 | int i; |
743 | u64 nstripes; | 1130 | u64 nstripes; |
744 | int start_stripe; | ||
745 | struct extent_buffer *l; | 1131 | struct extent_buffer *l; |
746 | struct btrfs_key key; | 1132 | struct btrfs_key key; |
747 | u64 physical; | 1133 | u64 physical; |
748 | u64 logical; | 1134 | u64 logical; |
749 | u64 generation; | 1135 | u64 generation; |
750 | u64 mirror_num; | 1136 | int mirror_num; |
1137 | struct reada_control *reada1; | ||
1138 | struct reada_control *reada2; | ||
1139 | struct btrfs_key key_start; | ||
1140 | struct btrfs_key key_end; | ||
751 | 1141 | ||
752 | u64 increment = map->stripe_len; | 1142 | u64 increment = map->stripe_len; |
753 | u64 offset; | 1143 | u64 offset; |
@@ -758,102 +1148,88 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev, | |||
758 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 1148 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
759 | offset = map->stripe_len * num; | 1149 | offset = map->stripe_len * num; |
760 | increment = map->stripe_len * map->num_stripes; | 1150 | increment = map->stripe_len * map->num_stripes; |
761 | mirror_num = 0; | 1151 | mirror_num = 1; |
762 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { | 1152 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { |
763 | int factor = map->num_stripes / map->sub_stripes; | 1153 | int factor = map->num_stripes / map->sub_stripes; |
764 | offset = map->stripe_len * (num / map->sub_stripes); | 1154 | offset = map->stripe_len * (num / map->sub_stripes); |
765 | increment = map->stripe_len * factor; | 1155 | increment = map->stripe_len * factor; |
766 | mirror_num = num % map->sub_stripes; | 1156 | mirror_num = num % map->sub_stripes + 1; |
767 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { | 1157 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { |
768 | increment = map->stripe_len; | 1158 | increment = map->stripe_len; |
769 | mirror_num = num % map->num_stripes; | 1159 | mirror_num = num % map->num_stripes + 1; |
770 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { | 1160 | } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { |
771 | increment = map->stripe_len; | 1161 | increment = map->stripe_len; |
772 | mirror_num = num % map->num_stripes; | 1162 | mirror_num = num % map->num_stripes + 1; |
773 | } else { | 1163 | } else { |
774 | increment = map->stripe_len; | 1164 | increment = map->stripe_len; |
775 | mirror_num = 0; | 1165 | mirror_num = 1; |
776 | } | 1166 | } |
777 | 1167 | ||
778 | path = btrfs_alloc_path(); | 1168 | path = btrfs_alloc_path(); |
779 | if (!path) | 1169 | if (!path) |
780 | return -ENOMEM; | 1170 | return -ENOMEM; |
781 | 1171 | ||
782 | path->reada = 2; | ||
783 | path->search_commit_root = 1; | 1172 | path->search_commit_root = 1; |
784 | path->skip_locking = 1; | 1173 | path->skip_locking = 1; |
785 | 1174 | ||
786 | /* | 1175 | /* |
787 | * find all extents for each stripe and just read them to get | 1176 | * trigger the readahead for extent tree csum tree and wait for |
788 | * them into the page cache | 1177 | * completion. During readahead, the scrub is officially paused |
789 | * FIXME: we can do better. build a more intelligent prefetching | 1178 | * to not hold off transaction commits |
790 | */ | 1179 | */ |
791 | logical = base + offset; | 1180 | logical = base + offset; |
792 | physical = map->stripes[num].physical; | ||
793 | ret = 0; | ||
794 | for (i = 0; i < nstripes; ++i) { | ||
795 | key.objectid = logical; | ||
796 | key.type = BTRFS_EXTENT_ITEM_KEY; | ||
797 | key.offset = (u64)0; | ||
798 | |||
799 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
800 | if (ret < 0) | ||
801 | goto out_noplug; | ||
802 | |||
803 | /* | ||
804 | * we might miss half an extent here, but that doesn't matter, | ||
805 | * as it's only the prefetch | ||
806 | */ | ||
807 | while (1) { | ||
808 | l = path->nodes[0]; | ||
809 | slot = path->slots[0]; | ||
810 | if (slot >= btrfs_header_nritems(l)) { | ||
811 | ret = btrfs_next_leaf(root, path); | ||
812 | if (ret == 0) | ||
813 | continue; | ||
814 | if (ret < 0) | ||
815 | goto out_noplug; | ||
816 | 1181 | ||
817 | break; | 1182 | wait_event(sdev->list_wait, |
818 | } | 1183 | atomic_read(&sdev->in_flight) == 0); |
819 | btrfs_item_key_to_cpu(l, &key, slot); | 1184 | atomic_inc(&fs_info->scrubs_paused); |
1185 | wake_up(&fs_info->scrub_pause_wait); | ||
820 | 1186 | ||
821 | if (key.objectid >= logical + map->stripe_len) | 1187 | /* FIXME it might be better to start readahead at commit root */ |
822 | break; | 1188 | key_start.objectid = logical; |
1189 | key_start.type = BTRFS_EXTENT_ITEM_KEY; | ||
1190 | key_start.offset = (u64)0; | ||
1191 | key_end.objectid = base + offset + nstripes * increment; | ||
1192 | key_end.type = BTRFS_EXTENT_ITEM_KEY; | ||
1193 | key_end.offset = (u64)0; | ||
1194 | reada1 = btrfs_reada_add(root, &key_start, &key_end); | ||
1195 | |||
1196 | key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID; | ||
1197 | key_start.type = BTRFS_EXTENT_CSUM_KEY; | ||
1198 | key_start.offset = logical; | ||
1199 | key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID; | ||
1200 | key_end.type = BTRFS_EXTENT_CSUM_KEY; | ||
1201 | key_end.offset = base + offset + nstripes * increment; | ||
1202 | reada2 = btrfs_reada_add(csum_root, &key_start, &key_end); | ||
1203 | |||
1204 | if (!IS_ERR(reada1)) | ||
1205 | btrfs_reada_wait(reada1); | ||
1206 | if (!IS_ERR(reada2)) | ||
1207 | btrfs_reada_wait(reada2); | ||
823 | 1208 | ||
824 | path->slots[0]++; | 1209 | mutex_lock(&fs_info->scrub_lock); |
825 | } | 1210 | while (atomic_read(&fs_info->scrub_pause_req)) { |
826 | btrfs_release_path(path); | 1211 | mutex_unlock(&fs_info->scrub_lock); |
827 | logical += increment; | 1212 | wait_event(fs_info->scrub_pause_wait, |
828 | physical += map->stripe_len; | 1213 | atomic_read(&fs_info->scrub_pause_req) == 0); |
829 | cond_resched(); | 1214 | mutex_lock(&fs_info->scrub_lock); |
830 | } | 1215 | } |
1216 | atomic_dec(&fs_info->scrubs_paused); | ||
1217 | mutex_unlock(&fs_info->scrub_lock); | ||
1218 | wake_up(&fs_info->scrub_pause_wait); | ||
831 | 1219 | ||
832 | /* | 1220 | /* |
833 | * collect all data csums for the stripe to avoid seeking during | 1221 | * collect all data csums for the stripe to avoid seeking during |
834 | * the scrub. This might currently (crc32) end up to be about 1MB | 1222 | * the scrub. This might currently (crc32) end up to be about 1MB |
835 | */ | 1223 | */ |
836 | start_stripe = 0; | ||
837 | blk_start_plug(&plug); | 1224 | blk_start_plug(&plug); |
838 | again: | ||
839 | logical = base + offset + start_stripe * increment; | ||
840 | for (i = start_stripe; i < nstripes; ++i) { | ||
841 | ret = btrfs_lookup_csums_range(csum_root, logical, | ||
842 | logical + map->stripe_len - 1, | ||
843 | &sdev->csum_list, 1); | ||
844 | if (ret) | ||
845 | goto out; | ||
846 | 1225 | ||
847 | logical += increment; | ||
848 | cond_resched(); | ||
849 | } | ||
850 | /* | 1226 | /* |
851 | * now find all extents for each stripe and scrub them | 1227 | * now find all extents for each stripe and scrub them |
852 | */ | 1228 | */ |
853 | logical = base + offset + start_stripe * increment; | 1229 | logical = base + offset; |
854 | physical = map->stripes[num].physical + start_stripe * map->stripe_len; | 1230 | physical = map->stripes[num].physical; |
855 | ret = 0; | 1231 | ret = 0; |
856 | for (i = start_stripe; i < nstripes; ++i) { | 1232 | for (i = 0; i < nstripes; ++i) { |
857 | /* | 1233 | /* |
858 | * canceled? | 1234 | * canceled? |
859 | */ | 1235 | */ |
@@ -882,11 +1258,14 @@ again: | |||
882 | atomic_dec(&fs_info->scrubs_paused); | 1258 | atomic_dec(&fs_info->scrubs_paused); |
883 | mutex_unlock(&fs_info->scrub_lock); | 1259 | mutex_unlock(&fs_info->scrub_lock); |
884 | wake_up(&fs_info->scrub_pause_wait); | 1260 | wake_up(&fs_info->scrub_pause_wait); |
885 | scrub_free_csums(sdev); | ||
886 | start_stripe = i; | ||
887 | goto again; | ||
888 | } | 1261 | } |
889 | 1262 | ||
1263 | ret = btrfs_lookup_csums_range(csum_root, logical, | ||
1264 | logical + map->stripe_len - 1, | ||
1265 | &sdev->csum_list, 1); | ||
1266 | if (ret) | ||
1267 | goto out; | ||
1268 | |||
890 | key.objectid = logical; | 1269 | key.objectid = logical; |
891 | key.type = BTRFS_EXTENT_ITEM_KEY; | 1270 | key.type = BTRFS_EXTENT_ITEM_KEY; |
892 | key.offset = (u64)0; | 1271 | key.offset = (u64)0; |
@@ -982,7 +1361,6 @@ next: | |||
982 | 1361 | ||
983 | out: | 1362 | out: |
984 | blk_finish_plug(&plug); | 1363 | blk_finish_plug(&plug); |
985 | out_noplug: | ||
986 | btrfs_free_path(path); | 1364 | btrfs_free_path(path); |
987 | return ret < 0 ? ret : 0; | 1365 | return ret < 0 ? ret : 0; |
988 | } | 1366 | } |
@@ -1253,10 +1631,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end, | |||
1253 | ret = scrub_enumerate_chunks(sdev, start, end); | 1631 | ret = scrub_enumerate_chunks(sdev, start, end); |
1254 | 1632 | ||
1255 | wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); | 1633 | wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0); |
1256 | |||
1257 | atomic_dec(&fs_info->scrubs_running); | 1634 | atomic_dec(&fs_info->scrubs_running); |
1258 | wake_up(&fs_info->scrub_pause_wait); | 1635 | wake_up(&fs_info->scrub_pause_wait); |
1259 | 1636 | ||
1637 | wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0); | ||
1638 | |||
1260 | if (progress) | 1639 | if (progress) |
1261 | memcpy(progress, &sdev->stat, sizeof(*progress)); | 1640 | memcpy(progress, &sdev->stat, sizeof(*progress)); |
1262 | 1641 | ||