Diffstat (limited to 'fs/btrfs/scrub.c')
 -rw-r--r--  fs/btrfs/scrub.c | 476
 1 file changed, 433 insertions(+), 43 deletions(-)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 5bc4ec827b3d..94cd3a19e9c8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,14 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/ratelimit.h>
 #include "ctree.h"
 #include "volumes.h"
 #include "disk-io.h"
 #include "ordered-data.h"
+#include "transaction.h"
+#include "backref.h"
+#include "extent_io.h"
 
 /*
  * This is only the first step towards a full-featured scrub. It reads all
@@ -60,7 +64,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
 struct scrub_page {
         u64             flags;                  /* extent flags */
         u64             generation;
-        u64             mirror_num;
+        int             mirror_num;
         int             have_csum;
         u8              csum[BTRFS_CSUM_SIZE];
 };
@@ -84,6 +88,7 @@ struct scrub_dev {
         int             first_free;
         int             curr;
         atomic_t        in_flight;
+        atomic_t        fixup_cnt;
         spinlock_t      list_lock;
         wait_queue_head_t list_wait;
         u16             csum_size;
@@ -97,6 +102,27 @@ struct scrub_dev {
         spinlock_t      stat_lock;
 };
 
+struct scrub_fixup_nodatasum {
+        struct scrub_dev        *sdev;
+        u64                     logical;
+        struct btrfs_root       *root;
+        struct btrfs_work       work;
+        int                     mirror_num;
+};
+
+struct scrub_warning {
+        struct btrfs_path       *path;
+        u64                     extent_item_size;
+        char                    *scratch_buf;
+        char                    *msg_buf;
+        const char              *errstr;
+        sector_t                sector;
+        u64                     logical;
+        struct btrfs_device     *dev;
+        int                     msg_bufsize;
+        int                     scratch_bufsize;
+};
+
 static void scrub_free_csums(struct scrub_dev *sdev)
 {
         while (!list_empty(&sdev->csum_list)) {
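Annotation: both new structs are callback contexts rather than long-lived state. scrub_warning travels as the `void *ctx` argument of scrub_print_warning_inode() via iterate_extent_inodes(), and scrub_fixup_nodatasum reaches its worker through container_of() on the embedded btrfs_work (both visible in the hunks below). A minimal user-space sketch of the first pattern; the iterator and inode number are stand-ins, not btrfs API:

```c
#include <stdio.h>

struct warn_ctx { const char *errstr; unsigned long long logical; };

/* models scrub_print_warning_inode(): recovers typed context from void * */
static int per_inode(unsigned long long inum, void *ctx)
{
        struct warn_ctx *w = ctx;
        printf("%s at %llu, inode %llu\n", w->errstr, w->logical, inum);
        return 0;
}

/* stand-in iterator; the real one walks extent backrefs */
static void iterate(int (*cb)(unsigned long long, void *), void *ctx)
{
        (void)cb(257ULL, ctx);
}

int main(void)
{
        struct warn_ctx w = { "checksum error", 1106247680ULL };
        iterate(per_inode, &w);
        return 0;
}
```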
@@ -172,12 +198,13 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 
                 if (i != SCRUB_BIOS_PER_DEV-1)
                         sdev->bios[i]->next_free = i + 1;
                 else
                         sdev->bios[i]->next_free = -1;
         }
         sdev->first_free = 0;
         sdev->curr = -1;
         atomic_set(&sdev->in_flight, 0);
+        atomic_set(&sdev->fixup_cnt, 0);
         atomic_set(&sdev->cancel_req, 0);
         sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
         INIT_LIST_HEAD(&sdev->csum_list);
@@ -192,24 +219,361 @@ nomem:
         return ERR_PTR(-ENOMEM);
 }
 
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+{
+        u64 isize;
+        u32 nlink;
+        int ret;
+        int i;
+        struct extent_buffer *eb;
+        struct btrfs_inode_item *inode_item;
+        struct scrub_warning *swarn = ctx;
+        struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
+        struct inode_fs_paths *ipath = NULL;
+        struct btrfs_root *local_root;
+        struct btrfs_key root_key;
+
+        root_key.objectid = root;
+        root_key.type = BTRFS_ROOT_ITEM_KEY;
+        root_key.offset = (u64)-1;
+        local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+        if (IS_ERR(local_root)) {
+                ret = PTR_ERR(local_root);
+                goto err;
+        }
+
+        ret = inode_item_info(inum, 0, local_root, swarn->path);
+        if (ret) {
+                btrfs_release_path(swarn->path);
+                goto err;
+        }
+
+        eb = swarn->path->nodes[0];
+        inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
+                                    struct btrfs_inode_item);
+        isize = btrfs_inode_size(eb, inode_item);
+        nlink = btrfs_inode_nlink(eb, inode_item);
+        btrfs_release_path(swarn->path);
+
+        ipath = init_ipath(4096, local_root, swarn->path);
+        ret = paths_from_inode(inum, ipath);
+
+        if (ret < 0)
+                goto err;
+
+        /*
+         * we deliberately ignore the fact that ipath might have been too
+         * small to hold all of the paths here
+         */
+        for (i = 0; i < ipath->fspath->elem_cnt; ++i)
+                printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+                        "%s, sector %llu, root %llu, inode %llu, offset %llu, "
+                        "length %llu, links %u (path: %s)\n", swarn->errstr,
+                        swarn->logical, swarn->dev->name,
+                        (unsigned long long)swarn->sector, root, inum, offset,
+                        min(isize - offset, (u64)PAGE_SIZE), nlink,
+                        ipath->fspath->str[i]);
+
+        free_ipath(ipath);
+        return 0;
+
+err:
+        printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+                "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
+                "resolving failed with ret=%d\n", swarn->errstr,
+                swarn->logical, swarn->dev->name,
+                (unsigned long long)swarn->sector, root, inum, offset, ret);
+
+        free_ipath(ipath);
+        return 0;
+}
+
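Annotation: scrub_print_warning_inode() resolves a (root, inode, offset) triple back to file paths via the new backref ipath helpers and prints one warning per resolved path. As a rough illustration of the message this produces — only the format string is taken from the patch, every value below is invented — a user-space rendering:

```c
#include <stdio.h>

int main(void)
{
        /* all values are made up; the format mirrors the printk() above */
        printf("btrfs: %s at logical %llu on dev %s, sector %llu, "
               "root %llu, inode %llu, offset %llu, length %llu, links %u "
               "(path: %s)\n",
               "checksum error", 1106247680ULL, "/dev/sdb", 2160640ULL,
               5ULL, 257ULL, 0ULL, 4096ULL, 1U, "dir/file.txt");
        return 0;
}
```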
+static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
+                                int ix)
+{
+        struct btrfs_device *dev = sbio->sdev->dev;
+        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+        struct btrfs_path *path;
+        struct btrfs_key found_key;
+        struct extent_buffer *eb;
+        struct btrfs_extent_item *ei;
+        struct scrub_warning swarn;
+        u32 item_size;
+        int ret;
+        u64 ref_root;
+        u8 ref_level;
+        unsigned long ptr = 0;
+        const int bufsize = 4096;
+        u64 extent_offset;
+
+        path = btrfs_alloc_path();
+
+        swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
+        swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
+        swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+        swarn.logical = sbio->logical + ix * PAGE_SIZE;
+        swarn.errstr = errstr;
+        swarn.dev = dev;
+        swarn.msg_bufsize = bufsize;
+        swarn.scratch_bufsize = bufsize;
+
+        if (!path || !swarn.scratch_buf || !swarn.msg_buf)
+                goto out;
+
+        ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
+        if (ret < 0)
+                goto out;
+
+        extent_offset = swarn.logical - found_key.objectid;
+        swarn.extent_item_size = found_key.offset;
+
+        eb = path->nodes[0];
+        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+        item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+                do {
+                        ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
+                                                      &ref_root, &ref_level);
+                        printk(KERN_WARNING "%s at logical %llu on dev %s, "
+                                "sector %llu: metadata %s (level %d) in tree "
+                                "%llu\n", errstr, swarn.logical, dev->name,
+                                (unsigned long long)swarn.sector,
+                                ref_level ? "node" : "leaf",
+                                ret < 0 ? -1 : ref_level,
+                                ret < 0 ? -1 : ref_root);
+                } while (ret != 1);
+        } else {
+                swarn.path = path;
+                iterate_extent_inodes(fs_info, path, found_key.objectid,
+                                      extent_offset,
+                                      scrub_print_warning_inode, &swarn);
+        }
+
+out:
+        btrfs_free_path(path);
+        kfree(swarn.scratch_buf);
+        kfree(swarn.msg_buf);
+}
+
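Annotation: scrub_print_warning() branches on the return value of extent_from_logical(), which — judging from the `ret & BTRFS_EXTENT_FLAG_TREE_BLOCK` test — returns the extent flags on success and a negative errno on failure. A minimal sketch of that flags-in-return idiom; the flag values match btrfs's on-disk definitions, but the lookup function is a stand-in:

```c
#include <stdio.h>

#define BTRFS_EXTENT_FLAG_DATA          (1ULL << 0)
#define BTRFS_EXTENT_FLAG_TREE_BLOCK    (1ULL << 1)

/* stand-in for extent_from_logical(): flags on success, <0 on error */
static long long fake_extent_from_logical(int is_metadata)
{
        return is_metadata ? BTRFS_EXTENT_FLAG_TREE_BLOCK
                           : BTRFS_EXTENT_FLAG_DATA;
}

int main(void)
{
        long long ret = fake_extent_from_logical(1);

        if (ret < 0)
                return 1;       /* lookup failed */
        if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
                puts("metadata extent: walk tree backrefs");
        else
                puts("data extent: iterate inodes referencing it");
        return 0;
}
```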
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+{
+        struct page *page = NULL;
+        unsigned long index;
+        struct scrub_fixup_nodatasum *fixup = ctx;
+        int ret;
+        int corrected = 0;
+        struct btrfs_key key;
+        struct inode *inode = NULL;
+        u64 end = offset + PAGE_SIZE - 1;
+        struct btrfs_root *local_root;
+
+        key.objectid = root;
+        key.type = BTRFS_ROOT_ITEM_KEY;
+        key.offset = (u64)-1;
+        local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
+        if (IS_ERR(local_root))
+                return PTR_ERR(local_root);
+
+        key.type = BTRFS_INODE_ITEM_KEY;
+        key.objectid = inum;
+        key.offset = 0;
+        inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
+        if (IS_ERR(inode))
+                return PTR_ERR(inode);
+
+        index = offset >> PAGE_CACHE_SHIFT;
+
+        page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+        if (!page) {
+                ret = -ENOMEM;
+                goto out;
+        }
+
+        if (PageUptodate(page)) {
+                struct btrfs_mapping_tree *map_tree;
+                if (PageDirty(page)) {
+                        /*
+                         * we need to write the data to the defect sector. the
+                         * data that was in that sector is not in memory,
+                         * because the page was modified. we must not write the
+                         * modified page to that sector.
+                         *
+                         * TODO: what could be done here: wait for the delalloc
+                         *       runner to write out that page (might involve
+                         *       COW) and see whether the sector is still
+                         *       referenced afterwards.
+                         *
+                         * For the time being, we'll treat this error as
+                         * uncorrectable, although there is a chance that a
+                         * later scrub will find the bad sector again and
+                         * there will be no dirty page in memory then.
+                         */
+                        ret = -EIO;
+                        goto out;
+                }
+                map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+                ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
+                                        fixup->logical, page,
+                                        fixup->mirror_num);
+                unlock_page(page);
+                corrected = !ret;
+        } else {
+                /*
+                 * we need to get good data first. the general readpage path
+                 * will call repair_io_failure for us, we just have to make
+                 * sure we read the bad mirror.
+                 */
+                ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+                                      EXTENT_DAMAGED, GFP_NOFS);
+                if (ret) {
+                        /* set_extent_bits should give proper error */
+                        WARN_ON(ret > 0);
+                        if (ret > 0)
+                                ret = -EFAULT;
+                        goto out;
+                }
+
+                ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
+                                            btrfs_get_extent,
+                                            fixup->mirror_num);
+                wait_on_page_locked(page);
+
+                corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
+                                            end, EXTENT_DAMAGED, 0, NULL);
+                if (!corrected)
+                        clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+                                          EXTENT_DAMAGED, GFP_NOFS);
+        }
+
+out:
+        if (page)
+                put_page(page);
+        if (inode)
+                iput(inode);
+
+        if (ret < 0)
+                return ret;
+
+        if (ret == 0 && corrected) {
+                /*
+                 * we only need to call readpage for one of the inodes
+                 * belonging to this extent. so make iterate_extent_inodes stop
+                 */
+                return 1;
+        }
+
+        return -EIO;
+}
+
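Annotation: the else branch of scrub_fixup_readpage() relies on a handshake with the readpage/repair path: the range is marked EXTENT_DAMAGED, the known-bad mirror is read through the normal read path, and — on my reading of the patch — repair clears the bit only when a good copy was written back, so "bit gone afterwards" means corrected. A toy user-space model of that handshake, with stand-in types throughout:

```c
#include <stdbool.h>
#include <stdio.h>

struct io_tree { bool damaged; };    /* stands in for the extent io tree */

/* models extent_read_full_page() plus on-the-fly repair_io_failure() */
static void read_bad_mirror(struct io_tree *t, bool good_copy_found)
{
        if (good_copy_found)
                t->damaged = false;  /* repair succeeded: bit cleared */
}

int main(void)
{
        struct io_tree tree = { .damaged = true };  /* set_extent_bits() */

        read_bad_mirror(&tree, true);
        printf("corrected = %d\n", !tree.damaged);  /* test_range_bit() */
        return 0;
}
```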
+static void scrub_fixup_nodatasum(struct btrfs_work *work)
+{
+        int ret;
+        struct scrub_fixup_nodatasum *fixup;
+        struct scrub_dev *sdev;
+        struct btrfs_trans_handle *trans = NULL;
+        struct btrfs_fs_info *fs_info;
+        struct btrfs_path *path;
+        int uncorrectable = 0;
+
+        fixup = container_of(work, struct scrub_fixup_nodatasum, work);
+        sdev = fixup->sdev;
+        fs_info = fixup->root->fs_info;
+
+        path = btrfs_alloc_path();
+        if (!path) {
+                spin_lock(&sdev->stat_lock);
+                ++sdev->stat.malloc_errors;
+                spin_unlock(&sdev->stat_lock);
+                uncorrectable = 1;
+                goto out;
+        }
+
+        trans = btrfs_join_transaction(fixup->root);
+        if (IS_ERR(trans)) {
+                uncorrectable = 1;
+                goto out;
+        }
+
+        /*
+         * the idea is to trigger a regular read through the standard path. we
+         * read a page from the (failed) logical address by specifying the
+         * corresponding copynum of the failed sector. thus, that readpage is
+         * expected to fail.
+         * that is the point where on-the-fly error correction will kick in
+         * (once it's finished) and rewrite the failed sector if a good copy
+         * can be found.
+         */
+        ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
+                                          path, scrub_fixup_readpage,
+                                          fixup);
+        if (ret < 0) {
+                uncorrectable = 1;
+                goto out;
+        }
+        WARN_ON(ret != 1);
+
+        spin_lock(&sdev->stat_lock);
+        ++sdev->stat.corrected_errors;
+        spin_unlock(&sdev->stat_lock);
+
+out:
+        if (trans && !IS_ERR(trans))
+                btrfs_end_transaction(trans, fixup->root);
+        if (uncorrectable) {
+                spin_lock(&sdev->stat_lock);
+                ++sdev->stat.uncorrectable_errors;
+                spin_unlock(&sdev->stat_lock);
+                printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
+                                   "(nodatasum) error at logical %llu\n",
+                                   fixup->logical);
+        }
+
+        btrfs_free_path(path);
+        kfree(fixup);
+
+        /* see caller for why we're pretending to be paused in the scrub
+         * counters */
+        mutex_lock(&fs_info->scrub_lock);
+        atomic_dec(&fs_info->scrubs_running);
+        atomic_dec(&fs_info->scrubs_paused);
+        mutex_unlock(&fs_info->scrub_lock);
+        atomic_dec(&sdev->fixup_cnt);
+        wake_up(&fs_info->scrub_pause_wait);
+        wake_up(&sdev->list_wait);
+}
+
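Annotation: this worker unwinds the counter trick explained in the scrub_fixup() hunk further down: a fixup worker takes a reference on both scrubs_running and scrubs_paused, so cancellation (which drains scrubs_running) must wait for it, while a pause request — which, as I read it, waits for scrubs_paused to catch up with scrubs_running — is unaffected because the worker adds one to each side. Plain-int model of that assumed invariant:

```c
#include <stdio.h>

/* stand-ins for the kernel atomics */
struct counters { int scrubs_running, scrubs_paused; };

/* assumed condition a pause request (e.g. transaction commit) waits for */
static int pause_complete(const struct counters *c)
{
        return c->scrubs_paused == c->scrubs_running;
}

int main(void)
{
        struct counters c = { 1, 0 };   /* one scrub running, not paused */

        c.scrubs_running++;     /* fixup worker queued ...          */
        c.scrubs_paused++;      /* ... and self-declared paused     */
        c.scrubs_paused++;      /* main scrub reaches a pause point */

        /* the worker added 1 to both sides, so it never delays a pause */
        printf("pause complete: %d\n", pause_complete(&c));
        return 0;
}
```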
544 | |||
195 | /* | 545 | /* |
196 | * scrub_recheck_error gets called when either verification of the page | 546 | * scrub_recheck_error gets called when either verification of the page |
197 | * failed or the bio failed to read, e.g. with EIO. In the latter case, | 547 | * failed or the bio failed to read, e.g. with EIO. In the latter case, |
198 | * recheck_error gets called for every page in the bio, even though only | 548 | * recheck_error gets called for every page in the bio, even though only |
199 | * one may be bad | 549 | * one may be bad |
200 | */ | 550 | */ |
201 | static void scrub_recheck_error(struct scrub_bio *sbio, int ix) | 551 | static int scrub_recheck_error(struct scrub_bio *sbio, int ix) |
202 | { | 552 | { |
553 | struct scrub_dev *sdev = sbio->sdev; | ||
554 | u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9; | ||
555 | static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, | ||
556 | DEFAULT_RATELIMIT_BURST); | ||
557 | |||
203 | if (sbio->err) { | 558 | if (sbio->err) { |
204 | if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, | 559 | if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector, |
205 | (sbio->physical + ix * PAGE_SIZE) >> 9, | ||
206 | sbio->bio->bi_io_vec[ix].bv_page) == 0) { | 560 | sbio->bio->bi_io_vec[ix].bv_page) == 0) { |
207 | if (scrub_fixup_check(sbio, ix) == 0) | 561 | if (scrub_fixup_check(sbio, ix) == 0) |
208 | return; | 562 | return 0; |
209 | } | 563 | } |
564 | if (__ratelimit(&_rs)) | ||
565 | scrub_print_warning("i/o error", sbio, ix); | ||
566 | } else { | ||
567 | if (__ratelimit(&_rs)) | ||
568 | scrub_print_warning("checksum error", sbio, ix); | ||
210 | } | 569 | } |
211 | 570 | ||
571 | spin_lock(&sdev->stat_lock); | ||
572 | ++sdev->stat.read_errors; | ||
573 | spin_unlock(&sdev->stat_lock); | ||
574 | |||
212 | scrub_fixup(sbio, ix); | 575 | scrub_fixup(sbio, ix); |
576 | return 1; | ||
213 | } | 577 | } |
214 | 578 | ||
215 | static int scrub_fixup_check(struct scrub_bio *sbio, int ix) | 579 | static int scrub_fixup_check(struct scrub_bio *sbio, int ix) |
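Annotation: the rewritten scrub_recheck_error() rate-limits its warnings with the stock pattern from <linux/ratelimit.h> (hence the new include in the first hunk). For reference, the pairing in isolation — kernel context assumed; the function name and message here are placeholders, not part of the patch:

```c
#include <linux/ratelimit.h>
#include <linux/printk.h>
#include <linux/types.h>

/* placeholder: same DEFINE_RATELIMIT_STATE/__ratelimit pairing as
 * scrub_recheck_error() above */
static void report_bad_sector(u64 logical)
{
        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
                                      DEFAULT_RATELIMIT_BURST);

        if (__ratelimit(&_rs))  /* true while still under the rate limit */
                pr_warn("bad sector at logical %llu\n",
                        (unsigned long long)logical);
}
```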
@@ -247,7 +611,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
         struct scrub_dev *sdev = sbio->sdev;
         struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
         struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
-        struct btrfs_multi_bio *multi = NULL;
+        struct btrfs_bio *bbio = NULL;
+        struct scrub_fixup_nodatasum *fixup;
         u64 logical = sbio->logical + ix * PAGE_SIZE;
         u64 length;
         int i;
@@ -256,18 +621,36 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 
         if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
             (sbio->spag[ix].have_csum == 0)) {
+                fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+                if (!fixup)
+                        goto uncorrectable;
+                fixup->sdev = sdev;
+                fixup->logical = logical;
+                fixup->root = fs_info->extent_root;
+                fixup->mirror_num = sbio->spag[ix].mirror_num;
                 /*
-                 * nodatasum, don't try to fix anything
-                 * FIXME: we can do better, open the inode and trigger a
-                 * writeback
+                 * increment scrubs_running to prevent cancel requests from
+                 * completing as long as a fixup worker is running. we must
+                 * also increment scrubs_paused to prevent deadlocking on
+                 * pause requests used for transaction commits (as the worker
+                 * uses a transaction context). it is safe to regard the
+                 * fixup worker as paused for all practical matters.
+                 * effectively, we only avoid cancellation requests from
+                 * completing.
                  */
-                goto uncorrectable;
+                mutex_lock(&fs_info->scrub_lock);
+                atomic_inc(&fs_info->scrubs_running);
+                atomic_inc(&fs_info->scrubs_paused);
+                mutex_unlock(&fs_info->scrub_lock);
+                atomic_inc(&sdev->fixup_cnt);
+                fixup->work.func = scrub_fixup_nodatasum;
+                btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
+                return;
         }
 
         length = PAGE_SIZE;
         ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
-                              &multi, 0);
-        if (ret || !multi || length < PAGE_SIZE) {
+                              &bbio, 0);
+        if (ret || !bbio || length < PAGE_SIZE) {
                 printk(KERN_ERR
                        "scrub_fixup: btrfs_map_block failed us for %llu\n",
                        (unsigned long long)logical);
@@ -275,19 +658,19 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
                 return;
         }
 
-        if (multi->num_stripes == 1)
+        if (bbio->num_stripes == 1)
                 /* there aren't any replicas */
                 goto uncorrectable;
 
         /*
          * first find a good copy
          */
-        for (i = 0; i < multi->num_stripes; ++i) {
-                if (i == sbio->spag[ix].mirror_num)
+        for (i = 0; i < bbio->num_stripes; ++i) {
+                if (i + 1 == sbio->spag[ix].mirror_num)
                         continue;
 
-                if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
-                                   multi->stripes[i].physical >> 9,
+                if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
+                                   bbio->stripes[i].physical >> 9,
                                    sbio->bio->bi_io_vec[ix].bv_page)) {
                         /* I/O-error, this is not a good copy */
                         continue;
@@ -296,7 +679,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
                 if (scrub_fixup_check(sbio, ix) == 0)
                         break;
         }
-        if (i == multi->num_stripes)
+        if (i == bbio->num_stripes)
                 goto uncorrectable;
 
         if (!sdev->readonly) {
@@ -311,25 +694,23 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
                 }
         }
 
-        kfree(multi);
+        kfree(bbio);
         spin_lock(&sdev->stat_lock);
         ++sdev->stat.corrected_errors;
         spin_unlock(&sdev->stat_lock);
 
-        if (printk_ratelimit())
-                printk(KERN_ERR "btrfs: fixed up at %llu\n",
-                       (unsigned long long)logical);
+        printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
+                           (unsigned long long)logical);
         return;
 
 uncorrectable:
-        kfree(multi);
+        kfree(bbio);
         spin_lock(&sdev->stat_lock);
         ++sdev->stat.uncorrectable_errors;
         spin_unlock(&sdev->stat_lock);
 
-        if (printk_ratelimit())
-                printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
-                       (unsigned long long)logical);
+        printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
+                           "logical %llu\n", (unsigned long long)logical);
 }
 
 static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
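Annotation: several hunks encode a switch to 1-based mirror numbers — the scrub_page field went from u64 to int earlier, scrub_stripe() below now assigns `num % ... + 1`, and the rebuild loop above skips `i + 1 == mirror_num` instead of `i == mirror_num`. The apparent convention: 0 means "any copy", 1..num_stripes names a specific copy, so stripe index i corresponds to mirror i + 1. A small standalone demo of that mapping:

```c
#include <stdio.h>

int main(void)
{
        int num_stripes = 2;
        int failed_mirror = 2;  /* the copy scrub found to be bad */
        int i;

        for (i = 0; i < num_stripes; ++i) {
                if (i + 1 == failed_mirror)
                        continue;       /* don't re-read the bad copy */
                printf("try stripe %d (mirror %d)\n", i, i + 1);
        }
        return 0;
}
```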
@@ -379,8 +760,14 @@ static void scrub_checksum(struct btrfs_work *work)
         int ret;
 
         if (sbio->err) {
+                ret = 0;
                 for (i = 0; i < sbio->count; ++i)
-                        scrub_recheck_error(sbio, i);
+                        ret |= scrub_recheck_error(sbio, i);
+                if (!ret) {
+                        spin_lock(&sdev->stat_lock);
+                        ++sdev->stat.unverified_errors;
+                        spin_unlock(&sdev->stat_lock);
+                }
 
                 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
                 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -393,10 +780,6 @@ static void scrub_checksum(struct btrfs_work *work)
                         bi->bv_offset = 0;
                         bi->bv_len = PAGE_SIZE;
                 }
-
-                spin_lock(&sdev->stat_lock);
-                ++sdev->stat.read_errors;
-                spin_unlock(&sdev->stat_lock);
                 goto out;
         }
         for (i = 0; i < sbio->count; ++i) {
@@ -417,8 +800,14 @@ static void scrub_checksum(struct btrfs_work *work)
                         WARN_ON(1);
                 }
                 kunmap_atomic(buffer, KM_USER0);
-                if (ret)
-                        scrub_recheck_error(sbio, i);
+                if (ret) {
+                        ret = scrub_recheck_error(sbio, i);
+                        if (!ret) {
+                                spin_lock(&sdev->stat_lock);
+                                ++sdev->stat.unverified_errors;
+                                spin_unlock(&sdev->stat_lock);
+                        }
+                }
         }
 
 out:
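Annotation: both call sites in scrub_checksum() now use the int return of scrub_recheck_error(): 0 apparently means the page verified fine on re-read, 1 means a genuine error handed to scrub_fixup(). OR-ing the per-page results lets the bio-error path detect "the bio failed but every page re-read clean", which is what the new unverified_errors counter records. Standalone model of the aggregation:

```c
#include <stdio.h>

/* stand-in for scrub_recheck_error(): 1 = real error, 0 = page was fine */
static int recheck(int page_really_bad)
{
        return page_really_bad;
}

int main(void)
{
        int bad[4] = { 0, 0, 0, 0 };    /* bio failed, pages re-read OK */
        int ret = 0;
        int i;

        for (i = 0; i < 4; ++i)
                ret |= recheck(bad[i]);
        if (!ret)
                puts("count as unverified_error, not read_error");
        return 0;
}
```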
@@ -601,7 +990,7 @@ nomem:
 }
 
 static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
-                      u64 physical, u64 flags, u64 gen, u64 mirror_num,
+                      u64 physical, u64 flags, u64 gen, int mirror_num,
                       u8 *csum, int force)
 {
         struct scrub_bio *sbio;
@@ -698,7 +1087,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
 
 /* scrub_extent tries to collect up to 64 kB for each bio */
 static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
-                        u64 physical, u64 flags, u64 gen, u64 mirror_num)
+                        u64 physical, u64 flags, u64 gen, int mirror_num)
 {
         int ret;
         u8 csum[BTRFS_CSUM_SIZE];
@@ -743,7 +1132,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
         u64 physical;
         u64 logical;
         u64 generation;
-        u64 mirror_num;
+        int mirror_num;
         struct reada_control *reada1;
         struct reada_control *reada2;
         struct btrfs_key key_start;
@@ -758,21 +1147,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
                 offset = map->stripe_len * num;
                 increment = map->stripe_len * map->num_stripes;
-                mirror_num = 0;
+                mirror_num = 1;
         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
                 int factor = map->num_stripes / map->sub_stripes;
                 offset = map->stripe_len * (num / map->sub_stripes);
                 increment = map->stripe_len * factor;
-                mirror_num = num % map->sub_stripes;
+                mirror_num = num % map->sub_stripes + 1;
         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
                 increment = map->stripe_len;
-                mirror_num = num % map->num_stripes;
+                mirror_num = num % map->num_stripes + 1;
         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
                 increment = map->stripe_len;
-                mirror_num = num % map->num_stripes;
+                mirror_num = num % map->num_stripes + 1;
         } else {
                 increment = map->stripe_len;
-                mirror_num = 0;
+                mirror_num = 1;
         }
 
         path = btrfs_alloc_path();
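Annotation: a worked example of the new assignments in scrub_stripe(), assuming `num` is the index of the device stripe being scrubbed. RAID0 and single profiles have one copy (always mirror 1); RAID1/DUP give each stripe its own mirror number; RAID10 numbers the copies within a sub-stripe group:

```c
#include <stdio.h>

int main(void)
{
        int num_stripes = 4, sub_stripes = 2;   /* e.g. a 4-device RAID10 */
        int num;

        for (num = 0; num < num_stripes; ++num)
                printf("stripe %d -> RAID10 mirror %d, RAID1/DUP mirror %d, "
                       "RAID0/single mirror 1\n",
                       num, num % sub_stripes + 1, num % num_stripes + 1);
        return 0;
}
```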
@@ -1241,10 +1630,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
         ret = scrub_enumerate_chunks(sdev, start, end);
 
         wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
-
         atomic_dec(&fs_info->scrubs_running);
         wake_up(&fs_info->scrub_pause_wait);
 
+        wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
+
         if (progress)
                 memcpy(progress, &sdev->stat, sizeof(*progress));
 
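Annotation: the final hunk orders teardown as: drain in-flight bios, drop this scrub's scrubs_running reference, and only then wait for outstanding fixup workers. My reading of why the order matters: the workers join transactions, and a transaction commit waits for running scrubs to pause; if the main thread kept an unpaused running reference while blocked on fixup_cnt, commit, scrub and worker could each end up waiting on one of the others. Toy model of the drain sequence; the waits are simulated, not real sleeps:

```c
#include <stdio.h>

struct scrub_model { int in_flight, fixup_cnt, scrubs_running; };

/* stands in for wait_event(sdev->list_wait, ...); we just zero the count */
static void drain(int *counter, const char *what)
{
        printf("waiting for %s (%d outstanding)\n", what, *counter);
        *counter = 0;
}

int main(void)
{
        struct scrub_model m = { 3, 1, 1 };

        drain(&m.in_flight, "in-flight scrub bios");
        m.scrubs_running--;     /* stop blocking pause/cancel requests */
        drain(&m.fixup_cnt, "nodatasum fixup workers");
        puts("stats final: fixup worker results are included");
        return 0;
}
```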