diff options
author | NeilBrown <neilb@suse.de> | 2011-07-27 21:31:48 -0400 |
---|---|---|
committer | NeilBrown <neilb@suse.de> | 2011-07-27 21:31:48 -0400 |
commit | de393cdea66cbd63c90725663f400c76faf1b255 (patch) | |
tree | 6a2bf37bee98bf7de42856f904bd23c81e082f8e /drivers/md/md.c | |
parent | d7a9d443bc8a75a24873c0506f50051edfedc714 (diff) |
md: make it easier to wait for bad blocks to be acknowledged.
It is only safe to choose not to write to a bad block if that bad
block is safely recorded in metadata - i.e. if it has been
'acknowledged'.
If it hasn't we need to wait for the acknowledgement.
We support that using rdev->blocked_wait and
md_wait_for_blocked_rdev by introducing a new device flag
'BlockedBadBlocks'.
This flag is only advisory.
It is cleared whenever we acknowledge a bad block, so that a waiter
can re-check the particular bad blocks that it is interested in.
It should be set by a caller when they find they need to wait.
This (set after test) is inherently racy, but as
md_wait_for_blocked_rdev already has a timeout, losing the race will
have minimal impact.
When we clear "Blocked" we also clear "BlockedBadBlocks" in case it
was set incorrectly (see above race).
We also modify the way we manage 'Blocked' to fit better with the new
handling of 'BlockedBadBlocks' and to make it consistent between
externally managed and internally managed metadata. This requires
that each raidXd loop checks if the metadata needs to be written and
triggers a write (md_check_recovery) if needed. Otherwise a queued
write request might cause raidXd to wait for the metadata to write,
and only that thread can write it.
Before writing metadata, we set FaultRecorded for all devices that
are Faulty, then after writing the metadata we clear Blocked for any
device for which the Fault was certainly Recorded.
The 'faulty' device flag now appears in sysfs if the device is faulty
*or* it has unacknowledged bad blocks. So user-space which does not
understand bad blocks can continue to function correctly.
User space which does, should not assume a device is faulty until it
sees the 'faulty' flag, and then sees the list of unacknowledged bad
blocks is empty.
Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 77 |
1 files changed, 52 insertions, 25 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 1520d18c5af5..a6b6471da2bc 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -2341,8 +2341,18 @@ repeat: | |||
2341 | if (!mddev->persistent) { | 2341 | if (!mddev->persistent) { |
2342 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); | 2342 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); |
2343 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); | 2343 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); |
2344 | if (!mddev->external) | 2344 | if (!mddev->external) { |
2345 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 2345 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); |
2346 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
2347 | if (rdev->badblocks.changed) { | ||
2348 | md_ack_all_badblocks(&rdev->badblocks); | ||
2349 | md_error(mddev, rdev); | ||
2350 | } | ||
2351 | clear_bit(Blocked, &rdev->flags); | ||
2352 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
2353 | wake_up(&rdev->blocked_wait); | ||
2354 | } | ||
2355 | } | ||
2346 | wake_up(&mddev->sb_wait); | 2356 | wake_up(&mddev->sb_wait); |
2347 | return; | 2357 | return; |
2348 | } | 2358 | } |
@@ -2399,9 +2409,12 @@ repeat: | |||
2399 | mddev->events --; | 2409 | mddev->events --; |
2400 | } | 2410 | } |
2401 | 2411 | ||
2402 | list_for_each_entry(rdev, &mddev->disks, same_set) | 2412 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
2403 | if (rdev->badblocks.changed) | 2413 | if (rdev->badblocks.changed) |
2404 | any_badblocks_changed++; | 2414 | any_badblocks_changed++; |
2415 | if (test_bit(Faulty, &rdev->flags)) | ||
2416 | set_bit(FaultRecorded, &rdev->flags); | ||
2417 | } | ||
2405 | 2418 | ||
2406 | sync_sbs(mddev, nospares); | 2419 | sync_sbs(mddev, nospares); |
2407 | spin_unlock_irq(&mddev->write_lock); | 2420 | spin_unlock_irq(&mddev->write_lock); |
@@ -2458,9 +2471,15 @@ repeat: | |||
2458 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 2471 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
2459 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 2472 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
2460 | 2473 | ||
2461 | if (any_badblocks_changed) | 2474 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
2462 | list_for_each_entry(rdev, &mddev->disks, same_set) | 2475 | if (test_and_clear_bit(FaultRecorded, &rdev->flags)) |
2476 | clear_bit(Blocked, &rdev->flags); | ||
2477 | |||
2478 | if (any_badblocks_changed) | ||
2463 | md_ack_all_badblocks(&rdev->badblocks); | 2479 | md_ack_all_badblocks(&rdev->badblocks); |
2480 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
2481 | wake_up(&rdev->blocked_wait); | ||
2482 | } | ||
2464 | } | 2483 | } |
2465 | 2484 | ||
2466 | /* words written to sysfs files may, or may not, be \n terminated. | 2485 | /* words written to sysfs files may, or may not, be \n terminated. |
@@ -2495,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
2495 | char *sep = ""; | 2514 | char *sep = ""; |
2496 | size_t len = 0; | 2515 | size_t len = 0; |
2497 | 2516 | ||
2498 | if (test_bit(Faulty, &rdev->flags)) { | 2517 | if (test_bit(Faulty, &rdev->flags) || |
2518 | rdev->badblocks.unacked_exist) { | ||
2499 | len+= sprintf(page+len, "%sfaulty",sep); | 2519 | len+= sprintf(page+len, "%sfaulty",sep); |
2500 | sep = ","; | 2520 | sep = ","; |
2501 | } | 2521 | } |
@@ -2507,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
2507 | len += sprintf(page+len, "%swrite_mostly",sep); | 2527 | len += sprintf(page+len, "%swrite_mostly",sep); |
2508 | sep = ","; | 2528 | sep = ","; |
2509 | } | 2529 | } |
2510 | if (test_bit(Blocked, &rdev->flags)) { | 2530 | if (test_bit(Blocked, &rdev->flags) || |
2531 | rdev->badblocks.unacked_exist) { | ||
2511 | len += sprintf(page+len, "%sblocked", sep); | 2532 | len += sprintf(page+len, "%sblocked", sep); |
2512 | sep = ","; | 2533 | sep = ","; |
2513 | } | 2534 | } |
@@ -2527,12 +2548,12 @@ static ssize_t | |||
2527 | state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | 2548 | state_store(mdk_rdev_t *rdev, const char *buf, size_t len) |
2528 | { | 2549 | { |
2529 | /* can write | 2550 | /* can write |
2530 | * faulty - simulates and error | 2551 | * faulty - simulates an error |
2531 | * remove - disconnects the device | 2552 | * remove - disconnects the device |
2532 | * writemostly - sets write_mostly | 2553 | * writemostly - sets write_mostly |
2533 | * -writemostly - clears write_mostly | 2554 | * -writemostly - clears write_mostly |
2534 | * blocked - sets the Blocked flag | 2555 | * blocked - sets the Blocked flags |
2535 | * -blocked - clears the Blocked flag | 2556 | * -blocked - clears the Blocked and possibly simulates an error |
2536 | * insync - sets Insync providing device isn't active | 2557 | * insync - sets Insync providing device isn't active |
2537 | * write_error - sets WriteErrorSeen | 2558 | * write_error - sets WriteErrorSeen |
2538 | * -write_error - clears WriteErrorSeen | 2559 | * -write_error - clears WriteErrorSeen |
@@ -2562,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2562 | set_bit(Blocked, &rdev->flags); | 2583 | set_bit(Blocked, &rdev->flags); |
2563 | err = 0; | 2584 | err = 0; |
2564 | } else if (cmd_match(buf, "-blocked")) { | 2585 | } else if (cmd_match(buf, "-blocked")) { |
2586 | if (!test_bit(Faulty, &rdev->flags) && | ||
2587 | test_bit(BlockedBadBlocks, &rdev->flags)) { | ||
2588 | /* metadata handler doesn't understand badblocks, | ||
2589 | * so we need to fail the device | ||
2590 | */ | ||
2591 | md_error(rdev->mddev, rdev); | ||
2592 | } | ||
2565 | clear_bit(Blocked, &rdev->flags); | 2593 | clear_bit(Blocked, &rdev->flags); |
2594 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
2566 | wake_up(&rdev->blocked_wait); | 2595 | wake_up(&rdev->blocked_wait); |
2567 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | 2596 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
2568 | md_wakeup_thread(rdev->mddev->thread); | 2597 | md_wakeup_thread(rdev->mddev->thread); |
@@ -2881,7 +2910,11 @@ static ssize_t bb_show(mdk_rdev_t *rdev, char *page) | |||
2881 | } | 2910 | } |
2882 | static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len) | 2911 | static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len) |
2883 | { | 2912 | { |
2884 | return badblocks_store(&rdev->badblocks, page, len, 0); | 2913 | int rv = badblocks_store(&rdev->badblocks, page, len, 0); |
2914 | /* Maybe that ack was all we needed */ | ||
2915 | if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) | ||
2916 | wake_up(&rdev->blocked_wait); | ||
2917 | return rv; | ||
2885 | } | 2918 | } |
2886 | static struct rdev_sysfs_entry rdev_bad_blocks = | 2919 | static struct rdev_sysfs_entry rdev_bad_blocks = |
2887 | __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); | 2920 | __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); |
@@ -6398,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
6398 | if (!rdev || test_bit(Faulty, &rdev->flags)) | 6431 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
6399 | return; | 6432 | return; |
6400 | 6433 | ||
6401 | if (mddev->external) | 6434 | if (!mddev->pers || !mddev->pers->error_handler) |
6402 | set_bit(Blocked, &rdev->flags); | ||
6403 | /* | ||
6404 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", | ||
6405 | mdname(mddev), | ||
6406 | MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), | ||
6407 | __builtin_return_address(0),__builtin_return_address(1), | ||
6408 | __builtin_return_address(2),__builtin_return_address(3)); | ||
6409 | */ | ||
6410 | if (!mddev->pers) | ||
6411 | return; | ||
6412 | if (!mddev->pers->error_handler) | ||
6413 | return; | 6435 | return; |
6414 | mddev->pers->error_handler(mddev,rdev); | 6436 | mddev->pers->error_handler(mddev,rdev); |
6415 | if (mddev->degraded) | 6437 | if (mddev->degraded) |
@@ -7286,8 +7308,7 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7286 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 7308 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
7287 | if (rdev->raid_disk >= 0 && | 7309 | if (rdev->raid_disk >= 0 && |
7288 | !test_bit(In_sync, &rdev->flags) && | 7310 | !test_bit(In_sync, &rdev->flags) && |
7289 | !test_bit(Faulty, &rdev->flags) && | 7311 | !test_bit(Faulty, &rdev->flags)) |
7290 | !test_bit(Blocked, &rdev->flags)) | ||
7291 | spares++; | 7312 | spares++; |
7292 | if (rdev->raid_disk < 0 | 7313 | if (rdev->raid_disk < 0 |
7293 | && !test_bit(Faulty, &rdev->flags)) { | 7314 | && !test_bit(Faulty, &rdev->flags)) { |
@@ -7533,7 +7554,8 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) | |||
7533 | { | 7554 | { |
7534 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 7555 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
7535 | wait_event_timeout(rdev->blocked_wait, | 7556 | wait_event_timeout(rdev->blocked_wait, |
7536 | !test_bit(Blocked, &rdev->flags), | 7557 | !test_bit(Blocked, &rdev->flags) && |
7558 | !test_bit(BlockedBadBlocks, &rdev->flags), | ||
7537 | msecs_to_jiffies(5000)); | 7559 | msecs_to_jiffies(5000)); |
7538 | rdev_dec_pending(rdev, mddev); | 7560 | rdev_dec_pending(rdev, mddev); |
7539 | } | 7561 | } |
@@ -7779,6 +7801,8 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, | |||
7779 | } | 7801 | } |
7780 | 7802 | ||
7781 | bb->changed = 1; | 7803 | bb->changed = 1; |
7804 | if (!acknowledged) | ||
7805 | bb->unacked_exist = 1; | ||
7782 | write_sequnlock_irq(&bb->lock); | 7806 | write_sequnlock_irq(&bb->lock); |
7783 | 7807 | ||
7784 | return rv; | 7808 | return rv; |
@@ -7923,6 +7947,7 @@ void md_ack_all_badblocks(struct badblocks *bb) | |||
7923 | p[i] = BB_MAKE(start, len, 1); | 7947 | p[i] = BB_MAKE(start, len, 1); |
7924 | } | 7948 | } |
7925 | } | 7949 | } |
7950 | bb->unacked_exist = 0; | ||
7926 | } | 7951 | } |
7927 | write_sequnlock_irq(&bb->lock); | 7952 | write_sequnlock_irq(&bb->lock); |
7928 | } | 7953 | } |
@@ -7970,6 +7995,8 @@ retry: | |||
7970 | (unsigned long long)s << bb->shift, | 7995 | (unsigned long long)s << bb->shift, |
7971 | length << bb->shift); | 7996 | length << bb->shift); |
7972 | } | 7997 | } |
7998 | if (unack && len == 0) | ||
7999 | bb->unacked_exist = 0; | ||
7973 | 8000 | ||
7974 | if (read_seqretry(&bb->lock, seq)) | 8001 | if (read_seqretry(&bb->lock, seq)) |
7975 | goto retry; | 8002 | goto retry; |