aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md/md.c
diff options
context:
space:
mode:
author NeilBrown <neilb@suse.de> 2011-07-27 21:31:48 -0400
committer NeilBrown <neilb@suse.de> 2011-07-27 21:31:48 -0400
commit de393cdea66cbd63c90725663f400c76faf1b255 (patch)
tree 6a2bf37bee98bf7de42856f904bd23c81e082f8e /drivers/md/md.c
parent d7a9d443bc8a75a24873c0506f50051edfedc714 (diff)
md: make it easier to wait for bad blocks to be acknowledged.
It is only safe to choose not to write to a bad block if that bad block is safely recorded in metadata - i.e. if it has been 'acknowledged'. If it hasn't, we need to wait for the acknowledgement. We support that using rdev->blocked_wait and md_wait_for_blocked_rdev by introducing a new device flag 'BlockedBadBlocks'. This flag is only advisory. It is cleared whenever we acknowledge a bad block, so that a waiter can re-check the particular bad blocks that it is interested in. It should be set by a caller when they find they need to wait. This (set after test) is inherently racy, but as md_wait_for_blocked_rdev already has a timeout, losing the race will have minimal impact. When we clear "Blocked" we also clear "BlockedBadBlocks" in case it was set incorrectly (see above race). We also modify the way we manage 'Blocked' to fit better with the new handling of 'BlockedBadBlocks' and to make it consistent between externally managed and internally managed metadata. This requires that each raidXd loop checks if the metadata needs to be written and triggers a write (md_check_recovery) if needed. Otherwise a queued write request might cause raidXd to wait for the metadata to be written, and only that thread can write it. Before writing metadata, we set FaultRecorded for all devices that are Faulty, then after writing the metadata we clear Blocked for any device for which the Fault was certainly Recorded. The 'faulty' device flag now appears in sysfs if the device is faulty *or* it has unacknowledged bad blocks. So user-space which does not understand bad blocks can continue to function correctly. User-space which does understand bad blocks should not assume a device is faulty until it sees the 'faulty' flag and then sees that the list of unacknowledged bad blocks is empty. Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--drivers/md/md.c77
1 files changed, 52 insertions, 25 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1520d18c5af5..a6b6471da2bc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2341,8 +2341,18 @@ repeat:
2341 if (!mddev->persistent) { 2341 if (!mddev->persistent) {
2342 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2342 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2343 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2343 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2344 if (!mddev->external) 2344 if (!mddev->external) {
2345 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2345 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2346 list_for_each_entry(rdev, &mddev->disks, same_set) {
2347 if (rdev->badblocks.changed) {
2348 md_ack_all_badblocks(&rdev->badblocks);
2349 md_error(mddev, rdev);
2350 }
2351 clear_bit(Blocked, &rdev->flags);
2352 clear_bit(BlockedBadBlocks, &rdev->flags);
2353 wake_up(&rdev->blocked_wait);
2354 }
2355 }
2346 wake_up(&mddev->sb_wait); 2356 wake_up(&mddev->sb_wait);
2347 return; 2357 return;
2348 } 2358 }
@@ -2399,9 +2409,12 @@ repeat:
2399 mddev->events --; 2409 mddev->events --;
2400 } 2410 }
2401 2411
2402 list_for_each_entry(rdev, &mddev->disks, same_set) 2412 list_for_each_entry(rdev, &mddev->disks, same_set) {
2403 if (rdev->badblocks.changed) 2413 if (rdev->badblocks.changed)
2404 any_badblocks_changed++; 2414 any_badblocks_changed++;
2415 if (test_bit(Faulty, &rdev->flags))
2416 set_bit(FaultRecorded, &rdev->flags);
2417 }
2405 2418
2406 sync_sbs(mddev, nospares); 2419 sync_sbs(mddev, nospares);
2407 spin_unlock_irq(&mddev->write_lock); 2420 spin_unlock_irq(&mddev->write_lock);
@@ -2458,9 +2471,15 @@ repeat:
2458 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2471 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2459 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2472 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2460 2473
2461 if (any_badblocks_changed) 2474 list_for_each_entry(rdev, &mddev->disks, same_set) {
2462 list_for_each_entry(rdev, &mddev->disks, same_set) 2475 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2476 clear_bit(Blocked, &rdev->flags);
2477
2478 if (any_badblocks_changed)
2463 md_ack_all_badblocks(&rdev->badblocks); 2479 md_ack_all_badblocks(&rdev->badblocks);
2480 clear_bit(BlockedBadBlocks, &rdev->flags);
2481 wake_up(&rdev->blocked_wait);
2482 }
2464} 2483}
2465 2484
2466/* words written to sysfs files may, or may not, be \n terminated. 2485/* words written to sysfs files may, or may not, be \n terminated.
@@ -2495,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page)
2495 char *sep = ""; 2514 char *sep = "";
2496 size_t len = 0; 2515 size_t len = 0;
2497 2516
2498 if (test_bit(Faulty, &rdev->flags)) { 2517 if (test_bit(Faulty, &rdev->flags) ||
2518 rdev->badblocks.unacked_exist) {
2499 len+= sprintf(page+len, "%sfaulty",sep); 2519 len+= sprintf(page+len, "%sfaulty",sep);
2500 sep = ","; 2520 sep = ",";
2501 } 2521 }
@@ -2507,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page)
2507 len += sprintf(page+len, "%swrite_mostly",sep); 2527 len += sprintf(page+len, "%swrite_mostly",sep);
2508 sep = ","; 2528 sep = ",";
2509 } 2529 }
2510 if (test_bit(Blocked, &rdev->flags)) { 2530 if (test_bit(Blocked, &rdev->flags) ||
2531 rdev->badblocks.unacked_exist) {
2511 len += sprintf(page+len, "%sblocked", sep); 2532 len += sprintf(page+len, "%sblocked", sep);
2512 sep = ","; 2533 sep = ",";
2513 } 2534 }
@@ -2527,12 +2548,12 @@ static ssize_t
2527state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2548state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2528{ 2549{
2529 /* can write 2550 /* can write
2530 * faulty - simulates and error 2551 * faulty - simulates an error
2531 * remove - disconnects the device 2552 * remove - disconnects the device
2532 * writemostly - sets write_mostly 2553 * writemostly - sets write_mostly
2533 * -writemostly - clears write_mostly 2554 * -writemostly - clears write_mostly
2534 * blocked - sets the Blocked flag 2555 * blocked - sets the Blocked flags
2535 * -blocked - clears the Blocked flag 2556 * -blocked - clears the Blocked and possibly simulates an error
2536 * insync - sets Insync providing device isn't active 2557 * insync - sets Insync providing device isn't active
2537 * write_error - sets WriteErrorSeen 2558 * write_error - sets WriteErrorSeen
2538 * -write_error - clears WriteErrorSeen 2559 * -write_error - clears WriteErrorSeen
@@ -2562,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2562 set_bit(Blocked, &rdev->flags); 2583 set_bit(Blocked, &rdev->flags);
2563 err = 0; 2584 err = 0;
2564 } else if (cmd_match(buf, "-blocked")) { 2585 } else if (cmd_match(buf, "-blocked")) {
2586 if (!test_bit(Faulty, &rdev->flags) &&
2587 test_bit(BlockedBadBlocks, &rdev->flags)) {
2588 /* metadata handler doesn't understand badblocks,
2589 * so we need to fail the device
2590 */
2591 md_error(rdev->mddev, rdev);
2592 }
2565 clear_bit(Blocked, &rdev->flags); 2593 clear_bit(Blocked, &rdev->flags);
2594 clear_bit(BlockedBadBlocks, &rdev->flags);
2566 wake_up(&rdev->blocked_wait); 2595 wake_up(&rdev->blocked_wait);
2567 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2596 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2568 md_wakeup_thread(rdev->mddev->thread); 2597 md_wakeup_thread(rdev->mddev->thread);
@@ -2881,7 +2910,11 @@ static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
2881} 2910}
2882static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len) 2911static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
2883{ 2912{
2884 return badblocks_store(&rdev->badblocks, page, len, 0); 2913 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
2914 /* Maybe that ack was all we needed */
2915 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
2916 wake_up(&rdev->blocked_wait);
2917 return rv;
2885} 2918}
2886static struct rdev_sysfs_entry rdev_bad_blocks = 2919static struct rdev_sysfs_entry rdev_bad_blocks =
2887__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 2920__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
@@ -6398,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
6398 if (!rdev || test_bit(Faulty, &rdev->flags)) 6431 if (!rdev || test_bit(Faulty, &rdev->flags))
6399 return; 6432 return;
6400 6433
6401 if (mddev->external) 6434 if (!mddev->pers || !mddev->pers->error_handler)
6402 set_bit(Blocked, &rdev->flags);
6403/*
6404 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
6405 mdname(mddev),
6406 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
6407 __builtin_return_address(0),__builtin_return_address(1),
6408 __builtin_return_address(2),__builtin_return_address(3));
6409*/
6410 if (!mddev->pers)
6411 return;
6412 if (!mddev->pers->error_handler)
6413 return; 6435 return;
6414 mddev->pers->error_handler(mddev,rdev); 6436 mddev->pers->error_handler(mddev,rdev);
6415 if (mddev->degraded) 6437 if (mddev->degraded)
@@ -7286,8 +7308,7 @@ static int remove_and_add_spares(mddev_t *mddev)
7286 list_for_each_entry(rdev, &mddev->disks, same_set) { 7308 list_for_each_entry(rdev, &mddev->disks, same_set) {
7287 if (rdev->raid_disk >= 0 && 7309 if (rdev->raid_disk >= 0 &&
7288 !test_bit(In_sync, &rdev->flags) && 7310 !test_bit(In_sync, &rdev->flags) &&
7289 !test_bit(Faulty, &rdev->flags) && 7311 !test_bit(Faulty, &rdev->flags))
7290 !test_bit(Blocked, &rdev->flags))
7291 spares++; 7312 spares++;
7292 if (rdev->raid_disk < 0 7313 if (rdev->raid_disk < 0
7293 && !test_bit(Faulty, &rdev->flags)) { 7314 && !test_bit(Faulty, &rdev->flags)) {
@@ -7533,7 +7554,8 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7533{ 7554{
7534 sysfs_notify_dirent_safe(rdev->sysfs_state); 7555 sysfs_notify_dirent_safe(rdev->sysfs_state);
7535 wait_event_timeout(rdev->blocked_wait, 7556 wait_event_timeout(rdev->blocked_wait,
7536 !test_bit(Blocked, &rdev->flags), 7557 !test_bit(Blocked, &rdev->flags) &&
7558 !test_bit(BlockedBadBlocks, &rdev->flags),
7537 msecs_to_jiffies(5000)); 7559 msecs_to_jiffies(5000));
7538 rdev_dec_pending(rdev, mddev); 7560 rdev_dec_pending(rdev, mddev);
7539} 7561}
@@ -7779,6 +7801,8 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
7779 } 7801 }
7780 7802
7781 bb->changed = 1; 7803 bb->changed = 1;
7804 if (!acknowledged)
7805 bb->unacked_exist = 1;
7782 write_sequnlock_irq(&bb->lock); 7806 write_sequnlock_irq(&bb->lock);
7783 7807
7784 return rv; 7808 return rv;
@@ -7923,6 +7947,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
7923 p[i] = BB_MAKE(start, len, 1); 7947 p[i] = BB_MAKE(start, len, 1);
7924 } 7948 }
7925 } 7949 }
7950 bb->unacked_exist = 0;
7926 } 7951 }
7927 write_sequnlock_irq(&bb->lock); 7952 write_sequnlock_irq(&bb->lock);
7928} 7953}
@@ -7970,6 +7995,8 @@ retry:
7970 (unsigned long long)s << bb->shift, 7995 (unsigned long long)s << bb->shift,
7971 length << bb->shift); 7996 length << bb->shift);
7972 } 7997 }
7998 if (unack && len == 0)
7999 bb->unacked_exist = 0;
7973 8000
7974 if (read_seqretry(&bb->lock, seq)) 8001 if (read_seqretry(&bb->lock, seq))
7975 goto retry; 8002 goto retry;