aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorNeilBrown <neilb@suse.de>2011-07-27 21:31:48 -0400
committerNeilBrown <neilb@suse.de>2011-07-27 21:31:48 -0400
commitde393cdea66cbd63c90725663f400c76faf1b255 (patch)
tree6a2bf37bee98bf7de42856f904bd23c81e082f8e
parentd7a9d443bc8a75a24873c0506f50051edfedc714 (diff)
md: make it easier to wait for bad blocks to be acknowledged.
It is only safe to choose not to write to a bad block if that bad block is safely recorded in metadata - i.e. if it has been 'acknowledged'. If it hasn't we need to wait for the acknowledgement. We support that using rdev->blocked_wait and md_wait_for_blocked_rdev by introducing a new device flag 'BlockedBadBlocks'. This flag is only advisory. It is cleared whenever we acknowledge a bad block, so that a waiter can re-check the particular bad blocks that it is interested in. It should be set by a caller when they find they need to wait. This (set after test) is inherently racy, but as md_wait_for_blocked_rdev already has a timeout, losing the race will have minimal impact. When we clear "Blocked" we also clear "BlockedBadBlocks" in case it was set incorrectly (see above race). We also modify the way we manage 'Blocked' to fit better with the new handling of 'BlockedBadBlocks' and to make it consistent between externally managed and internally managed metadata. This requires that each raidXd loop checks if the metadata needs to be written and triggers a write (md_check_recovery) if needed. Otherwise a queued write request might cause raidXd to wait for the metadata to write, and only that thread can write it. Before writing metadata, we set FaultRecorded for all devices that are Faulty, then after writing the metadata we clear Blocked for any device for which the Fault was certainly Recorded. The 'faulty' device flag now appears in sysfs if the device is faulty *or* it has unacknowledged bad blocks. So user-space which does not understand bad blocks can continue to function correctly. User space which does, should not assume a device is faulty until it sees the 'faulty' flag, and then sees the list of unacknowledged bad blocks is empty. Signed-off-by: NeilBrown <neilb@suse.de>
-rw-r--r--drivers/md/md.c77
-rw-r--r--drivers/md/md.h25
-rw-r--r--drivers/md/raid1.c3
-rw-r--r--drivers/md/raid10.c3
-rw-r--r--drivers/md/raid5.c4
5 files changed, 85 insertions, 27 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1520d18c5af5..a6b6471da2bc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2341,8 +2341,18 @@ repeat:
2341 if (!mddev->persistent) { 2341 if (!mddev->persistent) {
2342 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2342 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2343 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2343 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2344 if (!mddev->external) 2344 if (!mddev->external) {
2345 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2345 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2346 list_for_each_entry(rdev, &mddev->disks, same_set) {
2347 if (rdev->badblocks.changed) {
2348 md_ack_all_badblocks(&rdev->badblocks);
2349 md_error(mddev, rdev);
2350 }
2351 clear_bit(Blocked, &rdev->flags);
2352 clear_bit(BlockedBadBlocks, &rdev->flags);
2353 wake_up(&rdev->blocked_wait);
2354 }
2355 }
2346 wake_up(&mddev->sb_wait); 2356 wake_up(&mddev->sb_wait);
2347 return; 2357 return;
2348 } 2358 }
@@ -2399,9 +2409,12 @@ repeat:
2399 mddev->events --; 2409 mddev->events --;
2400 } 2410 }
2401 2411
2402 list_for_each_entry(rdev, &mddev->disks, same_set) 2412 list_for_each_entry(rdev, &mddev->disks, same_set) {
2403 if (rdev->badblocks.changed) 2413 if (rdev->badblocks.changed)
2404 any_badblocks_changed++; 2414 any_badblocks_changed++;
2415 if (test_bit(Faulty, &rdev->flags))
2416 set_bit(FaultRecorded, &rdev->flags);
2417 }
2405 2418
2406 sync_sbs(mddev, nospares); 2419 sync_sbs(mddev, nospares);
2407 spin_unlock_irq(&mddev->write_lock); 2420 spin_unlock_irq(&mddev->write_lock);
@@ -2458,9 +2471,15 @@ repeat:
2458 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2471 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2459 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2472 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2460 2473
2461 if (any_badblocks_changed) 2474 list_for_each_entry(rdev, &mddev->disks, same_set) {
2462 list_for_each_entry(rdev, &mddev->disks, same_set) 2475 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2476 clear_bit(Blocked, &rdev->flags);
2477
2478 if (any_badblocks_changed)
2463 md_ack_all_badblocks(&rdev->badblocks); 2479 md_ack_all_badblocks(&rdev->badblocks);
2480 clear_bit(BlockedBadBlocks, &rdev->flags);
2481 wake_up(&rdev->blocked_wait);
2482 }
2464} 2483}
2465 2484
2466/* words written to sysfs files may, or may not, be \n terminated. 2485/* words written to sysfs files may, or may not, be \n terminated.
@@ -2495,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page)
2495 char *sep = ""; 2514 char *sep = "";
2496 size_t len = 0; 2515 size_t len = 0;
2497 2516
2498 if (test_bit(Faulty, &rdev->flags)) { 2517 if (test_bit(Faulty, &rdev->flags) ||
2518 rdev->badblocks.unacked_exist) {
2499 len+= sprintf(page+len, "%sfaulty",sep); 2519 len+= sprintf(page+len, "%sfaulty",sep);
2500 sep = ","; 2520 sep = ",";
2501 } 2521 }
@@ -2507,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page)
2507 len += sprintf(page+len, "%swrite_mostly",sep); 2527 len += sprintf(page+len, "%swrite_mostly",sep);
2508 sep = ","; 2528 sep = ",";
2509 } 2529 }
2510 if (test_bit(Blocked, &rdev->flags)) { 2530 if (test_bit(Blocked, &rdev->flags) ||
2531 rdev->badblocks.unacked_exist) {
2511 len += sprintf(page+len, "%sblocked", sep); 2532 len += sprintf(page+len, "%sblocked", sep);
2512 sep = ","; 2533 sep = ",";
2513 } 2534 }
@@ -2527,12 +2548,12 @@ static ssize_t
2527state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2548state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2528{ 2549{
2529 /* can write 2550 /* can write
2530 * faulty - simulates and error 2551 * faulty - simulates an error
2531 * remove - disconnects the device 2552 * remove - disconnects the device
2532 * writemostly - sets write_mostly 2553 * writemostly - sets write_mostly
2533 * -writemostly - clears write_mostly 2554 * -writemostly - clears write_mostly
2534 * blocked - sets the Blocked flag 2555 * blocked - sets the Blocked flags
2535 * -blocked - clears the Blocked flag 2556 * -blocked - clears the Blocked and possibly simulates an error
2536 * insync - sets Insync providing device isn't active 2557 * insync - sets Insync providing device isn't active
2537 * write_error - sets WriteErrorSeen 2558 * write_error - sets WriteErrorSeen
2538 * -write_error - clears WriteErrorSeen 2559 * -write_error - clears WriteErrorSeen
@@ -2562,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2562 set_bit(Blocked, &rdev->flags); 2583 set_bit(Blocked, &rdev->flags);
2563 err = 0; 2584 err = 0;
2564 } else if (cmd_match(buf, "-blocked")) { 2585 } else if (cmd_match(buf, "-blocked")) {
2586 if (!test_bit(Faulty, &rdev->flags) &&
2587 test_bit(BlockedBadBlocks, &rdev->flags)) {
2588 /* metadata handler doesn't understand badblocks,
2589 * so we need to fail the device
2590 */
2591 md_error(rdev->mddev, rdev);
2592 }
2565 clear_bit(Blocked, &rdev->flags); 2593 clear_bit(Blocked, &rdev->flags);
2594 clear_bit(BlockedBadBlocks, &rdev->flags);
2566 wake_up(&rdev->blocked_wait); 2595 wake_up(&rdev->blocked_wait);
2567 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2596 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2568 md_wakeup_thread(rdev->mddev->thread); 2597 md_wakeup_thread(rdev->mddev->thread);
@@ -2881,7 +2910,11 @@ static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
2881} 2910}
2882static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len) 2911static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
2883{ 2912{
2884 return badblocks_store(&rdev->badblocks, page, len, 0); 2913 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
2914 /* Maybe that ack was all we needed */
2915 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
2916 wake_up(&rdev->blocked_wait);
2917 return rv;
2885} 2918}
2886static struct rdev_sysfs_entry rdev_bad_blocks = 2919static struct rdev_sysfs_entry rdev_bad_blocks =
2887__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 2920__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
@@ -6398,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
6398 if (!rdev || test_bit(Faulty, &rdev->flags)) 6431 if (!rdev || test_bit(Faulty, &rdev->flags))
6399 return; 6432 return;
6400 6433
6401 if (mddev->external) 6434 if (!mddev->pers || !mddev->pers->error_handler)
6402 set_bit(Blocked, &rdev->flags);
6403/*
6404 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
6405 mdname(mddev),
6406 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
6407 __builtin_return_address(0),__builtin_return_address(1),
6408 __builtin_return_address(2),__builtin_return_address(3));
6409*/
6410 if (!mddev->pers)
6411 return;
6412 if (!mddev->pers->error_handler)
6413 return; 6435 return;
6414 mddev->pers->error_handler(mddev,rdev); 6436 mddev->pers->error_handler(mddev,rdev);
6415 if (mddev->degraded) 6437 if (mddev->degraded)
@@ -7286,8 +7308,7 @@ static int remove_and_add_spares(mddev_t *mddev)
7286 list_for_each_entry(rdev, &mddev->disks, same_set) { 7308 list_for_each_entry(rdev, &mddev->disks, same_set) {
7287 if (rdev->raid_disk >= 0 && 7309 if (rdev->raid_disk >= 0 &&
7288 !test_bit(In_sync, &rdev->flags) && 7310 !test_bit(In_sync, &rdev->flags) &&
7289 !test_bit(Faulty, &rdev->flags) && 7311 !test_bit(Faulty, &rdev->flags))
7290 !test_bit(Blocked, &rdev->flags))
7291 spares++; 7312 spares++;
7292 if (rdev->raid_disk < 0 7313 if (rdev->raid_disk < 0
7293 && !test_bit(Faulty, &rdev->flags)) { 7314 && !test_bit(Faulty, &rdev->flags)) {
@@ -7533,7 +7554,8 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7533{ 7554{
7534 sysfs_notify_dirent_safe(rdev->sysfs_state); 7555 sysfs_notify_dirent_safe(rdev->sysfs_state);
7535 wait_event_timeout(rdev->blocked_wait, 7556 wait_event_timeout(rdev->blocked_wait,
7536 !test_bit(Blocked, &rdev->flags), 7557 !test_bit(Blocked, &rdev->flags) &&
7558 !test_bit(BlockedBadBlocks, &rdev->flags),
7537 msecs_to_jiffies(5000)); 7559 msecs_to_jiffies(5000));
7538 rdev_dec_pending(rdev, mddev); 7560 rdev_dec_pending(rdev, mddev);
7539} 7561}
@@ -7779,6 +7801,8 @@ static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
7779 } 7801 }
7780 7802
7781 bb->changed = 1; 7803 bb->changed = 1;
7804 if (!acknowledged)
7805 bb->unacked_exist = 1;
7782 write_sequnlock_irq(&bb->lock); 7806 write_sequnlock_irq(&bb->lock);
7783 7807
7784 return rv; 7808 return rv;
@@ -7923,6 +7947,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
7923 p[i] = BB_MAKE(start, len, 1); 7947 p[i] = BB_MAKE(start, len, 1);
7924 } 7948 }
7925 } 7949 }
7950 bb->unacked_exist = 0;
7926 } 7951 }
7927 write_sequnlock_irq(&bb->lock); 7952 write_sequnlock_irq(&bb->lock);
7928} 7953}
@@ -7970,6 +7995,8 @@ retry:
7970 (unsigned long long)s << bb->shift, 7995 (unsigned long long)s << bb->shift,
7971 length << bb->shift); 7996 length << bb->shift);
7972 } 7997 }
7998 if (unack && len == 0)
7999 bb->unacked_exist = 0;
7973 8000
7974 if (read_seqretry(&bb->lock, seq)) 8001 if (read_seqretry(&bb->lock, seq))
7975 goto retry; 8002 goto retry;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fa4b607854ac..1e586bb4452e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -81,12 +81,29 @@ struct mdk_rdev_s
81#define In_sync 2 /* device is in_sync with rest of array */ 81#define In_sync 2 /* device is in_sync with rest of array */
82#define WriteMostly 4 /* Avoid reading if at all possible */ 82#define WriteMostly 4 /* Avoid reading if at all possible */
83#define AutoDetected 7 /* added by auto-detect */ 83#define AutoDetected 7 /* added by auto-detect */
84#define Blocked 8 /* An error occurred on an externally 84#define Blocked 8 /* An error occurred but has not yet
85 * managed array, don't allow writes 85 * been acknowledged by the metadata
86 * handler, so don't allow writes
86 * until it is cleared */ 87 * until it is cleared */
87#define WriteErrorSeen 9 /* A write error has been seen on this 88#define WriteErrorSeen 9 /* A write error has been seen on this
88 * device 89 * device
89 */ 90 */
91#define FaultRecorded 10 /* Intermediate state for clearing
92 * Blocked. The Fault is/will-be
93 * recorded in the metadata, but that
94 * metadata hasn't been stored safely
95 * on disk yet.
96 */
97#define BlockedBadBlocks 11 /* A writer is blocked because they
98 * found an unacknowledged bad-block.
99 * This can safely be cleared at any
100 * time, and the writer will re-check.
101 * It may be set at any time, and at
102 * worst the writer will timeout and
103 * re-check. So setting it as
104 * accurately as possible is good, but
105 * not absolutely critical.
106 */
90 wait_queue_head_t blocked_wait; 107 wait_queue_head_t blocked_wait;
91 108
92 int desc_nr; /* descriptor index in the superblock */ 109 int desc_nr; /* descriptor index in the superblock */
@@ -124,6 +141,10 @@ struct mdk_rdev_s
124 141
125 struct badblocks { 142 struct badblocks {
126 int count; /* count of bad blocks */ 143 int count; /* count of bad blocks */
144 int unacked_exist; /* there probably are unacknowledged
145 * bad blocks. This is only cleared
146 * when a read discovers none
147 */
127 int shift; /* shift from sectors to block size 148 int shift; /* shift from sectors to block size
128 * a -ve shift means badblocks are 149 * a -ve shift means badblocks are
129 * disabled.*/ 150 * disabled.*/
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8c31c39b6f8c..4d40d9d54a20 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1059,6 +1059,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1059 conf->recovery_disabled = mddev->recovery_disabled; 1059 conf->recovery_disabled = mddev->recovery_disabled;
1060 return; 1060 return;
1061 } 1061 }
1062 set_bit(Blocked, &rdev->flags);
1062 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1063 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1063 unsigned long flags; 1064 unsigned long flags;
1064 spin_lock_irqsave(&conf->device_lock, flags); 1065 spin_lock_irqsave(&conf->device_lock, flags);
@@ -1751,6 +1752,8 @@ read_more:
1751 generic_make_request(r1_bio->bios[r1_bio->read_disk]); 1752 generic_make_request(r1_bio->bios[r1_bio->read_disk]);
1752 } 1753 }
1753 cond_resched(); 1754 cond_resched();
1755 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
1756 md_check_recovery(mddev);
1754 } 1757 }
1755 blk_finish_plug(&plug); 1758 blk_finish_plug(&plug);
1756} 1759}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8aadd2f52dc8..fe6692e62215 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1021,6 +1021,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1021 */ 1021 */
1022 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1022 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1023 } 1023 }
1024 set_bit(Blocked, &rdev->flags);
1024 set_bit(Faulty, &rdev->flags); 1025 set_bit(Faulty, &rdev->flags);
1025 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1026 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1026 printk(KERN_ALERT 1027 printk(KERN_ALERT
@@ -1703,6 +1704,8 @@ static void raid10d(mddev_t *mddev)
1703 } 1704 }
1704 } 1705 }
1705 cond_resched(); 1706 cond_resched();
1707 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
1708 md_check_recovery(mddev);
1706 } 1709 }
1707 blk_finish_plug(&plug); 1710 blk_finish_plug(&plug);
1708} 1711}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 719445004dd9..304389ba5e27 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1706,6 +1706,7 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1706 */ 1706 */
1707 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1707 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1708 } 1708 }
1709 set_bit(Blocked, &rdev->flags);
1709 set_bit(Faulty, &rdev->flags); 1710 set_bit(Faulty, &rdev->flags);
1710 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1711 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1711 printk(KERN_ALERT 1712 printk(KERN_ALERT
@@ -4143,6 +4144,9 @@ static void raid5d(mddev_t *mddev)
4143 release_stripe(sh); 4144 release_stripe(sh);
4144 cond_resched(); 4145 cond_resched();
4145 4146
4147 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
4148 md_check_recovery(mddev);
4149
4146 spin_lock_irq(&conf->device_lock); 4150 spin_lock_irq(&conf->device_lock);
4147 } 4151 }
4148 pr_debug("%d stripes handled\n", handled); 4152 pr_debug("%d stripes handled\n", handled);