aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
author NeilBrown <neilb@suse.de> 2011-07-27 21:39:22 -0400
committer NeilBrown <neilb@suse.de> 2011-07-27 21:39:22 -0400
commit 31c176ecdf3563140e6395249eda51a18130d9f6 (patch)
tree 5ad7dba363214e9d36fa921c221316d2597078ef /drivers
parent 62096bce231b3760882ed91205fc84682d6b0529 (diff)
md/raid5: avoid reading from known bad blocks.
There are two times that we might read in raid5:

1/ when a read request fits within a chunk on a single working device.
   In this case, if there is any bad block in the range of the read, we
   simply fail the cache-bypass read and perform the read through the
   stripe cache.

2/ when reading into the stripe cache. In this case we mark as failed any
   device which has a bad block in that strip (1 page wide).
   Note that we will both avoid reading and avoid writing. This is correct
   (as we will never read from the block, there is no point writing), but
   not optimal (as writing could 'fix' the error) - that will be addressed
   later.

If we have not seen any write errors on the device yet, we treat a bad block like a recent read error. This will encourage an attempt to fix the read error which will either generate a write error, or will ensure good data is stored there. We don't yet forget the bad block in that case. That comes later.

Now that we honour bad blocks when reading we can allow devices with bad blocks into the array.

Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers')
-rw-r--r-- drivers/md/raid5.c 46
1 files changed, 32 insertions, 14 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 304389ba5e27..a2d68389ee75 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2923,6 +2923,9 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
2923 spin_lock_irq(&conf->device_lock); 2923 spin_lock_irq(&conf->device_lock);
2924 for (i=disks; i--; ) { 2924 for (i=disks; i--; ) {
2925 mdk_rdev_t *rdev; 2925 mdk_rdev_t *rdev;
2926 sector_t first_bad;
2927 int bad_sectors;
2928 int is_bad = 0;
2926 2929
2927 dev = &sh->dev[i]; 2930 dev = &sh->dev[i];
2928 2931
@@ -2959,15 +2962,32 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
2959 if (dev->written) 2962 if (dev->written)
2960 s->written++; 2963 s->written++;
2961 rdev = rcu_dereference(conf->disks[i].rdev); 2964 rdev = rcu_dereference(conf->disks[i].rdev);
2962 if (s->blocked_rdev == NULL && 2965 if (rdev) {
2963 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 2966 is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS,
2964 s->blocked_rdev = rdev; 2967 &first_bad, &bad_sectors);
2965 atomic_inc(&rdev->nr_pending); 2968 if (s->blocked_rdev == NULL
2969 && (test_bit(Blocked, &rdev->flags)
2970 || is_bad < 0)) {
2971 if (is_bad < 0)
2972 set_bit(BlockedBadBlocks,
2973 &rdev->flags);
2974 s->blocked_rdev = rdev;
2975 atomic_inc(&rdev->nr_pending);
2976 }
2966 } 2977 }
2967 clear_bit(R5_Insync, &dev->flags); 2978 clear_bit(R5_Insync, &dev->flags);
2968 if (!rdev) 2979 if (!rdev)
2969 /* Not in-sync */; 2980 /* Not in-sync */;
2970 else if (test_bit(In_sync, &rdev->flags)) 2981 else if (is_bad) {
2982 /* also not in-sync */
2983 if (!test_bit(WriteErrorSeen, &rdev->flags)) {
2984 /* treat as in-sync, but with a read error
2985 * which we can now try to correct
2986 */
2987 set_bit(R5_Insync, &dev->flags);
2988 set_bit(R5_ReadError, &dev->flags);
2989 }
2990 } else if (test_bit(In_sync, &rdev->flags))
2971 set_bit(R5_Insync, &dev->flags); 2991 set_bit(R5_Insync, &dev->flags);
2972 else { 2992 else {
2973 /* in sync if before recovery_offset */ 2993 /* in sync if before recovery_offset */
@@ -3471,6 +3491,9 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3471 rcu_read_lock(); 3491 rcu_read_lock();
3472 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3492 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3473 if (rdev && test_bit(In_sync, &rdev->flags)) { 3493 if (rdev && test_bit(In_sync, &rdev->flags)) {
3494 sector_t first_bad;
3495 int bad_sectors;
3496
3474 atomic_inc(&rdev->nr_pending); 3497 atomic_inc(&rdev->nr_pending);
3475 rcu_read_unlock(); 3498 rcu_read_unlock();
3476 raid_bio->bi_next = (void*)rdev; 3499 raid_bio->bi_next = (void*)rdev;
@@ -3478,8 +3501,10 @@ static int chunk_aligned_read(mddev_t *mddev, struct bio * raid_bio)
3478 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3501 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3479 align_bi->bi_sector += rdev->data_offset; 3502 align_bi->bi_sector += rdev->data_offset;
3480 3503
3481 if (!bio_fits_rdev(align_bi)) { 3504 if (!bio_fits_rdev(align_bi) ||
3482 /* too big in some way */ 3505 is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
3506 &first_bad, &bad_sectors)) {
3507 /* too big in some way, or has a known bad block */
3483 bio_put(align_bi); 3508 bio_put(align_bi);
3484 rdev_dec_pending(rdev, mddev); 3509 rdev_dec_pending(rdev, mddev);
3485 return 0; 3510 return 0;
@@ -4671,10 +4696,6 @@ static int run(mddev_t *mddev)
4671 * 0 for a fully functional array, 1 or 2 for a degraded array. 4696 * 0 for a fully functional array, 1 or 2 for a degraded array.
4672 */ 4697 */
4673 list_for_each_entry(rdev, &mddev->disks, same_set) { 4698 list_for_each_entry(rdev, &mddev->disks, same_set) {
4674 if (rdev->badblocks.count) {
4675 printk(KERN_ERR "md/raid5: cannot handle bad blocks yet\n");
4676 goto abort;
4677 }
4678 if (rdev->raid_disk < 0) 4699 if (rdev->raid_disk < 0)
4679 continue; 4700 continue;
4680 if (test_bit(In_sync, &rdev->flags)) { 4701 if (test_bit(In_sync, &rdev->flags)) {
@@ -4983,9 +5004,6 @@ static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
4983 int first = 0; 5004 int first = 0;
4984 int last = conf->raid_disks - 1; 5005 int last = conf->raid_disks - 1;
4985 5006
4986 if (rdev->badblocks.count)
4987 return -EINVAL;
4988
4989 if (has_failed(conf)) 5007 if (has_failed(conf))
4990 /* no point adding a device */ 5008 /* no point adding a device */
4991 return -EINVAL; 5009 return -EINVAL;