author     NeilBrown <neilb@suse.de>    2011-07-27 21:39:24 -0400
committer  NeilBrown <neilb@suse.de>    2011-07-27 21:39:24 -0400
commit     d4432c23be957ff061f7b23fd60e8506cb472a55 (patch)
tree       1e9e8cbc75721e62300bb2251658de870bf79d8a /drivers/md
parent     e875ecea266a543e643b19e44cf472f1412708f9 (diff)
md/raid10: avoid writing to known bad blocks on known bad drives.
Writing to known bad blocks on drives that have seen a write error is
asking for trouble, so try to avoid these blocks.

Signed-off-by: NeilBrown <neilb@suse.de>
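To make the rule concrete, below is a minimal user-space sketch of the per-device check the patch adds to make_request(): if a device has seen a write error and a known bad block overlaps the request, either block (unacknowledged bad block), skip the device, or shrink the number of sectors written in this pass. struct badrange, enum verdict and check_device() are illustrative stand-ins only; the kernel code uses is_badblock() and the WriteErrorSeen rdev flag.

/* badblock_clamp.c - user-space sketch of the per-device check this patch
 * adds to make_request().  The kernel version uses is_badblock() and the
 * WriteErrorSeen rdev flag; struct badrange and check_device() here are
 * illustrative stand-ins, not kernel API.
 */
#include <stdio.h>

typedef unsigned long long sector_t;

struct badrange {               /* one known bad-block range on a device */
	sector_t start;
	sector_t len;
	int acked;              /* acknowledged, i.e. safe to write around */
};

enum verdict { WRITE_OK, SKIP_DEVICE, MUST_BLOCK };

/* Decide whether [dev_sector, dev_sector + *max_sectors) may be written
 * to a device with this bad range, shrinking *max_sectors if the write
 * has to stop short of (or skip over) the bad blocks. */
static enum verdict check_device(const struct badrange *bb,
				 sector_t dev_sector, sector_t *max_sectors)
{
	sector_t bad_end = bb->start + bb->len;

	if (bad_end <= dev_sector || bb->start >= dev_sector + *max_sectors)
		return WRITE_OK;        /* no overlap with this request */

	if (!bb->acked)
		return MUST_BLOCK;      /* wait until the bad block is acknowledged */

	if (bb->start <= dev_sector) {
		/* First sector is already bad: skip this device entirely,
		 * but cap the pass so the other copies stop where the bad
		 * range ends and a later pass can resume here. */
		sector_t bad_sectors = bad_end - dev_sector;
		if (bad_sectors < *max_sectors)
			*max_sectors = bad_sectors;
		return SKIP_DEVICE;
	}

	/* Bad range starts later: only the leading good sectors fit. */
	if (bb->start - dev_sector < *max_sectors)
		*max_sectors = bb->start - dev_sector;
	return WRITE_OK;
}

int main(void)
{
	struct badrange bb = { .start = 100, .len = 8, .acked = 1 };
	sector_t max_sectors = 64;

	enum verdict v = check_device(&bb, 90, &max_sectors);
	printf("verdict %d, max_sectors now %llu\n", v, max_sectors);
	/* prints: verdict 0, max_sectors now 10 (stop before the bad block) */
	return 0;
}

Shrinking max_sectors for every copy keeps all the devices in step, so the remainder of the request can be retried as a further r10_bio.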
Diffstat (limited to 'drivers/md')
-rw-r--r--   drivers/md/raid10.c   105
1 file changed, 93 insertions(+), 12 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index de6089926273..13077a3fd7d2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -807,6 +807,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
 	int plugged;
+	int sectors_handled;
+	int max_sectors;
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
@@ -895,7 +897,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		/*
 		 * read balancing logic:
 		 */
-		int max_sectors;
 		int disk;
 		int slot;
 
@@ -925,8 +926,6 @@ read_again:
 			/* Could not read all from this device, so we will
 			 * need another r10_bio.
 			 */
-			int sectors_handled;
-
 			sectors_handled = (r10_bio->sectors + max_sectors
 					   - bio->bi_sector);
 			r10_bio->sectors = max_sectors;
@@ -963,13 +962,22 @@ read_again:
 	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev. Record them by setting
 	 * bios[x] to bio
+	 * If there are known/acknowledged bad blocks on any device
+	 * on which we have seen a write error, we want to avoid
+	 * writing to those blocks. This potentially requires several
+	 * writes to write around the bad blocks. Each set of writes
+	 * gets its own r10_bio with a set of bios attached. The number
+	 * of r10_bios is recorded in bio->bi_phys_segments just as with
+	 * the read case.
 	 */
 	plugged = mddev_check_plugged(mddev);
 
 	raid10_find_phys(conf, r10_bio);
- retry_write:
+retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
+	max_sectors = r10_bio->sectors;
+
 	for (i = 0; i < conf->copies; i++) {
 		int d = r10_bio->devs[i].devnum;
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -978,13 +986,55 @@ read_again:
 			blocked_rdev = rdev;
 			break;
 		}
-		if (rdev && !test_bit(Faulty, &rdev->flags)) {
-			atomic_inc(&rdev->nr_pending);
-			r10_bio->devs[i].bio = bio;
-		} else {
-			r10_bio->devs[i].bio = NULL;
+		r10_bio->devs[i].bio = NULL;
+		if (!rdev || test_bit(Faulty, &rdev->flags)) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
+			continue;
+		}
+		if (test_bit(WriteErrorSeen, &rdev->flags)) {
+			sector_t first_bad;
+			sector_t dev_sector = r10_bio->devs[i].addr;
+			int bad_sectors;
+			int is_bad;
+
+			is_bad = is_badblock(rdev, dev_sector,
+					     max_sectors,
+					     &first_bad, &bad_sectors);
+			if (is_bad < 0) {
+				/* Mustn't write here until the bad block
+				 * is acknowledged
+				 */
+				atomic_inc(&rdev->nr_pending);
+				set_bit(BlockedBadBlocks, &rdev->flags);
+				blocked_rdev = rdev;
+				break;
+			}
+			if (is_bad && first_bad <= dev_sector) {
+				/* Cannot write here at all */
+				bad_sectors -= (dev_sector - first_bad);
+				if (bad_sectors < max_sectors)
+					/* Mustn't write more than bad_sectors
+					 * to other devices yet
+					 */
+					max_sectors = bad_sectors;
+				/* We don't set R10BIO_Degraded as that
+				 * only applies if the disk is missing,
+				 * so it might be re-added, and we want to
+				 * know to recover this chunk.
+				 * In this case the device is here, and the
+				 * fact that this chunk is not in-sync is
+				 * recorded in the bad block log.
+				 */
+				continue;
+			}
+			if (is_bad) {
+				int good_sectors = first_bad - dev_sector;
+				if (good_sectors < max_sectors)
+					max_sectors = good_sectors;
+			}
+		}
+		r10_bio->devs[i].bio = bio;
+		atomic_inc(&rdev->nr_pending);
 	}
 	rcu_read_unlock();
 
@@ -1004,8 +1054,22 @@ read_again:
 		goto retry_write;
 	}
 
+	if (max_sectors < r10_bio->sectors) {
+		/* We are splitting this into multiple parts, so
+		 * we need to prepare for allocating another r10_bio.
+		 */
+		r10_bio->sectors = max_sectors;
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
+	}
+	sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
+
 	atomic_set(&r10_bio->remaining, 1);
-	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
+	bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
 
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
@@ -1014,10 +1078,12 @@ read_again:
 			continue;
 
 		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+			    max_sectors);
 		r10_bio->devs[i].bio = mbio;
 
-		mbio->bi_sector = r10_bio->devs[i].addr+
-			conf->mirrors[d].rdev->data_offset;
+		mbio->bi_sector = (r10_bio->devs[i].addr+
+				   conf->mirrors[d].rdev->data_offset);
 		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		mbio->bi_end_io = raid10_end_write_request;
 		mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -1042,6 +1108,21 @@ read_again:
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
+	if (sectors_handled < (bio->bi_size >> 9)) {
+		/* We need another r10_bio. It has already been counted
+		 * in bio->bi_phys_segments.
+		 */
+		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+		r10_bio->master_bio = bio;
+		r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+
+		r10_bio->mddev = mddev;
+		r10_bio->sector = bio->bi_sector + sectors_handled;
+		r10_bio->state = 0;
+		goto retry_write;
+	}
+
 	if (do_sync || !mddev->bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
 	return 0;
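
For orientation, here is a toy user-space model of the write-splitting loop added above: each pass writes at most max_sectors, bumps the bio->bi_phys_segments-style counter whenever another pass will follow, and resumes at bio->bi_sector + sectors_handled. next_chunk() and the sector counts are invented for illustration; only the bookkeeping pattern mirrors the patch.

/* split_loop.c - toy model of the write-splitting loop added above.
 * next_chunk() is an invented stand-in for the per-pass bad-block
 * clamping; only the sectors_handled / bi_phys_segments bookkeeping
 * mirrors the patch.
 */
#include <stdio.h>

static unsigned long next_chunk(unsigned long offset)
{
	/* pretend a bad block limits every pass after the first to 16 sectors */
	return offset == 0 ? 48 : 16;
}

int main(void)
{
	unsigned long total = 96;       /* sectors in the master bio */
	unsigned long handled = 0;      /* sectors_handled */
	int phys_segments = 0;          /* bio->bi_phys_segments analogue */

	while (handled < total) {
		unsigned long max_sectors = next_chunk(handled);

		if (max_sectors > total - handled)
			max_sectors = total - handled;
		if (handled + max_sectors < total) {
			/* splitting: count this r10_bio plus the one to come */
			if (phys_segments == 0)
				phys_segments = 2;
			else
				phys_segments++;
		}
		printf("r10_bio at sector %lu, %lu sectors\n", handled, max_sectors);
		handled += max_sectors;
	}
	printf("r10_bios counted in bi_phys_segments: %d\n", phys_segments);
	return 0;
}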