| field | value |
|---|---|
| author | NeilBrown <neilb@suse.de>, 2011-07-27 21:39:24 -0400 |
| committer | NeilBrown <neilb@suse.de>, 2011-07-27 21:39:24 -0400 |
| commit | d4432c23be957ff061f7b23fd60e8506cb472a55 (patch) |
| tree | 1e9e8cbc75721e62300bb2251658de870bf79d8a /drivers/md/raid10.c |
| parent | e875ecea266a543e643b19e44cf472f1412708f9 (diff) |
md/raid10: avoid writing to known bad blocks on known bad drives.
Writing to known bad blocks on drives that have seen a write error
is asking for trouble. So try to avoid these blocks.
Signed-off-by: NeilBrown <neilb@suse.de>
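At its core the change is interval clipping: before issuing a write, the request is shortened so it stops at the first known bad sector on any participating device, and the remainder is retried as a separate write. A minimal userspace C sketch of that clipping rule (the names `clip_before_bad`, `bad_start` and `bad_len` are invented for illustration; the kernel queries its per-device bad-block table through `is_badblock()` instead):

```c
#include <stdint.h>
#include <stdio.h>

/* Simplified sketch: clip a write of 'len' sectors starting at 'start'
 * against one known bad range [bad_start, bad_start + bad_len).
 * Returns the number of leading sectors that are safe to write now,
 * which may be 0 (the write begins inside the bad range).
 */
static uint64_t clip_before_bad(uint64_t start, uint64_t len,
                                uint64_t bad_start, uint64_t bad_len)
{
        if (bad_start >= start + len || bad_start + bad_len <= start)
                return len;            /* no overlap: whole write is safe */
        if (bad_start <= start)
                return 0;              /* write begins inside the bad range */
        return bad_start - start;      /* write the good prefix only */
}

int main(void)
{
        /* A 64-sector write at sector 100; 8 bad sectors start at 120. */
        printf("%llu\n", (unsigned long long)clip_before_bad(100, 64, 120, 8));
        return 0;       /* prints 20: only sectors 100..119 go out now */
}
```

The deferred remainder is exactly what the patch below handles by allocating follow-up r10_bios.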
Diffstat (limited to 'drivers/md/raid10.c')

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | drivers/md/raid10.c | 105 |

1 file changed, 93 insertions(+), 12 deletions(-)
```diff
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index de6089926273..13077a3fd7d2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -807,6 +807,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
 	int plugged;
+	int sectors_handled;
+	int max_sectors;
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
@@ -895,7 +897,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		/*
 		 * read balancing logic:
 		 */
-		int max_sectors;
 		int disk;
 		int slot;
 
@@ -925,8 +926,6 @@ read_again:
 			/* Could not read all from this device, so we will
 			 * need another r10_bio.
 			 */
-			int sectors_handled;
-
 			sectors_handled = (r10_bio->sectors + max_sectors
 					   - bio->bi_sector);
 			r10_bio->sectors = max_sectors;
@@ -963,13 +962,22 @@ read_again:
 	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev. Record them by setting
 	 * bios[x] to bio
+	 * If there are known/acknowledged bad blocks on any device
+	 * on which we have seen a write error, we want to avoid
+	 * writing to those blocks. This potentially requires several
+	 * writes to write around the bad blocks. Each set of writes
+	 * gets its own r10_bio with a set of bios attached. The number
+	 * of r10_bios is recorded in bio->bi_phys_segments just as with
+	 * the read case.
 	 */
 	plugged = mddev_check_plugged(mddev);
 
 	raid10_find_phys(conf, r10_bio);
 retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
+	max_sectors = r10_bio->sectors;
+
 	for (i = 0; i < conf->copies; i++) {
 		int d = r10_bio->devs[i].devnum;
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
```
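The per-device check that follows leans on the three-way return convention of `is_badblock()`, as documented by the comments in the hunk below: a negative result means the range overlaps a bad block that has not yet been acknowledged (the write must wait), a positive result means it overlaps known, acknowledged bad sectors reported through `first_bad` and `bad_sectors`, and zero means the range is clean. A toy stand-in illustrating just that calling convention (`toy_badblocks` and `toy_is_badblock` are invented for the sketch; the real table lives in the rdev and tracks many ranges):

```c
#include <stdint.h>

typedef uint64_t sector_t;

/* Toy stand-in for md's per-device bad-block table: one entry only. */
struct toy_badblocks {
        sector_t start;
        int      len;
        int      acked;         /* recorded in the on-disk bad-block log? */
};

/* Mirrors the convention the diff relies on:
 *   0 -> no bad blocks in [s, s + sectors)
 *   1 -> overlap with acknowledged bad blocks; outputs are set
 *  -1 -> overlap with an unacknowledged bad block; caller must wait
 */
static int toy_is_badblock(const struct toy_badblocks *bb,
                           sector_t s, int sectors,
                           sector_t *first_bad, int *bad_sectors)
{
        if (bb->start >= s + sectors || bb->start + bb->len <= s)
                return 0;
        if (!bb->acked)
                return -1;
        *first_bad = bb->start;
        *bad_sectors = bb->len;
        return 1;
}
```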
```diff
@@ -978,13 +986,55 @@ read_again:
 			blocked_rdev = rdev;
 			break;
 		}
-		if (rdev && !test_bit(Faulty, &rdev->flags)) {
-			atomic_inc(&rdev->nr_pending);
-			r10_bio->devs[i].bio = bio;
-		} else {
-			r10_bio->devs[i].bio = NULL;
+		r10_bio->devs[i].bio = NULL;
+		if (!rdev || test_bit(Faulty, &rdev->flags)) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
+			continue;
+		}
+		if (test_bit(WriteErrorSeen, &rdev->flags)) {
+			sector_t first_bad;
+			sector_t dev_sector = r10_bio->devs[i].addr;
+			int bad_sectors;
+			int is_bad;
+
+			is_bad = is_badblock(rdev, dev_sector,
+					     max_sectors,
+					     &first_bad, &bad_sectors);
+			if (is_bad < 0) {
+				/* Mustn't write here until the bad block
+				 * is acknowledged
+				 */
+				atomic_inc(&rdev->nr_pending);
+				set_bit(BlockedBadBlocks, &rdev->flags);
+				blocked_rdev = rdev;
+				break;
+			}
+			if (is_bad && first_bad <= dev_sector) {
+				/* Cannot write here at all */
+				bad_sectors -= (dev_sector - first_bad);
+				if (bad_sectors < max_sectors)
+					/* Mustn't write more than bad_sectors
+					 * to other devices yet
+					 */
+					max_sectors = bad_sectors;
+				/* We don't set R10BIO_Degraded as that
+				 * only applies if the disk is missing,
+				 * so it might be re-added, and we want to
+				 * know to recover this chunk.
+				 * In this case the device is here, and the
+				 * fact that this chunk is not in-sync is
+				 * recorded in the bad block log.
+				 */
+				continue;
+			}
+			if (is_bad) {
+				int good_sectors = first_bad - dev_sector;
+				if (good_sectors < max_sectors)
+					max_sectors = good_sectors;
+			}
 		}
+		r10_bio->devs[i].bio = bio;
+		atomic_inc(&rdev->nr_pending);
 	}
 	rcu_read_unlock();
 
```
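A subtle point in the hunk above: when the write begins inside a bad range, the device is skipped for this chunk (`continue`) without setting R10BIO_Degraded, yet `max_sectors` is still capped at the remaining bad length, so the device rejoins as soon as the next r10_bio starts past its bad sectors. A standalone sketch of that narrowing decision (`narrow_for_device` is a hypothetical helper; the kernel does this inline in the loop, and the blocked, unacknowledged case is omitted here):

```c
#include <stdint.h>

typedef uint64_t sector_t;

/* Given one device's is_badblock() result, shrink *max_sectors and
 * report whether the device takes part in the current r10_bio.
 * Returns 1 if it participates, 0 if it is skipped for this chunk.
 */
static int narrow_for_device(sector_t dev_sector, int *max_sectors,
                             int is_bad, sector_t first_bad, int bad_sectors)
{
        if (!is_bad)
                return 1;               /* no bad blocks in range */
        if (first_bad <= dev_sector) {
                /* Write starts inside the bad range: skip this device,
                 * but cap the chunk at the remaining bad length so the
                 * device can rejoin on the very next r10_bio.
                 */
                bad_sectors -= (int)(dev_sector - first_bad);
                if (bad_sectors < *max_sectors)
                        *max_sectors = bad_sectors;
                return 0;
        }
        /* Bad range starts later: only the good prefix is written now. */
        if ((int)(first_bad - dev_sector) < *max_sectors)
                *max_sectors = (int)(first_bad - dev_sector);
        return 1;
}
```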
```diff
@@ -1004,8 +1054,22 @@ read_again:
 		goto retry_write;
 	}
 
+	if (max_sectors < r10_bio->sectors) {
+		/* We are splitting this into multiple parts, so
+		 * we need to prepare for allocating another r10_bio.
+		 */
+		r10_bio->sectors = max_sectors;
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
+	}
+	sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
+
 	atomic_set(&r10_bio->remaining, 1);
-	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
+	bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
 
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
@@ -1014,10 +1078,12 @@ read_again:
 			continue;
 
 		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+			    max_sectors);
 		r10_bio->devs[i].bio = mbio;
 
-		mbio->bi_sector	= r10_bio->devs[i].addr+
-			conf->mirrors[d].rdev->data_offset;
+		mbio->bi_sector	= (r10_bio->devs[i].addr+
+				   conf->mirrors[d].rdev->data_offset);
 		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		mbio->bi_end_io	= raid10_end_write_request;
 		mbio->bi_rw = WRITE | do_sync | do_fua;
```
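Two pieces of bookkeeping make the split work: `bio->bi_phys_segments` is borrowed as a count of outstanding r10_bios (it jumps straight to 2 on the first split because the original r10_bio must be counted as well), and each clone is trimmed with `md_trim_bio()` to the window the current r10_bio covers. A small sketch of the offset arithmetic only (`window_math` is invented for illustration; `md_trim_bio()` is the kernel helper that does the actual bio surgery):

```c
#include <stdint.h>
#include <stdio.h>

typedef uint64_t sector_t;

/* Reproduces the offset/length math fed to md_trim_bio() and the
 * sectors_handled computation; pure arithmetic, no kernel types.
 */
static sector_t window_math(sector_t bio_sector, sector_t r10_sector,
                            int max_sectors)
{
        sector_t trim_offset = r10_sector - bio_sector; /* sectors to skip */
        int      trim_len    = max_sectors;             /* sectors to keep */

        printf("trim offset %llu, length %d\n",
               (unsigned long long)trim_offset, trim_len);
        return r10_sector + max_sectors - bio_sector;   /* sectors_handled */
}

int main(void)
{
        /* A 200-sector write at sector 1000 whose first window is
         * clipped to 64 sectors: the clone skips 0 sectors and keeps
         * 64, sectors_handled becomes 64, so the next r10_bio starts
         * at sector 1064 and the process repeats until 200 is reached.
         */
        sector_t handled = window_math(1000, 1000, 64);

        printf("sectors_handled = %llu\n", (unsigned long long)handled);
        return 0;
}
```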
```diff
@@ -1042,6 +1108,21 @@ read_again:
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
+	if (sectors_handled < (bio->bi_size >> 9)) {
+		/* We need another r10_bio. It has already been counted
+		 * in bio->bi_phys_segments.
+		 */
+		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+		r10_bio->master_bio = bio;
+		r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+
+		r10_bio->mddev = mddev;
+		r10_bio->sector = bio->bi_sector + sectors_handled;
+		r10_bio->state = 0;
+		goto retry_write;
+	}
+
 	if (do_sync || !mddev->bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
 	return 0;
```
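Taken together, the write path now loops: each pass builds one r10_bio over the widest window every participating device can accept, submits the trimmed clones, and jumps back to `retry_write` until `sectors_handled` covers the whole bio. A compressed sketch of that control flow (`write_request_sketch` is illustrative, not kernel code; the narrowing and cloning steps are reduced to comments):

```c
#include <stdint.h>

/* Compressed control flow of the patched write path; a sketch only. */
static void write_request_sketch(uint64_t start, int total_sectors)
{
        int sectors_handled = 0;

        while (sectors_handled < total_sectors) {
                int max_sectors = total_sectors - sectors_handled;

                /* retry_write: for each copy, consult the bad-block
                 * table and narrow max_sectors, skip the device, or
                 * block on an unacknowledged bad block, as in the
                 * loop above.
                 */

                /* Clone one bio per participating copy, trimmed to
                 * [start + sectors_handled,
                 *  start + sectors_handled + max_sectors), submit the
                 * clones, and count one more r10_bio in
                 * bio->bi_phys_segments.
                 */
                sectors_handled += max_sectors;
        }
        (void)start;    /* start only anchors the windows in this sketch */
}
```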