author    NeilBrown <neilb@suse.de>  2011-07-27 21:31:48 -0400
committer NeilBrown <neilb@suse.de>  2011-07-27 21:31:48 -0400
commit    1f68f0c4b677ccd6935ff61e4e6888787505f8dc (patch)
tree      aaff73efbc3fb7b6092eb2106e142e1684c4b554 /drivers
parent    0b7d83865cb7a60b1768212c1e60b8fd7c280506 (diff)
md/raid1: avoid writing to known-bad blocks on known-bad drives.
If we have seen any write error on a drive, then don't write to any
known-bad blocks on that drive.  If necessary, we divide the write
request up into pieces just like we do for reads, so each piece is
either all written or all not written to any given drive.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
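To make the clipping decision easier to follow, here is a minimal, self-contained userspace sketch of what the patch does with the result of is_badblock(). The helper clip_write(), its enum, and the sector_t typedef are invented for this illustration and are not the kernel API; the real logic operates on rdev and r1_bio state inside make_request().

#include <stdio.h>

typedef long long sector_t;

enum clip_result {
	WRITE_OK,	/* write up to *max_sectors on this device */
	SKIP_DEVICE,	/* start sector is bad: don't write this device */
	MUST_WAIT,	/* unacknowledged bad block: block and retry */
};

/*
 * Decide how much of [sector, sector + *max_sectors) may be written,
 * given a lookup result modelled after is_badblock():
 *   is_bad < 0  -> an unacknowledged bad block overlaps the range
 *   is_bad > 0  -> acknowledged bad range [first_bad, first_bad + bad_sectors)
 *   is_bad == 0 -> no overlap
 * *max_sectors is clipped so every device writes the same prefix.
 */
static enum clip_result clip_write(sector_t sector, int *max_sectors,
				   int is_bad, sector_t first_bad,
				   int bad_sectors)
{
	if (is_bad < 0)
		return MUST_WAIT;
	if (is_bad && first_bad <= sector) {
		/* Write starts inside a bad range: skip this device, but
		 * don't let the other devices write past the bad range,
		 * so each piece is all-or-nothing per device. */
		bad_sectors -= (sector - first_bad);
		if (bad_sectors < *max_sectors)
			*max_sectors = bad_sectors;
		return SKIP_DEVICE;
	}
	if (is_bad) {
		/* Bad range starts later: write only the good prefix. */
		int good_sectors = first_bad - sector;
		if (good_sectors < *max_sectors)
			*max_sectors = good_sectors;
	}
	return WRITE_OK;
}

int main(void)
{
	int max = 64;
	/* Acknowledged bad range [100, 108) meets a write at sector 96. */
	enum clip_result r = clip_write(96, &max, 1, 100, 8);
	printf("result=%d, max_sectors=%d\n", r, max); /* result=0, max=4 */
	return 0;
}

In the example in main(), a 64-sector write at sector 96 meets an acknowledged bad range starting at sector 100, so only the 4 good sectors are written in this pass; the remainder is picked up by another r1_bio, exactly as the split-and-retry path in the diff below does.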
Diffstat (limited to 'drivers')
-rw-r--r--  drivers/md/raid1.c  153
1 file changed, 115 insertions(+), 38 deletions(-)
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4d40d9d54a20..3214606204d2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -764,7 +764,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	mirror_info_t *mirror;
 	r1bio_t *r1_bio;
 	struct bio *read_bio;
-	int i, targets = 0, disks;
+	int i, disks;
 	struct bitmap *bitmap;
 	unsigned long flags;
 	const int rw = bio_data_dir(bio);
@@ -772,6 +772,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	mdk_rdev_t *blocked_rdev;
 	int plugged;
+	int first_clone;
+	int sectors_handled;
+	int max_sectors;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -832,7 +835,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		/*
 		 * read balancing logic:
 		 */
-		int max_sectors;
 		int rdisk;
 
 read_again:
@@ -872,7 +874,6 @@ read_again:
 			/* could not read all from this device, so we will
 			 * need another r1_bio.
 			 */
-			int sectors_handled;
 
 			sectors_handled = (r1_bio->sector + max_sectors
 					   - bio->bi_sector);
@@ -906,9 +907,15 @@ read_again:
 	/*
 	 * WRITE:
 	 */
-	/* first select target devices under spinlock and
+	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
+	 * If there are known/acknowledged bad blocks on any device on
+	 * which we have seen a write error, we want to avoid writing those
+	 * blocks.
+	 * This potentially requires several writes to write around
+	 * the bad blocks.  Each set of writes gets its own r1bio
+	 * with a set of bios attached.
 	 */
 	plugged = mddev_check_plugged(mddev);
 
@@ -916,6 +923,7 @@ read_again:
 retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
+	max_sectors = r1_bio->sectors;
 	for (i = 0; i < disks; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -923,17 +931,56 @@ read_again:
 			blocked_rdev = rdev;
 			break;
 		}
-		if (rdev && !test_bit(Faulty, &rdev->flags)) {
-			atomic_inc(&rdev->nr_pending);
-			if (test_bit(Faulty, &rdev->flags)) {
+		r1_bio->bios[i] = NULL;
+		if (!rdev || test_bit(Faulty, &rdev->flags)) {
+			set_bit(R1BIO_Degraded, &r1_bio->state);
+			continue;
+		}
+
+		atomic_inc(&rdev->nr_pending);
+		if (test_bit(WriteErrorSeen, &rdev->flags)) {
+			sector_t first_bad;
+			int bad_sectors;
+			int is_bad;
+
+			is_bad = is_badblock(rdev, r1_bio->sector,
+					     max_sectors,
+					     &first_bad, &bad_sectors);
+			if (is_bad < 0) {
+				/* mustn't write here until the bad block is
+				 * acknowledged */
+				set_bit(BlockedBadBlocks, &rdev->flags);
+				blocked_rdev = rdev;
+				break;
+			}
+			if (is_bad && first_bad <= r1_bio->sector) {
+				/* Cannot write here at all */
+				bad_sectors -= (r1_bio->sector - first_bad);
+				if (bad_sectors < max_sectors)
+					/* mustn't write more than bad_sectors
+					 * to other devices yet
+					 */
+					max_sectors = bad_sectors;
 				rdev_dec_pending(rdev, mddev);
-				r1_bio->bios[i] = NULL;
-			} else {
-				r1_bio->bios[i] = bio;
-				targets++;
+				/* We don't set R1BIO_Degraded as that
+				 * only applies if the disk is
+				 * missing, so it might be re-added,
+				 * and we want to know to recover this
+				 * chunk.
+				 * In this case the device is here,
+				 * and the fact that this chunk is not
+				 * in-sync is recorded in the bad
+				 * block log
+				 */
+				continue;
 			}
-		} else
-			r1_bio->bios[i] = NULL;
+			if (is_bad) {
+				int good_sectors = first_bad - r1_bio->sector;
+				if (good_sectors < max_sectors)
+					max_sectors = good_sectors;
+			}
+		}
+		r1_bio->bios[i] = bio;
 	}
 	rcu_read_unlock();
 
@@ -944,48 +991,56 @@ read_again:
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
-
+		r1_bio->state = 0;
 		allow_barrier(conf);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
 		wait_barrier(conf);
 		goto retry_write;
 	}
 
-	if (targets < conf->raid_disks) {
-		/* array is degraded, we will not clear the bitmap
-		 * on I/O completion (see raid1_end_write_request) */
-		set_bit(R1BIO_Degraded, &r1_bio->state);
+	if (max_sectors < r1_bio->sectors) {
+		/* We are splitting this write into multiple parts, so
+		 * we need to prepare for allocating another r1_bio.
+		 */
+		r1_bio->sectors = max_sectors;
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
 	}
-
-	/* do behind I/O ?
-	 * Not if there are too many, or cannot allocate memory,
-	 * or a reader on WriteMostly is waiting for behind writes
-	 * to flush */
-	if (bitmap &&
-	    (atomic_read(&bitmap->behind_writes)
-	     < mddev->bitmap_info.max_write_behind) &&
-	    !waitqueue_active(&bitmap->behind_wait))
-		alloc_behind_pages(bio, r1_bio);
+	sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
 
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
-				test_bit(R1BIO_BehindIO, &r1_bio->state));
+	first_clone = 1;
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
 		if (!r1_bio->bios[i])
 			continue;
 
 		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		r1_bio->bios[i] = mbio;
+		md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
 
-		mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
-		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-		mbio->bi_end_io = raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
-		mbio->bi_private = r1_bio;
-
+		if (first_clone) {
+			/* do behind I/O ?
+			 * Not if there are too many, or cannot
+			 * allocate memory, or a reader on WriteMostly
+			 * is waiting for behind writes to flush */
+			if (bitmap &&
+			    (atomic_read(&bitmap->behind_writes)
+			     < mddev->bitmap_info.max_write_behind) &&
+			    !waitqueue_active(&bitmap->behind_wait))
+				alloc_behind_pages(mbio, r1_bio);
+
+			bitmap_startwrite(bitmap, r1_bio->sector,
+					  r1_bio->sectors,
+					  test_bit(R1BIO_BehindIO,
						   &r1_bio->state));
+			first_clone = 0;
+		}
 		if (r1_bio->behind_pages) {
 			struct bio_vec *bvec;
 			int j;
@@ -1003,6 +1058,15 @@ read_again:
 			atomic_inc(&r1_bio->behind_remaining);
 		}
 
+		r1_bio->bios[i] = mbio;
+
+		mbio->bi_sector = (r1_bio->sector +
+				   conf->mirrors[i].rdev->data_offset);
+		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+		mbio->bi_end_io = raid1_end_write_request;
+		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
+		mbio->bi_private = r1_bio;
+
 		atomic_inc(&r1_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
@@ -1013,6 +1077,19 @@ read_again:
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
+	if (sectors_handled < (bio->bi_size >> 9)) {
+		/* We need another r1_bio.  It has already been counted
+		 * in bio->bi_phys_segments
+		 */
+		r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+		r1_bio->master_bio = bio;
+		r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+		r1_bio->state = 0;
+		r1_bio->mddev = mddev;
+		r1_bio->sector = bio->bi_sector + sectors_handled;
+		goto retry_write;
+	}
+
 	if (do_sync || !bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
 
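Seen end to end, the retry_write/sectors_handled flow above behaves like a loop that walks the master bio in clipped pieces. The following sketch models only that control flow; split_range() and clip_piece() are invented names, and the fixed 32-sector clip stands in for whatever limit the bad-block checks actually produce.

#include <stdio.h>

typedef long long sector_t;

/* Stand-in for the per-device clipping: in raid1.c, max_sectors starts
 * at r1_bio->sectors and is reduced by the is_badblock() results. */
static int clip_piece(int want)
{
	return want > 32 ? 32 : want;
}

static void split_range(sector_t bio_sector, int bio_sectors)
{
	int sectors_handled = 0;

	while (sectors_handled < bio_sectors) {
		int remaining = bio_sectors - sectors_handled;
		int piece = clip_piece(remaining);

		/* One r1_bio worth of work: clone the master bio, trim
		 * it to 'piece' sectors, and submit to every usable
		 * device. */
		printf("piece: sectors [%lld, %lld)\n",
		       bio_sector + sectors_handled,
		       bio_sector + sectors_handled + piece);
		sectors_handled += piece;
	}
}

int main(void)
{
	split_range(1000, 100);	/* splits into 32 + 32 + 32 + 4 sectors */
	return 0;
}

Each iteration corresponds to one r1_bio in the patch; the kernel counts the pieces in bio->bi_phys_segments so that completion is reported on the master bio only once every piece has finished.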