| author    | NeilBrown <neilb@suse.de> | 2011-07-27 21:31:48 -0400 |
|-----------|---------------------------|---------------------------|
| committer | NeilBrown <neilb@suse.de> | 2011-07-27 21:31:48 -0400 |
| commit    | 1f68f0c4b677ccd6935ff61e4e6888787505f8dc (patch) | |
| tree      | aaff73efbc3fb7b6092eb2106e142e1684c4b554 /drivers/md/raid1.c | |
| parent    | 0b7d83865cb7a60b1768212c1e60b8fd7c280506 (diff) | |
md/raid1: avoid writing to known-bad blocks on known-bad drives.
If we have seen any write error on a drive, then don't write to
any known-bad blocks on that drive.
If necessary, we divide the write request up into pieces just
like we do for reads, so each piece is either all written or
all not written to any given drive.
Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
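To picture the splitting described above, here is a small self-contained sketch (plain C, editorial illustration only; `good_or_bad_extent()` is a made-up stand-in for the per-device `is_badblock()` query, not a kernel interface). Each loop pass writes only up to the next bad-block boundary and then starts a new piece, which is the behaviour the patch implements with one r1_bio per piece:

```c
#include <stdio.h>

/* Editorial sketch only, not kernel code: split one logical write into
 * pieces so that each piece either avoids the known-bad range entirely or
 * stops exactly at its boundary.  good_or_bad_extent() is a hypothetical
 * stand-in for the per-device is_badblock() query used by the patch. */
static unsigned long good_or_bad_extent(unsigned long sector, unsigned long nsectors)
{
    /* Demo data: one acknowledged bad range covering sectors 100-119. */
    const unsigned long bad_start = 100, bad_len = 20;

    if (sector + nsectors <= bad_start || sector >= bad_start + bad_len)
        return nsectors;                /* no overlap: write it all */
    if (sector < bad_start)
        return bad_start - sector;      /* stop where the bad range begins */
    /* Piece starts inside the bad range: cap it at the end of that range
     * (the real code also skips this device for such a piece). */
    return (bad_start + bad_len - sector) < nsectors
            ? bad_start + bad_len - sector : nsectors;
}

int main(void)
{
    unsigned long sector = 90, remaining = 40;

    while (remaining) {
        unsigned long piece = good_or_bad_extent(sector, remaining);

        printf("write piece: sectors %lu-%lu\n", sector, sector + piece - 1);
        sector += piece;
        remaining -= piece;
    }
    return 0;
}
```

Run against a write of sectors 90-129 with the assumed bad range at 100-119, the sketch emits three pieces (90-99, 100-119, 120-129), mirroring how each r1_bio below covers exactly one such piece.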
Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r-- | drivers/md/raid1.c | 153
1 file changed, 115 insertions, 38 deletions
```diff
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4d40d9d54a20..3214606204d2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -764,7 +764,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	mirror_info_t *mirror;
 	r1bio_t *r1_bio;
 	struct bio *read_bio;
-	int i, targets = 0, disks;
+	int i, disks;
 	struct bitmap *bitmap;
 	unsigned long flags;
 	const int rw = bio_data_dir(bio);
@@ -772,6 +772,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	mdk_rdev_t *blocked_rdev;
 	int plugged;
+	int first_clone;
+	int sectors_handled;
+	int max_sectors;
 
 	/*
 	 * Register the new request and wait if the reconstruction
@@ -832,7 +835,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		/*
 		 * read balancing logic:
 		 */
-		int max_sectors;
 		int rdisk;
 
 read_again:
@@ -872,7 +874,6 @@ read_again:
 			/* could not read all from this device, so we will
 			 * need another r1_bio.
 			 */
-			int sectors_handled;
 
 			sectors_handled = (r1_bio->sector + max_sectors
 					   - bio->bi_sector);
@@ -906,9 +907,15 @@ read_again:
 	/*
 	 * WRITE:
 	 */
-	/* first select target devices under spinlock and
+	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev. Record them by setting
 	 * bios[x] to bio
+	 * If there are known/acknowledged bad blocks on any device on
+	 * which we have seen a write error, we want to avoid writing those
+	 * blocks.
+	 * This potentially requires several writes to write around
+	 * the bad blocks. Each set of writes gets it's own r1bio
+	 * with a set of bios attached.
 	 */
 	plugged = mddev_check_plugged(mddev);
 
@@ -916,6 +923,7 @@ read_again:
  retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
+	max_sectors = r1_bio->sectors;
 	for (i = 0; i < disks; i++) {
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -923,17 +931,56 @@ read_again:
 			blocked_rdev = rdev;
 			break;
 		}
-		if (rdev && !test_bit(Faulty, &rdev->flags)) {
-			atomic_inc(&rdev->nr_pending);
-			if (test_bit(Faulty, &rdev->flags)) {
+		r1_bio->bios[i] = NULL;
+		if (!rdev || test_bit(Faulty, &rdev->flags)) {
+			set_bit(R1BIO_Degraded, &r1_bio->state);
+			continue;
+		}
+
+		atomic_inc(&rdev->nr_pending);
+		if (test_bit(WriteErrorSeen, &rdev->flags)) {
+			sector_t first_bad;
+			int bad_sectors;
+			int is_bad;
+
+			is_bad = is_badblock(rdev, r1_bio->sector,
+					     max_sectors,
+					     &first_bad, &bad_sectors);
+			if (is_bad < 0) {
+				/* mustn't write here until the bad block is
+				 * acknowledged*/
+				set_bit(BlockedBadBlocks, &rdev->flags);
+				blocked_rdev = rdev;
+				break;
+			}
+			if (is_bad && first_bad <= r1_bio->sector) {
+				/* Cannot write here at all */
+				bad_sectors -= (r1_bio->sector - first_bad);
+				if (bad_sectors < max_sectors)
+					/* mustn't write more than bad_sectors
+					 * to other devices yet
+					 */
+					max_sectors = bad_sectors;
 				rdev_dec_pending(rdev, mddev);
-				r1_bio->bios[i] = NULL;
-			} else {
-				r1_bio->bios[i] = bio;
-				targets++;
+				/* We don't set R1BIO_Degraded as that
+				 * only applies if the disk is
+				 * missing, so it might be re-added,
+				 * and we want to know to recover this
+				 * chunk.
+				 * In this case the device is here,
+				 * and the fact that this chunk is not
+				 * in-sync is recorded in the bad
+				 * block log
+				 */
+				continue;
 			}
-		} else
-			r1_bio->bios[i] = NULL;
+			if (is_bad) {
+				int good_sectors = first_bad - r1_bio->sector;
+				if (good_sectors < max_sectors)
+					max_sectors = good_sectors;
+			}
+		}
+		r1_bio->bios[i] = bio;
 	}
 	rcu_read_unlock();
 
@@ -944,48 +991,56 @@ read_again:
 		for (j = 0; j < i; j++)
 			if (r1_bio->bios[j])
 				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
-
+		r1_bio->state = 0;
 		allow_barrier(conf);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
 		wait_barrier(conf);
 		goto retry_write;
 	}
 
-	if (targets < conf->raid_disks) {
-		/* array is degraded, we will not clear the bitmap
-		 * on I/O completion (see raid1_end_write_request) */
-		set_bit(R1BIO_Degraded, &r1_bio->state);
+	if (max_sectors < r1_bio->sectors) {
+		/* We are splitting this write into multiple parts, so
+		 * we need to prepare for allocating another r1_bio.
+		 */
+		r1_bio->sectors = max_sectors;
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
 	}
-
-	/* do behind I/O ?
-	 * Not if there are too many, or cannot allocate memory,
-	 * or a reader on WriteMostly is waiting for behind writes
-	 * to flush */
-	if (bitmap &&
-	    (atomic_read(&bitmap->behind_writes)
-	     < mddev->bitmap_info.max_write_behind) &&
-	    !waitqueue_active(&bitmap->behind_wait))
-		alloc_behind_pages(bio, r1_bio);
+	sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
 
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
-			  test_bit(R1BIO_BehindIO, &r1_bio->state));
+	first_clone = 1;
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
 		if (!r1_bio->bios[i])
 			continue;
 
 		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-		r1_bio->bios[i] = mbio;
+		md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
 
-		mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
-		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-		mbio->bi_end_io = raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
-		mbio->bi_private = r1_bio;
-
+		if (first_clone) {
+			/* do behind I/O ?
+			 * Not if there are too many, or cannot
+			 * allocate memory, or a reader on WriteMostly
+			 * is waiting for behind writes to flush */
+			if (bitmap &&
+			    (atomic_read(&bitmap->behind_writes)
+			     < mddev->bitmap_info.max_write_behind) &&
+			    !waitqueue_active(&bitmap->behind_wait))
+				alloc_behind_pages(mbio, r1_bio);
+
+			bitmap_startwrite(bitmap, r1_bio->sector,
+					  r1_bio->sectors,
+					  test_bit(R1BIO_BehindIO,
						   &r1_bio->state));
+			first_clone = 0;
+		}
 		if (r1_bio->behind_pages) {
 			struct bio_vec *bvec;
 			int j;
@@ -1003,6 +1058,15 @@ read_again:
 				atomic_inc(&r1_bio->behind_remaining);
 		}
 
+		r1_bio->bios[i] = mbio;
+
+		mbio->bi_sector = (r1_bio->sector +
+				   conf->mirrors[i].rdev->data_offset);
+		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+		mbio->bi_end_io = raid1_end_write_request;
+		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
+		mbio->bi_private = r1_bio;
+
 		atomic_inc(&r1_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
@@ -1013,6 +1077,19 @@ read_again:
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
+	if (sectors_handled < (bio->bi_size >> 9)) {
+		/* We need another r1_bio. It has already been counted
+		 * in bio->bi_phys_segments
+		 */
+		r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+		r1_bio->master_bio = bio;
+		r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+		r1_bio->state = 0;
+		r1_bio->mddev = mddev;
+		r1_bio->sector = bio->bi_sector + sectors_handled;
+		goto retry_write;
+	}
+
 	if (do_sync || !bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
 
```
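For reference, the per-device decision made inside the new `retry_write` loop can be summarised as a standalone sketch (again an editorial illustration with made-up names; `badblock_check()` and the enum are not kernel interfaces, they only mimic the return convention the patch relies on from `is_badblock()`: negative for an unacknowledged bad block, positive for an acknowledged one, zero for no overlap):

```c
#include <stdio.h>

enum write_decision { WRITE_ALL, WRITE_CLIPPED, SKIP_DEVICE, WAIT_FOR_ACK };

/* Made-up stand-in for the kernel's bad-block query.  Like is_badblock()
 * it returns <0 for an unacknowledged bad block, >0 for an acknowledged
 * one, 0 for no overlap, and reports the overlapping range. */
static int badblock_check(unsigned long sector, unsigned long nsectors,
                          unsigned long *first_bad, unsigned long *bad_sectors)
{
    /* Demo data: acknowledged bad range covering sectors 100-119. */
    const unsigned long start = 100, len = 20;

    if (sector + nsectors <= start || sector >= start + len)
        return 0;
    *first_bad = start;
    *bad_sectors = len;
    return 1;
}

/* Mirror of the three-way choice the patch makes per device. */
static enum write_decision decide(unsigned long sector, unsigned long *max_sectors)
{
    unsigned long first_bad = 0, bad_sectors = 0;
    int is_bad = badblock_check(sector, *max_sectors, &first_bad, &bad_sectors);

    if (is_bad < 0)
        return WAIT_FOR_ACK;            /* mustn't write until acknowledged */
    if (is_bad && first_bad <= sector) {
        /* Bad block covers the start: skip this device, but cap the piece
         * so no device writes past the end of the bad range. */
        unsigned long rest = bad_sectors - (sector - first_bad);
        if (rest < *max_sectors)
            *max_sectors = rest;
        return SKIP_DEVICE;
    }
    if (is_bad) {
        /* Bad block starts later: only write the good prefix this round. */
        unsigned long good = first_bad - sector;
        if (good < *max_sectors)
            *max_sectors = good;
        return WRITE_CLIPPED;
    }
    return WRITE_ALL;
}

int main(void)
{
    unsigned long max_sectors = 40;
    enum write_decision d = decide(90, &max_sectors);

    printf("decision=%d max_sectors=%lu\n", (int)d, max_sectors); /* 1 and 10 */
    return 0;
}
```

The clipped `max_sectors` is what later bounds the trimmed clones (`md_trim_bio()`) and feeds the `sectors_handled` bookkeeping that decides whether another r1_bio is allocated and the loop retried.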